OLD | NEW |
| (Empty) |
1 library tokenizer; | |
2 | |
3 import 'dart:collection'; | |
4 import 'package:html5lib/parser.dart' show HtmlParser; | |
5 import 'constants.dart'; | |
6 import 'inputstream.dart'; | |
7 import 'token.dart'; | |
8 import 'utils.dart'; | |
9 | |
// Group entities by their first character, for faster lookups.

// TODO(jmesserly): we could use a better data structure here like a trie, if
// we had it implemented in Dart.
Map<String, List<String>> entitiesByFirstChar = (() {
  // Use typed literals so the runtime type of the returned map (and of the
  // bucket lists) matches the declared type; an untyped `{}` would produce a
  // Map<dynamic, dynamic>.
  var result = <String, List<String>>{};
  for (var k in entities.keys) {
    result.putIfAbsent(k[0], () => <String>[]).add(k);
  }
  return result;
})();
21 | |
22 // TODO(jmesserly): lots of ways to make this faster: | |
23 // - use char codes everywhere instead of 1-char strings | |
24 // - use switch instead of contains, indexOf | |
25 // - use switch instead of the sequential if tests | |
26 // - avoid string concat | |
27 | |
28 /// This class takes care of tokenizing HTML. | |
29 class HtmlTokenizer implements Iterator<Token> { | |
30 // TODO(jmesserly): a lot of these could be made private | |
31 | |
32 final HtmlInputStream stream; | |
33 | |
34 final bool lowercaseElementName; | |
35 | |
36 final bool lowercaseAttrName; | |
37 | |
38 /// True to generate spans in for [Token.span]. | |
39 final bool generateSpans; | |
40 | |
41 /// True to generate spans for attributes. | |
42 final bool attributeSpans; | |
43 | |
44 /// This reference to the parser is used for correct CDATA handling. | |
45 /// The [HtmlParser] will set this at construction time. | |
46 HtmlParser parser; | |
47 | |
48 final Queue<Token> tokenQueue; | |
49 | |
50 /// Holds the token that is currently being processed. | |
51 Token currentToken; | |
52 | |
53 /// Holds a reference to the method to be invoked for the next parser state. | |
54 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode | |
55 // bug prevents us from doing that. See http://dartbug.com/12465 | |
56 Function state; | |
57 | |
58 String temporaryBuffer; | |
59 | |
60 int _lastOffset; | |
61 | |
62 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add | |
63 // an item until it's ready. But the code doesn't have a clear notion of when | |
64 // it's "done" with the attribute. | |
65 List<TagAttribute> _attributes; | |
66 Set<String> _attributeNames; | |
67 | |
68 HtmlTokenizer(doc, {String encoding, bool parseMeta: true, | |
69 this.lowercaseElementName: true, this.lowercaseAttrName: true, | |
70 bool generateSpans: false, String sourceUrl, this.attributeSpans: false}) | |
71 : stream = new HtmlInputStream( | |
72 doc, encoding, parseMeta, generateSpans, sourceUrl), | |
73 tokenQueue = new Queue(), | |
74 generateSpans = generateSpans { | |
75 reset(); | |
76 } | |
77 | |
78 TagToken get currentTagToken => currentToken; | |
79 DoctypeToken get currentDoctypeToken => currentToken; | |
80 StringToken get currentStringToken => currentToken; | |
81 | |
82 Token _current; | |
83 Token get current => _current; | |
84 | |
85 String get _attributeName => _attributes.last.name; | |
86 set _attributeName(String value) { | |
87 _attributes.last.name = value; | |
88 } | |
89 | |
90 String get _attributeValue => _attributes.last.value; | |
91 set _attributeValue(String value) { | |
92 _attributes.last.value = value; | |
93 } | |
94 | |
95 void _markAttributeEnd(int offset) { | |
96 if (attributeSpans) _attributes.last.end = stream.position + offset; | |
97 } | |
98 | |
99 void _markAttributeValueStart(int offset) { | |
100 if (attributeSpans) _attributes.last.startValue = stream.position + offset; | |
101 } | |
102 | |
103 void _markAttributeValueEnd(int offset) { | |
104 if (attributeSpans) { | |
105 _attributes.last.endValue = stream.position + offset; | |
106 _markAttributeEnd(offset); | |
107 } | |
108 } | |
109 | |
110 // Note: we could track the name span here, if we need it. | |
111 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset); | |
112 | |
113 void _addAttribute(String name) { | |
114 if (_attributes == null) _attributes = []; | |
115 var attr = new TagAttribute(name); | |
116 _attributes.add(attr); | |
117 if (attributeSpans) attr.start = stream.position - name.length; | |
118 } | |
119 | |
120 /// This is where the magic happens. | |
121 /// | |
122 /// We do our usually processing through the states and when we have a token | |
123 /// to return we yield the token which pauses processing until the next token | |
124 /// is requested. | |
125 bool moveNext() { | |
126 // Start processing. When EOF is reached state will return false; | |
127 // instead of true and the loop will terminate. | |
128 while (stream.errors.length == 0 && tokenQueue.length == 0) { | |
129 if (!state()) { | |
130 _current = null; | |
131 return false; | |
132 } | |
133 } | |
134 if (stream.errors.length > 0) { | |
135 _current = new ParseErrorToken(stream.errors.removeFirst()); | |
136 } else { | |
137 assert (tokenQueue.length > 0); | |
138 _current = tokenQueue.removeFirst(); | |
139 } | |
140 return true; | |
141 } | |
142 | |
143 /// Resets the tokenizer state. Calling this does not reset the [stream] or | |
144 /// the [parser]. | |
145 void reset() { | |
146 _lastOffset = 0; | |
147 tokenQueue.clear(); | |
148 currentToken = null; | |
149 temporaryBuffer = null; | |
150 _attributes = null; | |
151 _attributeNames = null; | |
152 state = dataState; | |
153 } | |
154 | |
155 /// Adds a token to the queue. Sets the span if needed. | |
156 void _addToken(Token token) { | |
157 if (generateSpans && token.span == null) { | |
158 int offset = stream.position; | |
159 token.span = stream.fileInfo.span(_lastOffset, offset); | |
160 if (token is! ParseErrorToken) { | |
161 _lastOffset = offset; | |
162 } | |
163 } | |
164 tokenQueue.add(token); | |
165 } | |
166 | |
167 /// This function returns either U+FFFD or the character based on the | |
168 /// decimal or hexadecimal representation. It also discards ";" if present. | |
169 /// If not present it will add a [ParseErrorToken]. | |
170 String consumeNumberEntity(bool isHex) { | |
171 var allowed = isDigit; | |
172 var radix = 10; | |
173 if (isHex) { | |
174 allowed = isHexDigit; | |
175 radix = 16; | |
176 } | |
177 | |
178 var charStack = []; | |
179 | |
180 // Consume all the characters that are in range while making sure we | |
181 // don't hit an EOF. | |
182 var c = stream.char(); | |
183 while (allowed(c) && c != EOF) { | |
184 charStack.add(c); | |
185 c = stream.char(); | |
186 } | |
187 | |
188 // Convert the set of characters consumed to an int. | |
189 var charAsInt = parseIntRadix(charStack.join(), radix); | |
190 | |
191 // Certain characters get replaced with others | |
192 var char = replacementCharacters[charAsInt]; | |
193 if (char != null) { | |
194 _addToken(new ParseErrorToken( | |
195 "illegal-codepoint-for-numeric-entity", | |
196 messageParams: {"charAsInt": charAsInt})); | |
197 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF) | |
198 || (charAsInt > 0x10FFFF)) { | |
199 char = "\uFFFD"; | |
200 _addToken(new ParseErrorToken( | |
201 "illegal-codepoint-for-numeric-entity", | |
202 messageParams: {"charAsInt": charAsInt})); | |
203 } else { | |
204 // Should speed up this check somehow (e.g. move the set to a constant) | |
205 if ((0x0001 <= charAsInt && charAsInt <= 0x0008) || | |
206 (0x000E <= charAsInt && charAsInt <= 0x001F) || | |
207 (0x007F <= charAsInt && charAsInt <= 0x009F) || | |
208 (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) || | |
209 const [0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, | |
210 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, | |
211 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, | |
212 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, | |
213 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, | |
214 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, | |
215 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, | |
216 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, | |
217 0xFFFFF, 0x10FFFE, 0x10FFFF].contains(charAsInt)) { | |
218 _addToken(new ParseErrorToken( | |
219 "illegal-codepoint-for-numeric-entity", | |
220 messageParams: {"charAsInt": charAsInt})); | |
221 } | |
222 char = new String.fromCharCodes([charAsInt]); | |
223 } | |
224 | |
225 // Discard the ; if present. Otherwise, put it back on the queue and | |
226 // invoke parseError on parser. | |
227 if (c != ";") { | |
228 _addToken(new ParseErrorToken( | |
229 "numeric-entity-without-semicolon")); | |
230 stream.unget(c); | |
231 } | |
232 return char; | |
233 } | |
234 | |
235 void consumeEntity({String allowedChar, bool fromAttribute: false}) { | |
236 // Initialise to the default output for when no entity is matched | |
237 var output = "&"; | |
238 | |
239 var charStack = [stream.char()]; | |
240 if (isWhitespace(charStack[0]) || charStack[0] == '<' || charStack[0] == '&' | |
241 || charStack[0] == EOF || allowedChar == charStack[0]) { | |
242 stream.unget(charStack[0]); | |
243 } else if (charStack[0] == "#") { | |
244 // Read the next character to see if it's hex or decimal | |
245 bool hex = false; | |
246 charStack.add(stream.char()); | |
247 if (charStack.last == 'x' || charStack.last == 'X') { | |
248 hex = true; | |
249 charStack.add(stream.char()); | |
250 } | |
251 | |
252 // charStack.last should be the first digit | |
253 if (hex && isHexDigit(charStack.last) || | |
254 (!hex && isDigit(charStack.last))) { | |
255 // At least one digit found, so consume the whole number | |
256 stream.unget(charStack.last); | |
257 output = consumeNumberEntity(hex); | |
258 } else { | |
259 // No digits found | |
260 _addToken(new ParseErrorToken("expected-numeric-entity")); | |
261 stream.unget(charStack.removeLast()); | |
262 output = "&${charStack.join()}"; | |
263 } | |
264 } else { | |
265 // At this point in the process might have named entity. Entities | |
266 // are stored in the global variable "entities". | |
267 // | |
268 // Consume characters and compare to these to a substring of the | |
269 // entity names in the list until the substring no longer matches. | |
270 var filteredEntityList = entitiesByFirstChar[charStack[0]]; | |
271 if (filteredEntityList == null) filteredEntityList = const []; | |
272 | |
273 while (charStack.last != EOF) { | |
274 var name = charStack.join(); | |
275 filteredEntityList = filteredEntityList.where( | |
276 (e) => e.startsWith(name)).toList(); | |
277 | |
278 if (filteredEntityList.length == 0) { | |
279 break; | |
280 } | |
281 charStack.add(stream.char()); | |
282 } | |
283 | |
284 // At this point we have a string that starts with some characters | |
285 // that may match an entity | |
286 String entityName = null; | |
287 | |
288 // Try to find the longest entity the string will match to take care | |
289 // of ¬i for instance. | |
290 | |
291 int entityLen; | |
292 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) { | |
293 var possibleEntityName = charStack.sublist(0, entityLen).join(); | |
294 if (entities.containsKey(possibleEntityName)) { | |
295 entityName = possibleEntityName; | |
296 break; | |
297 } | |
298 } | |
299 | |
300 if (entityName != null) { | |
301 var lastChar = entityName[entityName.length - 1]; | |
302 if (lastChar != ";") { | |
303 _addToken(new ParseErrorToken( | |
304 "named-entity-without-semicolon")); | |
305 } | |
306 if (lastChar != ";" && fromAttribute && | |
307 (isLetterOrDigit(charStack[entityLen]) || | |
308 charStack[entityLen] == '=')) { | |
309 stream.unget(charStack.removeLast()); | |
310 output = "&${charStack.join()}"; | |
311 } else { | |
312 output = entities[entityName]; | |
313 stream.unget(charStack.removeLast()); | |
314 output = '${output}${slice(charStack, entityLen).join()}'; | |
315 } | |
316 } else { | |
317 _addToken(new ParseErrorToken("expected-named-entity")); | |
318 stream.unget(charStack.removeLast()); | |
319 output = "&${charStack.join()}"; | |
320 } | |
321 } | |
322 if (fromAttribute) { | |
323 _attributeValue = '$_attributeValue$output'; | |
324 } else { | |
325 var token; | |
326 if (isWhitespace(output)) { | |
327 token = new SpaceCharactersToken(output); | |
328 } else { | |
329 token = new CharactersToken(output); | |
330 } | |
331 _addToken(token); | |
332 } | |
333 } | |
334 | |
335 /// This method replaces the need for "entityInAttributeValueState". | |
336 void processEntityInAttribute(String allowedChar) { | |
337 consumeEntity(allowedChar: allowedChar, fromAttribute: true); | |
338 } | |
339 | |
340 /// This method is a generic handler for emitting the tags. It also sets | |
341 /// the state to "data" because that's what's needed after a token has been | |
342 /// emitted. | |
343 void emitCurrentToken() { | |
344 var token = currentToken; | |
345 // Add token to the queue to be yielded | |
346 if (token is TagToken) { | |
347 if (lowercaseElementName) { | |
348 token.name = asciiUpper2Lower(token.name); | |
349 } | |
350 if (token is EndTagToken) { | |
351 if (_attributes != null) { | |
352 _addToken(new ParseErrorToken("attributes-in-end-tag")); | |
353 } | |
354 if (token.selfClosing) { | |
355 _addToken(new ParseErrorToken("this-closing-flag-on-end-tag")); | |
356 } | |
357 } else if (token is StartTagToken) { | |
358 // HTML5 specific normalizations to the token stream. | |
359 // Convert the list into a map where first key wins. | |
360 token.data = new LinkedHashMap<Object, String>(); | |
361 if (_attributes != null) { | |
362 for (var attr in _attributes) { | |
363 token.data.putIfAbsent(attr.name, () => attr.value); | |
364 } | |
365 if (attributeSpans) token.attributeSpans = _attributes; | |
366 } | |
367 } | |
368 _attributes = null; | |
369 _attributeNames = null; | |
370 } | |
371 _addToken(token); | |
372 state = dataState; | |
373 } | |
374 | |
375 // Below are the various tokenizer states worked out. | |
376 | |
377 bool dataState() { | |
378 var data = stream.char(); | |
379 if (data == "&") { | |
380 state = entityDataState; | |
381 } else if (data == "<") { | |
382 state = tagOpenState; | |
383 } else if (data == "\u0000") { | |
384 _addToken(new ParseErrorToken("invalid-codepoint")); | |
385 _addToken(new CharactersToken("\u0000")); | |
386 } else if (data == EOF) { | |
387 // Tokenization ends. | |
388 return false; | |
389 } else if (isWhitespace(data)) { | |
390 // Directly after emitting a token you switch back to the "data | |
391 // state". At that point spaceCharacters are important so they are | |
392 // emitted separately. | |
393 _addToken(new SpaceCharactersToken( | |
394 '${data}${stream.charsUntil(spaceCharacters, true)}')); | |
395 // No need to update lastFourChars here, since the first space will | |
396 // have already been appended to lastFourChars and will have broken | |
397 // any <!-- or --> sequences | |
398 } else { | |
399 var chars = stream.charsUntil("&<\u0000"); | |
400 _addToken(new CharactersToken('${data}${chars}')); | |
401 } | |
402 return true; | |
403 } | |
404 | |
405 bool entityDataState() { | |
406 consumeEntity(); | |
407 state = dataState; | |
408 return true; | |
409 } | |
410 | |
411 bool rcdataState() { | |
412 var data = stream.char(); | |
413 if (data == "&") { | |
414 state = characterReferenceInRcdata; | |
415 } else if (data == "<") { | |
416 state = rcdataLessThanSignState; | |
417 } else if (data == EOF) { | |
418 // Tokenization ends. | |
419 return false; | |
420 } else if (data == "\u0000") { | |
421 _addToken(new ParseErrorToken("invalid-codepoint")); | |
422 _addToken(new CharactersToken("\uFFFD")); | |
423 } else if (isWhitespace(data)) { | |
424 // Directly after emitting a token you switch back to the "data | |
425 // state". At that point spaceCharacters are important so they are | |
426 // emitted separately. | |
427 _addToken(new SpaceCharactersToken( | |
428 '${data}${stream.charsUntil(spaceCharacters, true)}')); | |
429 } else { | |
430 var chars = stream.charsUntil("&<"); | |
431 _addToken(new CharactersToken('${data}${chars}')); | |
432 } | |
433 return true; | |
434 } | |
435 | |
436 bool characterReferenceInRcdata() { | |
437 consumeEntity(); | |
438 state = rcdataState; | |
439 return true; | |
440 } | |
441 | |
442 bool rawtextState() { | |
443 var data = stream.char(); | |
444 if (data == "<") { | |
445 state = rawtextLessThanSignState; | |
446 } else if (data == "\u0000") { | |
447 _addToken(new ParseErrorToken("invalid-codepoint")); | |
448 _addToken(new CharactersToken("\uFFFD")); | |
449 } else if (data == EOF) { | |
450 // Tokenization ends. | |
451 return false; | |
452 } else { | |
453 var chars = stream.charsUntil("<\u0000"); | |
454 _addToken(new CharactersToken("${data}${chars}")); | |
455 } | |
456 return true; | |
457 } | |
458 | |
459 bool scriptDataState() { | |
460 var data = stream.char(); | |
461 if (data == "<") { | |
462 state = scriptDataLessThanSignState; | |
463 } else if (data == "\u0000") { | |
464 _addToken(new ParseErrorToken("invalid-codepoint")); | |
465 _addToken(new CharactersToken("\uFFFD")); | |
466 } else if (data == EOF) { | |
467 // Tokenization ends. | |
468 return false; | |
469 } else { | |
470 var chars = stream.charsUntil("<\u0000"); | |
471 _addToken(new CharactersToken("${data}${chars}")); | |
472 } | |
473 return true; | |
474 } | |
475 | |
476 bool plaintextState() { | |
477 var data = stream.char(); | |
478 if (data == EOF) { | |
479 // Tokenization ends. | |
480 return false; | |
481 } else if (data == "\u0000") { | |
482 _addToken(new ParseErrorToken("invalid-codepoint")); | |
483 _addToken(new CharactersToken("\uFFFD")); | |
484 } else { | |
485 _addToken(new CharactersToken( | |
486 '${data}${stream.charsUntil("\u0000")}')); | |
487 } | |
488 return true; | |
489 } | |
490 | |
491 bool tagOpenState() { | |
492 var data = stream.char(); | |
493 if (data == "!") { | |
494 state = markupDeclarationOpenState; | |
495 } else if (data == "/") { | |
496 state = closeTagOpenState; | |
497 } else if (isLetter(data)) { | |
498 currentToken = new StartTagToken(data); | |
499 state = tagNameState; | |
500 } else if (data == ">") { | |
501 // XXX In theory it could be something besides a tag name. But | |
502 // do we really care? | |
503 _addToken(new ParseErrorToken( | |
504 "expected-tag-name-but-got-right-bracket")); | |
505 _addToken(new CharactersToken("<>")); | |
506 state = dataState; | |
507 } else if (data == "?") { | |
508 // XXX In theory it could be something besides a tag name. But | |
509 // do we really care? | |
510 _addToken(new ParseErrorToken( | |
511 "expected-tag-name-but-got-question-mark")); | |
512 stream.unget(data); | |
513 state = bogusCommentState; | |
514 } else { | |
515 // XXX | |
516 _addToken(new ParseErrorToken("expected-tag-name")); | |
517 _addToken(new CharactersToken("<")); | |
518 stream.unget(data); | |
519 state = dataState; | |
520 } | |
521 return true; | |
522 } | |
523 | |
524 bool closeTagOpenState() { | |
525 var data = stream.char(); | |
526 if (isLetter(data)) { | |
527 currentToken = new EndTagToken(data); | |
528 state = tagNameState; | |
529 } else if (data == ">") { | |
530 _addToken(new ParseErrorToken( | |
531 "expected-closing-tag-but-got-right-bracket")); | |
532 state = dataState; | |
533 } else if (data == EOF) { | |
534 _addToken(new ParseErrorToken( | |
535 "expected-closing-tag-but-got-eof")); | |
536 _addToken(new CharactersToken("</")); | |
537 state = dataState; | |
538 } else { | |
539 // XXX data can be _'_... | |
540 _addToken(new ParseErrorToken( | |
541 "expected-closing-tag-but-got-char", messageParams: {"data": data})); | |
542 stream.unget(data); | |
543 state = bogusCommentState; | |
544 } | |
545 return true; | |
546 } | |
547 | |
548 bool tagNameState() { | |
549 var data = stream.char(); | |
550 if (isWhitespace(data)) { | |
551 state = beforeAttributeNameState; | |
552 } else if (data == ">") { | |
553 emitCurrentToken(); | |
554 } else if (data == EOF) { | |
555 _addToken(new ParseErrorToken("eof-in-tag-name")); | |
556 state = dataState; | |
557 } else if (data == "/") { | |
558 state = selfClosingStartTagState; | |
559 } else if (data == "\u0000") { | |
560 _addToken(new ParseErrorToken("invalid-codepoint")); | |
561 currentTagToken.name = '${currentTagToken.name}\uFFFD'; | |
562 } else { | |
563 currentTagToken.name = '${currentTagToken.name}$data'; | |
564 // (Don't use charsUntil here, because tag names are | |
565 // very short and it's faster to not do anything fancy) | |
566 } | |
567 return true; | |
568 } | |
569 | |
570 bool rcdataLessThanSignState() { | |
571 var data = stream.char(); | |
572 if (data == "/") { | |
573 temporaryBuffer = ""; | |
574 state = rcdataEndTagOpenState; | |
575 } else { | |
576 _addToken(new CharactersToken("<")); | |
577 stream.unget(data); | |
578 state = rcdataState; | |
579 } | |
580 return true; | |
581 } | |
582 | |
583 bool rcdataEndTagOpenState() { | |
584 var data = stream.char(); | |
585 if (isLetter(data)) { | |
586 temporaryBuffer = '${temporaryBuffer}$data'; | |
587 state = rcdataEndTagNameState; | |
588 } else { | |
589 _addToken(new CharactersToken("</")); | |
590 stream.unget(data); | |
591 state = rcdataState; | |
592 } | |
593 return true; | |
594 } | |
595 | |
596 bool _tokenIsAppropriate() { | |
597 return currentToken is TagToken && | |
598 currentTagToken.name.toLowerCase() == temporaryBuffer.toLowerCase(); | |
599 } | |
600 | |
601 bool rcdataEndTagNameState() { | |
602 var appropriate = _tokenIsAppropriate(); | |
603 var data = stream.char(); | |
604 if (isWhitespace(data) && appropriate) { | |
605 currentToken = new EndTagToken(temporaryBuffer); | |
606 state = beforeAttributeNameState; | |
607 } else if (data == "/" && appropriate) { | |
608 currentToken = new EndTagToken(temporaryBuffer); | |
609 state = selfClosingStartTagState; | |
610 } else if (data == ">" && appropriate) { | |
611 currentToken = new EndTagToken(temporaryBuffer); | |
612 emitCurrentToken(); | |
613 state = dataState; | |
614 } else if (isLetter(data)) { | |
615 temporaryBuffer = '${temporaryBuffer}$data'; | |
616 } else { | |
617 _addToken(new CharactersToken("</$temporaryBuffer")); | |
618 stream.unget(data); | |
619 state = rcdataState; | |
620 } | |
621 return true; | |
622 } | |
623 | |
624 bool rawtextLessThanSignState() { | |
625 var data = stream.char(); | |
626 if (data == "/") { | |
627 temporaryBuffer = ""; | |
628 state = rawtextEndTagOpenState; | |
629 } else { | |
630 _addToken(new CharactersToken("<")); | |
631 stream.unget(data); | |
632 state = rawtextState; | |
633 } | |
634 return true; | |
635 } | |
636 | |
637 bool rawtextEndTagOpenState() { | |
638 var data = stream.char(); | |
639 if (isLetter(data)) { | |
640 temporaryBuffer = '${temporaryBuffer}$data'; | |
641 state = rawtextEndTagNameState; | |
642 } else { | |
643 _addToken(new CharactersToken("</")); | |
644 stream.unget(data); | |
645 state = rawtextState; | |
646 } | |
647 return true; | |
648 } | |
649 | |
650 bool rawtextEndTagNameState() { | |
651 var appropriate = _tokenIsAppropriate(); | |
652 var data = stream.char(); | |
653 if (isWhitespace(data) && appropriate) { | |
654 currentToken = new EndTagToken(temporaryBuffer); | |
655 state = beforeAttributeNameState; | |
656 } else if (data == "/" && appropriate) { | |
657 currentToken = new EndTagToken(temporaryBuffer); | |
658 state = selfClosingStartTagState; | |
659 } else if (data == ">" && appropriate) { | |
660 currentToken = new EndTagToken(temporaryBuffer); | |
661 emitCurrentToken(); | |
662 state = dataState; | |
663 } else if (isLetter(data)) { | |
664 temporaryBuffer = '${temporaryBuffer}$data'; | |
665 } else { | |
666 _addToken(new CharactersToken("</$temporaryBuffer")); | |
667 stream.unget(data); | |
668 state = rawtextState; | |
669 } | |
670 return true; | |
671 } | |
672 | |
673 bool scriptDataLessThanSignState() { | |
674 var data = stream.char(); | |
675 if (data == "/") { | |
676 temporaryBuffer = ""; | |
677 state = scriptDataEndTagOpenState; | |
678 } else if (data == "!") { | |
679 _addToken(new CharactersToken("<!")); | |
680 state = scriptDataEscapeStartState; | |
681 } else { | |
682 _addToken(new CharactersToken("<")); | |
683 stream.unget(data); | |
684 state = scriptDataState; | |
685 } | |
686 return true; | |
687 } | |
688 | |
689 bool scriptDataEndTagOpenState() { | |
690 var data = stream.char(); | |
691 if (isLetter(data)) { | |
692 temporaryBuffer = '${temporaryBuffer}$data'; | |
693 state = scriptDataEndTagNameState; | |
694 } else { | |
695 _addToken(new CharactersToken("</")); | |
696 stream.unget(data); | |
697 state = scriptDataState; | |
698 } | |
699 return true; | |
700 } | |
701 | |
702 bool scriptDataEndTagNameState() { | |
703 var appropriate = _tokenIsAppropriate(); | |
704 var data = stream.char(); | |
705 if (isWhitespace(data) && appropriate) { | |
706 currentToken = new EndTagToken(temporaryBuffer); | |
707 state = beforeAttributeNameState; | |
708 } else if (data == "/" && appropriate) { | |
709 currentToken = new EndTagToken(temporaryBuffer); | |
710 state = selfClosingStartTagState; | |
711 } else if (data == ">" && appropriate) { | |
712 currentToken = new EndTagToken(temporaryBuffer); | |
713 emitCurrentToken(); | |
714 state = dataState; | |
715 } else if (isLetter(data)) { | |
716 temporaryBuffer = '${temporaryBuffer}$data'; | |
717 } else { | |
718 _addToken(new CharactersToken("</$temporaryBuffer")); | |
719 stream.unget(data); | |
720 state = scriptDataState; | |
721 } | |
722 return true; | |
723 } | |
724 | |
725 bool scriptDataEscapeStartState() { | |
726 var data = stream.char(); | |
727 if (data == "-") { | |
728 _addToken(new CharactersToken("-")); | |
729 state = scriptDataEscapeStartDashState; | |
730 } else { | |
731 stream.unget(data); | |
732 state = scriptDataState; | |
733 } | |
734 return true; | |
735 } | |
736 | |
737 bool scriptDataEscapeStartDashState() { | |
738 var data = stream.char(); | |
739 if (data == "-") { | |
740 _addToken(new CharactersToken("-")); | |
741 state = scriptDataEscapedDashDashState; | |
742 } else { | |
743 stream.unget(data); | |
744 state = scriptDataState; | |
745 } | |
746 return true; | |
747 } | |
748 | |
749 bool scriptDataEscapedState() { | |
750 var data = stream.char(); | |
751 if (data == "-") { | |
752 _addToken(new CharactersToken("-")); | |
753 state = scriptDataEscapedDashState; | |
754 } else if (data == "<") { | |
755 state = scriptDataEscapedLessThanSignState; | |
756 } else if (data == "\u0000") { | |
757 _addToken(new ParseErrorToken("invalid-codepoint")); | |
758 _addToken(new CharactersToken("\uFFFD")); | |
759 } else if (data == EOF) { | |
760 state = dataState; | |
761 } else { | |
762 var chars = stream.charsUntil("<-\u0000"); | |
763 _addToken(new CharactersToken("${data}${chars}")); | |
764 } | |
765 return true; | |
766 } | |
767 | |
768 bool scriptDataEscapedDashState() { | |
769 var data = stream.char(); | |
770 if (data == "-") { | |
771 _addToken(new CharactersToken("-")); | |
772 state = scriptDataEscapedDashDashState; | |
773 } else if (data == "<") { | |
774 state = scriptDataEscapedLessThanSignState; | |
775 } else if (data == "\u0000") { | |
776 _addToken(new ParseErrorToken("invalid-codepoint")); | |
777 _addToken(new CharactersToken("\uFFFD")); | |
778 state = scriptDataEscapedState; | |
779 } else if (data == EOF) { | |
780 state = dataState; | |
781 } else { | |
782 _addToken(new CharactersToken(data)); | |
783 state = scriptDataEscapedState; | |
784 } | |
785 return true; | |
786 } | |
787 | |
788 bool scriptDataEscapedDashDashState() { | |
789 var data = stream.char(); | |
790 if (data == "-") { | |
791 _addToken(new CharactersToken("-")); | |
792 } else if (data == "<") { | |
793 state = scriptDataEscapedLessThanSignState; | |
794 } else if (data == ">") { | |
795 _addToken(new CharactersToken(">")); | |
796 state = scriptDataState; | |
797 } else if (data == "\u0000") { | |
798 _addToken(new ParseErrorToken("invalid-codepoint")); | |
799 _addToken(new CharactersToken("\uFFFD")); | |
800 state = scriptDataEscapedState; | |
801 } else if (data == EOF) { | |
802 state = dataState; | |
803 } else { | |
804 _addToken(new CharactersToken(data)); | |
805 state = scriptDataEscapedState; | |
806 } | |
807 return true; | |
808 } | |
809 | |
810 bool scriptDataEscapedLessThanSignState() { | |
811 var data = stream.char(); | |
812 if (data == "/") { | |
813 temporaryBuffer = ""; | |
814 state = scriptDataEscapedEndTagOpenState; | |
815 } else if (isLetter(data)) { | |
816 _addToken(new CharactersToken("<$data")); | |
817 temporaryBuffer = data; | |
818 state = scriptDataDoubleEscapeStartState; | |
819 } else { | |
820 _addToken(new CharactersToken("<")); | |
821 stream.unget(data); | |
822 state = scriptDataEscapedState; | |
823 } | |
824 return true; | |
825 } | |
826 | |
827 bool scriptDataEscapedEndTagOpenState() { | |
828 var data = stream.char(); | |
829 if (isLetter(data)) { | |
830 temporaryBuffer = data; | |
831 state = scriptDataEscapedEndTagNameState; | |
832 } else { | |
833 _addToken(new CharactersToken("</")); | |
834 stream.unget(data); | |
835 state = scriptDataEscapedState; | |
836 } | |
837 return true; | |
838 } | |
839 | |
840 bool scriptDataEscapedEndTagNameState() { | |
841 var appropriate = _tokenIsAppropriate(); | |
842 var data = stream.char(); | |
843 if (isWhitespace(data) && appropriate) { | |
844 currentToken = new EndTagToken(temporaryBuffer); | |
845 state = beforeAttributeNameState; | |
846 } else if (data == "/" && appropriate) { | |
847 currentToken = new EndTagToken(temporaryBuffer); | |
848 state = selfClosingStartTagState; | |
849 } else if (data == ">" && appropriate) { | |
850 currentToken = new EndTagToken(temporaryBuffer); | |
851 emitCurrentToken(); | |
852 state = dataState; | |
853 } else if (isLetter(data)) { | |
854 temporaryBuffer = '${temporaryBuffer}$data'; | |
855 } else { | |
856 _addToken(new CharactersToken("</$temporaryBuffer")); | |
857 stream.unget(data); | |
858 state = scriptDataEscapedState; | |
859 } | |
860 return true; | |
861 } | |
862 | |
863 bool scriptDataDoubleEscapeStartState() { | |
864 var data = stream.char(); | |
865 if (isWhitespace(data) || data == "/" || data == ">") { | |
866 _addToken(new CharactersToken(data)); | |
867 if (temporaryBuffer.toLowerCase() == "script") { | |
868 state = scriptDataDoubleEscapedState; | |
869 } else { | |
870 state = scriptDataEscapedState; | |
871 } | |
872 } else if (isLetter(data)) { | |
873 _addToken(new CharactersToken(data)); | |
874 temporaryBuffer = '${temporaryBuffer}$data'; | |
875 } else { | |
876 stream.unget(data); | |
877 state = scriptDataEscapedState; | |
878 } | |
879 return true; | |
880 } | |
881 | |
882 bool scriptDataDoubleEscapedState() { | |
883 var data = stream.char(); | |
884 if (data == "-") { | |
885 _addToken(new CharactersToken("-")); | |
886 state = scriptDataDoubleEscapedDashState; | |
887 } else if (data == "<") { | |
888 _addToken(new CharactersToken("<")); | |
889 state = scriptDataDoubleEscapedLessThanSignState; | |
890 } else if (data == "\u0000") { | |
891 _addToken(new ParseErrorToken("invalid-codepoint")); | |
892 _addToken(new CharactersToken("\uFFFD")); | |
893 } else if (data == EOF) { | |
894 _addToken(new ParseErrorToken("eof-in-script-in-script")); | |
895 state = dataState; | |
896 } else { | |
897 _addToken(new CharactersToken(data)); | |
898 } | |
899 return true; | |
900 } | |
901 | |
902 bool scriptDataDoubleEscapedDashState() { | |
903 var data = stream.char(); | |
904 if (data == "-") { | |
905 _addToken(new CharactersToken("-")); | |
906 state = scriptDataDoubleEscapedDashDashState; | |
907 } else if (data == "<") { | |
908 _addToken(new CharactersToken("<")); | |
909 state = scriptDataDoubleEscapedLessThanSignState; | |
910 } else if (data == "\u0000") { | |
911 _addToken(new ParseErrorToken("invalid-codepoint")); | |
912 _addToken(new CharactersToken("\uFFFD")); | |
913 state = scriptDataDoubleEscapedState; | |
914 } else if (data == EOF) { | |
915 _addToken(new ParseErrorToken("eof-in-script-in-script")); | |
916 state = dataState; | |
917 } else { | |
918 _addToken(new CharactersToken(data)); | |
919 state = scriptDataDoubleEscapedState; | |
920 } | |
921 return true; | |
922 } | |
923 | |
924 // TODO(jmesserly): report bug in original code | |
925 // (was "Dash" instead of "DashDash") | |
926 bool scriptDataDoubleEscapedDashDashState() { | |
927 var data = stream.char(); | |
928 if (data == "-") { | |
929 _addToken(new CharactersToken("-")); | |
930 } else if (data == "<") { | |
931 _addToken(new CharactersToken("<")); | |
932 state = scriptDataDoubleEscapedLessThanSignState; | |
933 } else if (data == ">") { | |
934 _addToken(new CharactersToken(">")); | |
935 state = scriptDataState; | |
936 } else if (data == "\u0000") { | |
937 _addToken(new ParseErrorToken("invalid-codepoint")); | |
938 _addToken(new CharactersToken("\uFFFD")); | |
939 state = scriptDataDoubleEscapedState; | |
940 } else if (data == EOF) { | |
941 _addToken(new ParseErrorToken("eof-in-script-in-script")); | |
942 state = dataState; | |
943 } else { | |
944 _addToken(new CharactersToken(data)); | |
945 state = scriptDataDoubleEscapedState; | |
946 } | |
947 return true; | |
948 } | |
949 | |
950 bool scriptDataDoubleEscapedLessThanSignState() { | |
951 var data = stream.char(); | |
952 if (data == "/") { | |
953 _addToken(new CharactersToken("/")); | |
954 temporaryBuffer = ""; | |
955 state = scriptDataDoubleEscapeEndState; | |
956 } else { | |
957 stream.unget(data); | |
958 state = scriptDataDoubleEscapedState; | |
959 } | |
960 return true; | |
961 } | |
962 | |
963 bool scriptDataDoubleEscapeEndState() { | |
964 var data = stream.char(); | |
965 if (isWhitespace(data) || data == "/" || data == ">") { | |
966 _addToken(new CharactersToken(data)); | |
967 if (temporaryBuffer.toLowerCase() == "script") { | |
968 state = scriptDataEscapedState; | |
969 } else { | |
970 state = scriptDataDoubleEscapedState; | |
971 } | |
972 } else if (isLetter(data)) { | |
973 _addToken(new CharactersToken(data)); | |
974 temporaryBuffer = '${temporaryBuffer}$data'; | |
975 } else { | |
976 stream.unget(data); | |
977 state = scriptDataDoubleEscapedState; | |
978 } | |
979 return true; | |
980 } | |
981 | |
982 bool beforeAttributeNameState() { | |
983 var data = stream.char(); | |
984 if (isWhitespace(data)) { | |
985 stream.charsUntil(spaceCharacters, true); | |
986 } else if (isLetter(data)) { | |
987 _addAttribute(data); | |
988 state = attributeNameState; | |
989 } else if (data == ">") { | |
990 emitCurrentToken(); | |
991 } else if (data == "/") { | |
992 state = selfClosingStartTagState; | |
993 } else if (data == EOF) { | |
994 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof")); | |
995 state = dataState; | |
996 } else if ("'\"=<".contains(data)) { | |
997 _addToken(new ParseErrorToken("invalid-character-in-attribute-name")); | |
998 _addAttribute(data); | |
999 state = attributeNameState; | |
1000 } else if (data == "\u0000") { | |
1001 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1002 _addAttribute("\uFFFD"); | |
1003 state = attributeNameState; | |
1004 } else { | |
1005 _addAttribute(data); | |
1006 state = attributeNameState; | |
1007 } | |
1008 return true; | |
1009 } | |
1010 | |
1011 bool attributeNameState() { | |
1012 var data = stream.char(); | |
1013 bool leavingThisState = true; | |
1014 bool emitToken = false; | |
1015 if (data == "=") { | |
1016 state = beforeAttributeValueState; | |
1017 } else if (isLetter(data)) { | |
1018 _attributeName = '$_attributeName$data' | |
1019 '${stream.charsUntil(asciiLetters, true)}'; | |
1020 leavingThisState = false; | |
1021 } else if (data == ">") { | |
1022 // XXX If we emit here the attributes are converted to a dict | |
1023 // without being checked and when the code below runs we error | |
1024 // because data is a dict not a list | |
1025 emitToken = true; | |
1026 } else if (isWhitespace(data)) { | |
1027 state = afterAttributeNameState; | |
1028 } else if (data == "/") { | |
1029 state = selfClosingStartTagState; | |
1030 } else if (data == "\u0000") { | |
1031 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1032 _attributeName = '${_attributeName}\uFFFD'; | |
1033 leavingThisState = false; | |
1034 } else if (data == EOF) { | |
1035 _addToken(new ParseErrorToken("eof-in-attribute-name")); | |
1036 state = dataState; | |
1037 } else if ("'\"<".contains(data)) { | |
1038 _addToken(new ParseErrorToken("invalid-character-in-attribute-name")); | |
1039 _attributeName = '$_attributeName$data'; | |
1040 leavingThisState = false; | |
1041 } else { | |
1042 _attributeName = '$_attributeName$data'; | |
1043 leavingThisState = false; | |
1044 } | |
1045 | |
1046 if (leavingThisState) { | |
1047 _markAttributeNameEnd(-1); | |
1048 | |
1049 // Attributes are not dropped at this stage. That happens when the | |
1050 // start tag token is emitted so values can still be safely appended | |
1051 // to attributes, but we do want to report the parse error in time. | |
1052 if (lowercaseAttrName) { | |
1053 _attributeName = asciiUpper2Lower(_attributeName); | |
1054 } | |
1055 if (_attributeNames == null) _attributeNames = new Set(); | |
1056 if (_attributeNames.contains(_attributeName)) { | |
1057 _addToken(new ParseErrorToken("duplicate-attribute")); | |
1058 } | |
1059 _attributeNames.add(_attributeName); | |
1060 | |
1061 // XXX Fix for above XXX | |
1062 if (emitToken) { | |
1063 emitCurrentToken(); | |
1064 } | |
1065 } | |
1066 return true; | |
1067 } | |
1068 | |
1069 bool afterAttributeNameState() { | |
1070 var data = stream.char(); | |
1071 if (isWhitespace(data)) { | |
1072 stream.charsUntil(spaceCharacters, true); | |
1073 } else if (data == "=") { | |
1074 state = beforeAttributeValueState; | |
1075 } else if (data == ">") { | |
1076 emitCurrentToken(); | |
1077 } else if (isLetter(data)) { | |
1078 _addAttribute(data); | |
1079 state = attributeNameState; | |
1080 } else if (data == "/") { | |
1081 state = selfClosingStartTagState; | |
1082 } else if (data == "\u0000") { | |
1083 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1084 _addAttribute("\uFFFD"); | |
1085 state = attributeNameState; | |
1086 } else if (data == EOF) { | |
1087 _addToken(new ParseErrorToken("expected-end-of-tag-but-got-eof")); | |
1088 state = dataState; | |
1089 } else if ("'\"<".contains(data)) { | |
1090 _addToken(new ParseErrorToken("invalid-character-after-attribute-name")); | |
1091 _addAttribute(data); | |
1092 state = attributeNameState; | |
1093 } else { | |
1094 _addAttribute(data); | |
1095 state = attributeNameState; | |
1096 } | |
1097 return true; | |
1098 } | |
1099 | |
1100 bool beforeAttributeValueState() { | |
1101 var data = stream.char(); | |
1102 if (isWhitespace(data)) { | |
1103 stream.charsUntil(spaceCharacters, true); | |
1104 } else if (data == "\"") { | |
1105 _markAttributeValueStart(0); | |
1106 state = attributeValueDoubleQuotedState; | |
1107 } else if (data == "&") { | |
1108 state = attributeValueUnQuotedState; | |
1109 stream.unget(data); | |
1110 _markAttributeValueStart(0); | |
1111 } else if (data == "'") { | |
1112 _markAttributeValueStart(0); | |
1113 state = attributeValueSingleQuotedState; | |
1114 } else if (data == ">") { | |
1115 _addToken(new ParseErrorToken( | |
1116 "expected-attribute-value-but-got-right-bracket")); | |
1117 emitCurrentToken(); | |
1118 } else if (data == "\u0000") { | |
1119 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1120 _markAttributeValueStart(-1); | |
1121 _attributeValue = '${_attributeValue}\uFFFD'; | |
1122 state = attributeValueUnQuotedState; | |
1123 } else if (data == EOF) { | |
1124 _addToken(new ParseErrorToken("expected-attribute-value-but-got-eof")); | |
1125 state = dataState; | |
1126 } else if ("=<`".contains(data)) { | |
1127 _addToken(new ParseErrorToken("equals-in-unquoted-attribute-value")); | |
1128 _markAttributeValueStart(-1); | |
1129 _attributeValue = '$_attributeValue$data'; | |
1130 state = attributeValueUnQuotedState; | |
1131 } else { | |
1132 _markAttributeValueStart(-1); | |
1133 _attributeValue = '$_attributeValue$data'; | |
1134 state = attributeValueUnQuotedState; | |
1135 } | |
1136 return true; | |
1137 } | |
1138 | |
1139 bool attributeValueDoubleQuotedState() { | |
1140 var data = stream.char(); | |
1141 if (data == "\"") { | |
1142 _markAttributeValueEnd(-1); | |
1143 _markAttributeEnd(0); | |
1144 state = afterAttributeValueState; | |
1145 } else if (data == "&") { | |
1146 processEntityInAttribute('"'); | |
1147 } else if (data == "\u0000") { | |
1148 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1149 _attributeValue = '${_attributeValue}\uFFFD'; | |
1150 } else if (data == EOF) { | |
1151 _addToken(new ParseErrorToken("eof-in-attribute-value-double-quote")); | |
1152 _markAttributeValueEnd(-1); | |
1153 state = dataState; | |
1154 } else { | |
1155 _attributeValue = '$_attributeValue$data${stream.charsUntil("\"&")}'; | |
1156 } | |
1157 return true; | |
1158 } | |
1159 | |
1160 bool attributeValueSingleQuotedState() { | |
1161 var data = stream.char(); | |
1162 if (data == "'") { | |
1163 _markAttributeValueEnd(-1); | |
1164 _markAttributeEnd(0); | |
1165 state = afterAttributeValueState; | |
1166 } else if (data == "&") { | |
1167 processEntityInAttribute("'"); | |
1168 } else if (data == "\u0000") { | |
1169 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1170 _attributeValue = '${_attributeValue}\uFFFD'; | |
1171 } else if (data == EOF) { | |
1172 _addToken(new ParseErrorToken("eof-in-attribute-value-single-quote")); | |
1173 _markAttributeValueEnd(-1); | |
1174 state = dataState; | |
1175 } else { | |
1176 _attributeValue = '$_attributeValue$data${stream.charsUntil("\'&")}'; | |
1177 } | |
1178 return true; | |
1179 } | |
1180 | |
1181 bool attributeValueUnQuotedState() { | |
1182 var data = stream.char(); | |
1183 if (isWhitespace(data)) { | |
1184 _markAttributeValueEnd(-1); | |
1185 state = beforeAttributeNameState; | |
1186 } else if (data == "&") { | |
1187 processEntityInAttribute(">"); | |
1188 } else if (data == ">") { | |
1189 _markAttributeValueEnd(-1); | |
1190 emitCurrentToken(); | |
1191 } else if (data == EOF) { | |
1192 _addToken(new ParseErrorToken("eof-in-attribute-value-no-quotes")); | |
1193 _markAttributeValueEnd(-1); | |
1194 state = dataState; | |
1195 } else if ('"\'=<`'.contains(data)) { | |
1196 _addToken(new ParseErrorToken( | |
1197 "unexpected-character-in-unquoted-attribute-value")); | |
1198 _attributeValue = '$_attributeValue$data'; | |
1199 } else if (data == "\u0000") { | |
1200 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1201 _attributeValue = '${_attributeValue}\uFFFD'; | |
1202 } else { | |
1203 _attributeValue = '$_attributeValue$data' | |
1204 '${stream.charsUntil("&>\"\'=<`$spaceCharacters")}'; | |
1205 } | |
1206 return true; | |
1207 } | |
1208 | |
1209 bool afterAttributeValueState() { | |
1210 var data = stream.char(); | |
1211 if (isWhitespace(data)) { | |
1212 state = beforeAttributeNameState; | |
1213 } else if (data == ">") { | |
1214 emitCurrentToken(); | |
1215 } else if (data == "/") { | |
1216 state = selfClosingStartTagState; | |
1217 } else if (data == EOF) { | |
1218 _addToken(new ParseErrorToken("unexpected-EOF-after-attribute-value")); | |
1219 stream.unget(data); | |
1220 state = dataState; | |
1221 } else { | |
1222 _addToken(new ParseErrorToken( | |
1223 "unexpected-character-after-attribute-value")); | |
1224 stream.unget(data); | |
1225 state = beforeAttributeNameState; | |
1226 } | |
1227 return true; | |
1228 } | |
1229 | |
1230 bool selfClosingStartTagState() { | |
1231 var data = stream.char(); | |
1232 if (data == ">") { | |
1233 currentTagToken.selfClosing = true; | |
1234 emitCurrentToken(); | |
1235 } else if (data == EOF) { | |
1236 _addToken(new ParseErrorToken("unexpected-EOF-after-solidus-in-tag")); | |
1237 stream.unget(data); | |
1238 state = dataState; | |
1239 } else { | |
1240 _addToken(new ParseErrorToken( | |
1241 "unexpected-character-after-soldius-in-tag")); | |
1242 stream.unget(data); | |
1243 state = beforeAttributeNameState; | |
1244 } | |
1245 return true; | |
1246 } | |
1247 | |
1248 bool bogusCommentState() { | |
1249 // Make a new comment token and give it as value all the characters | |
1250 // until the first > or EOF (charsUntil checks for EOF automatically) | |
1251 // and emit it. | |
1252 var data = stream.charsUntil(">"); | |
1253 data = data.replaceAll("\u0000", "\uFFFD"); | |
1254 _addToken(new CommentToken(data)); | |
1255 | |
1256 // Eat the character directly after the bogus comment which is either a | |
1257 // ">" or an EOF. | |
1258 stream.char(); | |
1259 state = dataState; | |
1260 return true; | |
1261 } | |
1262 | |
1263 bool markupDeclarationOpenState() { | |
1264 var charStack = [stream.char()]; | |
1265 if (charStack.last == "-") { | |
1266 charStack.add(stream.char()); | |
1267 if (charStack.last == "-") { | |
1268 currentToken = new CommentToken(""); | |
1269 state = commentStartState; | |
1270 return true; | |
1271 } | |
1272 } else if (charStack.last == 'd' || charStack.last == 'D') { | |
1273 var matched = true; | |
1274 for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) { | |
1275 var char = stream.char(); | |
1276 charStack.add(char); | |
1277 if (char == EOF || !expected.contains(char)) { | |
1278 matched = false; | |
1279 break; | |
1280 } | |
1281 } | |
1282 if (matched) { | |
1283 currentToken = new DoctypeToken(correct: true); | |
1284 state = doctypeState; | |
1285 return true; | |
1286 } | |
1287 } else if (charStack.last == "[" && | |
1288 parser != null && parser.tree.openElements.length > 0 && | |
1289 parser.tree.openElements.last.namespaceUri | |
1290 != parser.tree.defaultNamespace) { | |
1291 var matched = true; | |
1292 for (var expected in const ["C", "D", "A", "T", "A", "["]) { | |
1293 charStack.add(stream.char()); | |
1294 if (charStack.last != expected) { | |
1295 matched = false; | |
1296 break; | |
1297 } | |
1298 } | |
1299 if (matched) { | |
1300 state = cdataSectionState; | |
1301 return true; | |
1302 } | |
1303 } | |
1304 | |
1305 _addToken(new ParseErrorToken("expected-dashes-or-doctype")); | |
1306 | |
1307 while (charStack.length > 0) { | |
1308 stream.unget(charStack.removeLast()); | |
1309 } | |
1310 state = bogusCommentState; | |
1311 return true; | |
1312 } | |
1313 | |
1314 bool commentStartState() { | |
1315 var data = stream.char(); | |
1316 if (data == "-") { | |
1317 state = commentStartDashState; | |
1318 } else if (data == "\u0000") { | |
1319 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1320 currentStringToken.data = '${currentStringToken.data}\uFFFD'; | |
1321 } else if (data == ">") { | |
1322 _addToken(new ParseErrorToken("incorrect-comment")); | |
1323 _addToken(currentToken); | |
1324 state = dataState; | |
1325 } else if (data == EOF) { | |
1326 _addToken(new ParseErrorToken("eof-in-comment")); | |
1327 _addToken(currentToken); | |
1328 state = dataState; | |
1329 } else { | |
1330 currentStringToken.data = '${currentStringToken.data}$data'; | |
1331 state = commentState; | |
1332 } | |
1333 return true; | |
1334 } | |
1335 | |
1336 bool commentStartDashState() { | |
1337 var data = stream.char(); | |
1338 if (data == "-") { | |
1339 state = commentEndState; | |
1340 } else if (data == "\u0000") { | |
1341 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1342 currentStringToken.data = '${currentStringToken.data}-\uFFFD'; | |
1343 } else if (data == ">") { | |
1344 _addToken(new ParseErrorToken("incorrect-comment")); | |
1345 _addToken(currentToken); | |
1346 state = dataState; | |
1347 } else if (data == EOF) { | |
1348 _addToken(new ParseErrorToken("eof-in-comment")); | |
1349 _addToken(currentToken); | |
1350 state = dataState; | |
1351 } else { | |
1352 currentStringToken.data = '${currentStringToken.data}-${data}'; | |
1353 state = commentState; | |
1354 } | |
1355 return true; | |
1356 } | |
1357 | |
1358 bool commentState() { | |
1359 var data = stream.char(); | |
1360 if (data == "-") { | |
1361 state = commentEndDashState; | |
1362 } else if (data == "\u0000") { | |
1363 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1364 currentStringToken.data = '${currentStringToken.data}\uFFFD'; | |
1365 } else if (data == EOF) { | |
1366 _addToken(new ParseErrorToken("eof-in-comment")); | |
1367 _addToken(currentToken); | |
1368 state = dataState; | |
1369 } else { | |
1370 currentStringToken.data = '${currentStringToken.data}$data' | |
1371 '${stream.charsUntil("-\u0000")}'; | |
1372 } | |
1373 return true; | |
1374 } | |
1375 | |
1376 bool commentEndDashState() { | |
1377 var data = stream.char(); | |
1378 if (data == "-") { | |
1379 state = commentEndState; | |
1380 } else if (data == "\u0000") { | |
1381 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1382 currentStringToken.data = "${currentStringToken.data}-\uFFFD"; | |
1383 state = commentState; | |
1384 } else if (data == EOF) { | |
1385 _addToken(new ParseErrorToken("eof-in-comment-end-dash")); | |
1386 _addToken(currentToken); | |
1387 state = dataState; | |
1388 } else { | |
1389 currentStringToken.data = "${currentStringToken.data}-${data}"; | |
1390 state = commentState; | |
1391 } | |
1392 return true; | |
1393 } | |
1394 | |
1395 bool commentEndState() { | |
1396 var data = stream.char(); | |
1397 if (data == ">") { | |
1398 _addToken(currentToken); | |
1399 state = dataState; | |
1400 } else if (data == "\u0000") { | |
1401 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1402 currentStringToken.data = '${currentStringToken.data}--\uFFFD'; | |
1403 state = commentState; | |
1404 } else if (data == "!") { | |
1405 _addToken(new ParseErrorToken( | |
1406 "unexpected-bang-after-double-dash-in-comment")); | |
1407 state = commentEndBangState; | |
1408 } else if (data == "-") { | |
1409 _addToken(new ParseErrorToken( | |
1410 "unexpected-dash-after-double-dash-in-comment")); | |
1411 currentStringToken.data = '${currentStringToken.data}$data'; | |
1412 } else if (data == EOF) { | |
1413 _addToken(new ParseErrorToken("eof-in-comment-double-dash")); | |
1414 _addToken(currentToken); | |
1415 state = dataState; | |
1416 } else { | |
1417 // XXX | |
1418 _addToken(new ParseErrorToken("unexpected-char-in-comment")); | |
1419 currentStringToken.data = "${currentStringToken.data}--${data}"; | |
1420 state = commentState; | |
1421 } | |
1422 return true; | |
1423 } | |
1424 | |
1425 bool commentEndBangState() { | |
1426 var data = stream.char(); | |
1427 if (data == ">") { | |
1428 _addToken(currentToken); | |
1429 state = dataState; | |
1430 } else if (data == "-") { | |
1431 currentStringToken.data = '${currentStringToken.data}--!'; | |
1432 state = commentEndDashState; | |
1433 } else if (data == "\u0000") { | |
1434 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1435 currentStringToken.data = '${currentStringToken.data}--!\uFFFD'; | |
1436 state = commentState; | |
1437 } else if (data == EOF) { | |
1438 _addToken(new ParseErrorToken("eof-in-comment-end-bang-state")); | |
1439 _addToken(currentToken); | |
1440 state = dataState; | |
1441 } else { | |
1442 currentStringToken.data = "${currentStringToken.data}--!${data}"; | |
1443 state = commentState; | |
1444 } | |
1445 return true; | |
1446 } | |
1447 | |
1448 bool doctypeState() { | |
1449 var data = stream.char(); | |
1450 if (isWhitespace(data)) { | |
1451 state = beforeDoctypeNameState; | |
1452 } else if (data == EOF) { | |
1453 _addToken(new ParseErrorToken( | |
1454 "expected-doctype-name-but-got-eof")); | |
1455 currentDoctypeToken.correct = false; | |
1456 _addToken(currentToken); | |
1457 state = dataState; | |
1458 } else { | |
1459 _addToken(new ParseErrorToken("need-space-after-doctype")); | |
1460 stream.unget(data); | |
1461 state = beforeDoctypeNameState; | |
1462 } | |
1463 return true; | |
1464 } | |
1465 | |
1466 bool beforeDoctypeNameState() { | |
1467 var data = stream.char(); | |
1468 if (isWhitespace(data)) { | |
1469 return true; | |
1470 } else if (data == ">") { | |
1471 _addToken(new ParseErrorToken( | |
1472 "expected-doctype-name-but-got-right-bracket")); | |
1473 currentDoctypeToken.correct = false; | |
1474 _addToken(currentToken); | |
1475 state = dataState; | |
1476 } else if (data == "\u0000") { | |
1477 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1478 currentDoctypeToken.name = "\uFFFD"; | |
1479 state = doctypeNameState; | |
1480 } else if (data == EOF) { | |
1481 _addToken(new ParseErrorToken( | |
1482 "expected-doctype-name-but-got-eof")); | |
1483 currentDoctypeToken.correct = false; | |
1484 _addToken(currentToken); | |
1485 state = dataState; | |
1486 } else { | |
1487 currentDoctypeToken.name = data; | |
1488 state = doctypeNameState; | |
1489 } | |
1490 return true; | |
1491 } | |
1492 | |
1493 bool doctypeNameState() { | |
1494 var data = stream.char(); | |
1495 if (isWhitespace(data)) { | |
1496 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name); | |
1497 state = afterDoctypeNameState; | |
1498 } else if (data == ">") { | |
1499 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name); | |
1500 _addToken(currentToken); | |
1501 state = dataState; | |
1502 } else if (data == "\u0000") { | |
1503 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1504 currentDoctypeToken.name = "${currentDoctypeToken.name}\uFFFD"; | |
1505 state = doctypeNameState; | |
1506 } else if (data == EOF) { | |
1507 _addToken(new ParseErrorToken("eof-in-doctype-name")); | |
1508 currentDoctypeToken.correct = false; | |
1509 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name); | |
1510 _addToken(currentToken); | |
1511 state = dataState; | |
1512 } else { | |
1513 currentDoctypeToken.name = '${currentDoctypeToken.name}$data'; | |
1514 } | |
1515 return true; | |
1516 } | |
1517 | |
1518 bool afterDoctypeNameState() { | |
1519 var data = stream.char(); | |
1520 if (isWhitespace(data)) { | |
1521 return true; | |
1522 } else if (data == ">") { | |
1523 _addToken(currentToken); | |
1524 state = dataState; | |
1525 } else if (data == EOF) { | |
1526 currentDoctypeToken.correct = false; | |
1527 stream.unget(data); | |
1528 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1529 _addToken(currentToken); | |
1530 state = dataState; | |
1531 } else { | |
1532 if (data == "p" || data == "P") { | |
1533 // TODO(jmesserly): would be nice to have a helper for this. | |
1534 var matched = true; | |
1535 for (var expected in const ["uU", "bB", "lL", "iI", "cC"]) { | |
1536 data = stream.char(); | |
1537 if (data == EOF || !expected.contains(data)) { | |
1538 matched = false; | |
1539 break; | |
1540 } | |
1541 } | |
1542 if (matched) { | |
1543 state = afterDoctypePublicKeywordState; | |
1544 return true; | |
1545 } | |
1546 } else if (data == "s" || data == "S") { | |
1547 var matched = true; | |
1548 for (var expected in const ["yY", "sS", "tT", "eE", "mM"]) { | |
1549 data = stream.char(); | |
1550 if (data == EOF || !expected.contains(data)) { | |
1551 matched = false; | |
1552 break; | |
1553 } | |
1554 } | |
1555 if (matched) { | |
1556 state = afterDoctypeSystemKeywordState; | |
1557 return true; | |
1558 } | |
1559 } | |
1560 | |
1561 // All the characters read before the current 'data' will be | |
1562 // [a-zA-Z], so they're garbage in the bogus doctype and can be | |
1563 // discarded; only the latest character might be '>' or EOF | |
1564 // and needs to be ungetted | |
1565 stream.unget(data); | |
1566 _addToken(new ParseErrorToken( | |
1567 "expected-space-or-right-bracket-in-doctype", | |
1568 messageParams: {"data": data})); | |
1569 currentDoctypeToken.correct = false; | |
1570 state = bogusDoctypeState; | |
1571 } | |
1572 return true; | |
1573 } | |
1574 | |
1575 bool afterDoctypePublicKeywordState() { | |
1576 var data = stream.char(); | |
1577 if (isWhitespace(data)) { | |
1578 state = beforeDoctypePublicIdentifierState; | |
1579 } else if (data == "'" || data == '"') { | |
1580 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1581 stream.unget(data); | |
1582 state = beforeDoctypePublicIdentifierState; | |
1583 } else if (data == EOF) { | |
1584 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1585 currentDoctypeToken.correct = false; | |
1586 _addToken(currentToken); | |
1587 state = dataState; | |
1588 } else { | |
1589 stream.unget(data); | |
1590 state = beforeDoctypePublicIdentifierState; | |
1591 } | |
1592 return true; | |
1593 } | |
1594 | |
1595 bool beforeDoctypePublicIdentifierState() { | |
1596 var data = stream.char(); | |
1597 if (isWhitespace(data)) { | |
1598 return true; | |
1599 } else if (data == "\"") { | |
1600 currentDoctypeToken.publicId = ""; | |
1601 state = doctypePublicIdentifierDoubleQuotedState; | |
1602 } else if (data == "'") { | |
1603 currentDoctypeToken.publicId = ""; | |
1604 state = doctypePublicIdentifierSingleQuotedState; | |
1605 } else if (data == ">") { | |
1606 _addToken(new ParseErrorToken("unexpected-end-of-doctype")); | |
1607 currentDoctypeToken.correct = false; | |
1608 _addToken(currentToken); | |
1609 state = dataState; | |
1610 } else if (data == EOF) { | |
1611 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1612 currentDoctypeToken.correct = false; | |
1613 _addToken(currentToken); | |
1614 state = dataState; | |
1615 } else { | |
1616 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1617 currentDoctypeToken.correct = false; | |
1618 state = bogusDoctypeState; | |
1619 } | |
1620 return true; | |
1621 } | |
1622 | |
1623 bool doctypePublicIdentifierDoubleQuotedState() { | |
1624 var data = stream.char(); | |
1625 if (data == '"') { | |
1626 state = afterDoctypePublicIdentifierState; | |
1627 } else if (data == "\u0000") { | |
1628 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1629 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD"; | |
1630 } else if (data == ">") { | |
1631 _addToken(new ParseErrorToken("unexpected-end-of-doctype")); | |
1632 currentDoctypeToken.correct = false; | |
1633 _addToken(currentToken); | |
1634 state = dataState; | |
1635 } else if (data == EOF) { | |
1636 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1637 currentDoctypeToken.correct = false; | |
1638 _addToken(currentToken); | |
1639 state = dataState; | |
1640 } else { | |
1641 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data'; | |
1642 } | |
1643 return true; | |
1644 } | |
1645 | |
1646 bool doctypePublicIdentifierSingleQuotedState() { | |
1647 var data = stream.char(); | |
1648 if (data == "'") { | |
1649 state = afterDoctypePublicIdentifierState; | |
1650 } else if (data == "\u0000") { | |
1651 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1652 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD"; | |
1653 } else if (data == ">") { | |
1654 _addToken(new ParseErrorToken("unexpected-end-of-doctype")); | |
1655 currentDoctypeToken.correct = false; | |
1656 _addToken(currentToken); | |
1657 state = dataState; | |
1658 } else if (data == EOF) { | |
1659 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1660 currentDoctypeToken.correct = false; | |
1661 _addToken(currentToken); | |
1662 state = dataState; | |
1663 } else { | |
1664 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data'; | |
1665 } | |
1666 return true; | |
1667 } | |
1668 | |
1669 bool afterDoctypePublicIdentifierState() { | |
1670 var data = stream.char(); | |
1671 if (isWhitespace(data)) { | |
1672 state = betweenDoctypePublicAndSystemIdentifiersState; | |
1673 } else if (data == ">") { | |
1674 _addToken(currentToken); | |
1675 state = dataState; | |
1676 } else if (data == '"') { | |
1677 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1678 currentDoctypeToken.systemId = ""; | |
1679 state = doctypeSystemIdentifierDoubleQuotedState; | |
1680 } else if (data == "'") { | |
1681 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1682 currentDoctypeToken.systemId = ""; | |
1683 state = doctypeSystemIdentifierSingleQuotedState; | |
1684 } else if (data == EOF) { | |
1685 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1686 currentDoctypeToken.correct = false; | |
1687 _addToken(currentToken); | |
1688 state = dataState; | |
1689 } else { | |
1690 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1691 currentDoctypeToken.correct = false; | |
1692 state = bogusDoctypeState; | |
1693 } | |
1694 return true; | |
1695 } | |
1696 | |
1697 bool betweenDoctypePublicAndSystemIdentifiersState() { | |
1698 var data = stream.char(); | |
1699 if (isWhitespace(data)) { | |
1700 return true; | |
1701 } else if (data == ">") { | |
1702 _addToken(currentToken); | |
1703 state = dataState; | |
1704 } else if (data == '"') { | |
1705 currentDoctypeToken.systemId = ""; | |
1706 state = doctypeSystemIdentifierDoubleQuotedState; | |
1707 } else if (data == "'") { | |
1708 currentDoctypeToken.systemId = ""; | |
1709 state = doctypeSystemIdentifierSingleQuotedState; | |
1710 } else if (data == EOF) { | |
1711 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1712 currentDoctypeToken.correct = false; | |
1713 _addToken(currentToken); | |
1714 state = dataState; | |
1715 } else { | |
1716 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1717 currentDoctypeToken.correct = false; | |
1718 state = bogusDoctypeState; | |
1719 } | |
1720 return true; | |
1721 } | |
1722 | |
1723 bool afterDoctypeSystemKeywordState() { | |
1724 var data = stream.char(); | |
1725 if (isWhitespace(data)) { | |
1726 state = beforeDoctypeSystemIdentifierState; | |
1727 } else if (data == "'" || data == '"') { | |
1728 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1729 stream.unget(data); | |
1730 state = beforeDoctypeSystemIdentifierState; | |
1731 } else if (data == EOF) { | |
1732 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1733 currentDoctypeToken.correct = false; | |
1734 _addToken(currentToken); | |
1735 state = dataState; | |
1736 } else { | |
1737 stream.unget(data); | |
1738 state = beforeDoctypeSystemIdentifierState; | |
1739 } | |
1740 return true; | |
1741 } | |
1742 | |
1743 bool beforeDoctypeSystemIdentifierState() { | |
1744 var data = stream.char(); | |
1745 if (isWhitespace(data)) { | |
1746 return true; | |
1747 } else if (data == "\"") { | |
1748 currentDoctypeToken.systemId = ""; | |
1749 state = doctypeSystemIdentifierDoubleQuotedState; | |
1750 } else if (data == "'") { | |
1751 currentDoctypeToken.systemId = ""; | |
1752 state = doctypeSystemIdentifierSingleQuotedState; | |
1753 } else if (data == ">") { | |
1754 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1755 currentDoctypeToken.correct = false; | |
1756 _addToken(currentToken); | |
1757 state = dataState; | |
1758 } else if (data == EOF) { | |
1759 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1760 currentDoctypeToken.correct = false; | |
1761 _addToken(currentToken); | |
1762 state = dataState; | |
1763 } else { | |
1764 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1765 currentDoctypeToken.correct = false; | |
1766 state = bogusDoctypeState; | |
1767 } | |
1768 return true; | |
1769 } | |
1770 | |
1771 bool doctypeSystemIdentifierDoubleQuotedState() { | |
1772 var data = stream.char(); | |
1773 if (data == "\"") { | |
1774 state = afterDoctypeSystemIdentifierState; | |
1775 } else if (data == "\u0000") { | |
1776 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1777 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD"; | |
1778 } else if (data == ">") { | |
1779 _addToken(new ParseErrorToken("unexpected-end-of-doctype")); | |
1780 currentDoctypeToken.correct = false; | |
1781 _addToken(currentToken); | |
1782 state = dataState; | |
1783 } else if (data == EOF) { | |
1784 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1785 currentDoctypeToken.correct = false; | |
1786 _addToken(currentToken); | |
1787 state = dataState; | |
1788 } else { | |
1789 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data'; | |
1790 } | |
1791 return true; | |
1792 } | |
1793 | |
1794 bool doctypeSystemIdentifierSingleQuotedState() { | |
1795 var data = stream.char(); | |
1796 if (data == "'") { | |
1797 state = afterDoctypeSystemIdentifierState; | |
1798 } else if (data == "\u0000") { | |
1799 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1800 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD"; | |
1801 } else if (data == ">") { | |
1802 _addToken(new ParseErrorToken("unexpected-end-of-doctype")); | |
1803 currentDoctypeToken.correct = false; | |
1804 _addToken(currentToken); | |
1805 state = dataState; | |
1806 } else if (data == EOF) { | |
1807 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1808 currentDoctypeToken.correct = false; | |
1809 _addToken(currentToken); | |
1810 state = dataState; | |
1811 } else { | |
1812 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data'; | |
1813 } | |
1814 return true; | |
1815 } | |
1816 | |
1817 bool afterDoctypeSystemIdentifierState() { | |
1818 var data = stream.char(); | |
1819 if (isWhitespace(data)) { | |
1820 return true; | |
1821 } else if (data == ">") { | |
1822 _addToken(currentToken); | |
1823 state = dataState; | |
1824 } else if (data == EOF) { | |
1825 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1826 currentDoctypeToken.correct = false; | |
1827 _addToken(currentToken); | |
1828 state = dataState; | |
1829 } else { | |
1830 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1831 state = bogusDoctypeState; | |
1832 } | |
1833 return true; | |
1834 } | |
1835 | |
1836 bool bogusDoctypeState() { | |
1837 var data = stream.char(); | |
1838 if (data == ">") { | |
1839 _addToken(currentToken); | |
1840 state = dataState; | |
1841 } else if (data == EOF) { | |
1842 // XXX EMIT | |
1843 stream.unget(data); | |
1844 _addToken(currentToken); | |
1845 state = dataState; | |
1846 } | |
1847 return true; | |
1848 } | |
1849 | |
1850 bool cdataSectionState() { | |
1851 var data = []; | |
1852 int matchedEnd = 0; | |
1853 while (true) { | |
1854 var ch = stream.char(); | |
1855 if (ch == EOF) { | |
1856 break; | |
1857 } | |
1858 // Deal with null here rather than in the parser | |
1859 if (ch == "\u0000") { | |
1860 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1861 ch = "\uFFFD"; | |
1862 } | |
1863 data.add(ch); | |
1864 // TODO(jmesserly): it'd be nice if we had an easier way to match the end, | |
1865 // perhaps with a "peek" API. | |
1866 if (ch == "]" && matchedEnd < 2) { | |
1867 matchedEnd++; | |
1868 } else if (ch == ">" && matchedEnd == 2) { | |
1869 // Remove "]]>" from the end. | |
1870 data.removeLast(); | |
1871 data.removeLast(); | |
1872 data.removeLast(); | |
1873 break; | |
1874 } else { | |
1875 matchedEnd = 0; | |
1876 } | |
1877 } | |
1878 | |
1879 if (data.length > 0) { | |
1880 _addToken(new CharactersToken(data.join())); | |
1881 } | |
1882 state = dataState; | |
1883 return true; | |
1884 } | |
1885 } | |
1886 | |
OLD | NEW |