OLD | NEW |
| (Empty) |
1 library tokenizer; | |
2 | |
3 import 'dart:collection'; | |
4 import 'package:html/parser.dart' show HtmlParser; | |
5 import 'constants.dart'; | |
6 import 'inputstream.dart'; | |
7 import 'token.dart'; | |
8 import 'utils.dart'; | |
9 | |
10 // Group entities by their first character, for faster lookups | |
11 | |
12 // TODO(jmesserly): we could use a better data structure here like a trie, if | |
13 // we had it implemented in Dart. | |
/// Entities grouped by their first character, for faster prefix lookups
/// when consuming a named character reference.
Map<String, List<String>> entitiesByFirstChar = (() {
  // Use a typed literal: a bare `{}` would create a Map<dynamic, dynamic>,
  // which does not soundly match the declared Map<String, List<String>>.
  var result = <String, List<String>>{};
  for (var k in entities.keys) {
    result.putIfAbsent(k[0], () => []).add(k);
  }
  return result;
})();
21 | |
22 // TODO(jmesserly): lots of ways to make this faster: | |
23 // - use char codes everywhere instead of 1-char strings | |
24 // - use switch instead of contains, indexOf | |
25 // - use switch instead of the sequential if tests | |
26 // - avoid string concat | |
27 | |
28 /// This class takes care of tokenizing HTML. | |
29 class HtmlTokenizer implements Iterator<Token> { | |
30 // TODO(jmesserly): a lot of these could be made private | |
31 | |
  /// The preprocessed input stream the tokenizer reads characters from.
  final HtmlInputStream stream;

  /// Whether tag names are lowercased while tokenizing.
  final bool lowercaseElementName;

  /// Whether attribute names are lowercased while tokenizing.
  final bool lowercaseAttrName;

  /// True to generate spans in for [Token.span].
  final bool generateSpans;

  /// True to generate spans for attributes.
  final bool attributeSpans;

  /// This reference to the parser is used for correct CDATA handling.
  /// The [HtmlParser] will set this at construction time.
  HtmlParser parser;

  /// Tokens produced by the state machine but not yet handed out
  /// by [moveNext].
  final Queue<Token> tokenQueue;

  /// Holds the token that is currently being processed.
  Token currentToken;

  /// Holds a reference to the method to be invoked for the next parser state.
  // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode
  // bug prevents us from doing that. See http://dartbug.com/12465
  Function state;

  // Scratch buffer shared by the states (e.g. pending end tag names).
  final StringBuffer _buffer = new StringBuffer();

  // Offset where the span of the next emitted token starts.
  int _lastOffset;

  // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add
  // an item until it's ready. But the code doesn't have a clear notion of when
  // it's "done" with the attribute.
  List<TagAttribute> _attributes;
  Set<String> _attributeNames;
67 | |
  /// Creates a tokenizer over [doc]; the value is passed straight through to
  /// [HtmlInputStream] (see that class for the accepted input types), along
  /// with [encoding], [parseMeta], [generateSpans] and [sourceUrl].
  /// Starts in the data state via [reset].
  HtmlTokenizer(doc, {String encoding, bool parseMeta: true,
      this.lowercaseElementName: true, this.lowercaseAttrName: true,
      bool generateSpans: false, String sourceUrl, this.attributeSpans: false})
      : stream = new HtmlInputStream(
            doc, encoding, parseMeta, generateSpans, sourceUrl),
        tokenQueue = new Queue(),
        generateSpans = generateSpans {
    reset();
  }
77 | |
  // Typed views of [currentToken]; these rely on an implicit downcast, so
  // callers must only use the one matching the current token's kind.
  TagToken get currentTagToken => currentToken;
  DoctypeToken get currentDoctypeToken => currentToken;
  StringToken get currentStringToken => currentToken;

  // The token most recently produced by [moveNext] (Iterator contract).
  Token _current;
  Token get current => _current;

  // Accumulators for the name and value of the attribute being parsed.
  final StringBuffer _attributeName = new StringBuffer();
  final StringBuffer _attributeValue = new StringBuffer();
87 | |
  // Flushes the accumulated value into the last attribute and, when spans
  // are enabled, records its end position. [offset] is added to the current
  // stream position.
  void _markAttributeEnd(int offset) {
    _attributes.last.value = '$_attributeValue';
    if (attributeSpans) _attributes.last.end = stream.position + offset;
  }

  // Records where the last attribute's value starts, when spans are enabled.
  void _markAttributeValueStart(int offset) {
    if (attributeSpans) _attributes.last.startValue = stream.position + offset;
  }

  // Records where the last attribute's value ends, then finishes the
  // attribute via [_markAttributeEnd].
  void _markAttributeValueEnd(int offset) {
    if (attributeSpans) _attributes.last.endValue = stream.position + offset;
    _markAttributeEnd(offset);
  }

  // Note: we could track the name span here, if we need it.
  void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset);
104 | |
105 void _addAttribute(String name) { | |
106 if (_attributes == null) _attributes = []; | |
107 _attributeName.clear(); | |
108 _attributeName.write(name); | |
109 _attributeValue.clear(); | |
110 var attr = new TagAttribute(); | |
111 _attributes.add(attr); | |
112 if (attributeSpans) attr.start = stream.position - name.length; | |
113 } | |
114 | |
115 /// This is where the magic happens. | |
116 /// | |
117 /// We do our usually processing through the states and when we have a token | |
118 /// to return we yield the token which pauses processing until the next token | |
119 /// is requested. | |
120 bool moveNext() { | |
121 // Start processing. When EOF is reached state will return false; | |
122 // instead of true and the loop will terminate. | |
123 while (stream.errors.length == 0 && tokenQueue.length == 0) { | |
124 if (!state()) { | |
125 _current = null; | |
126 return false; | |
127 } | |
128 } | |
129 if (stream.errors.length > 0) { | |
130 _current = new ParseErrorToken(stream.errors.removeFirst()); | |
131 } else { | |
132 assert(tokenQueue.length > 0); | |
133 _current = tokenQueue.removeFirst(); | |
134 } | |
135 return true; | |
136 } | |
137 | |
138 /// Resets the tokenizer state. Calling this does not reset the [stream] or | |
139 /// the [parser]. | |
140 void reset() { | |
141 _lastOffset = 0; | |
142 tokenQueue.clear(); | |
143 currentToken = null; | |
144 _buffer.clear(); | |
145 _attributes = null; | |
146 _attributeNames = null; | |
147 state = dataState; | |
148 } | |
149 | |
150 /// Adds a token to the queue. Sets the span if needed. | |
151 void _addToken(Token token) { | |
152 if (generateSpans && token.span == null) { | |
153 int offset = stream.position; | |
154 token.span = stream.fileInfo.span(_lastOffset, offset); | |
155 if (token is! ParseErrorToken) { | |
156 _lastOffset = offset; | |
157 } | |
158 } | |
159 tokenQueue.add(token); | |
160 } | |
161 | |
  /// This function returns either U+FFFD or the character based on the
  /// decimal or hexadecimal representation. It also discards ";" if present.
  /// If not present it will add a [ParseErrorToken].
  String consumeNumberEntity(bool isHex) {
    // Select the digit predicate and radix for decimal vs. hex references.
    var allowed = isDigit;
    var radix = 10;
    if (isHex) {
      allowed = isHexDigit;
      radix = 16;
    }

    var charStack = [];

    // Consume all the characters that are in range while making sure we
    // don't hit an EOF.
    var c = stream.char();
    while (allowed(c) && c != EOF) {
      charStack.add(c);
      c = stream.char();
    }

    // Convert the set of characters consumed to an int.
    var charAsInt = parseIntRadix(charStack.join(), radix);

    // Certain characters get replaced with others (see the
    // replacementCharacters table in constants.dart).
    var char = replacementCharacters[charAsInt];
    if (char != null) {
      _addToken(new ParseErrorToken("illegal-codepoint-for-numeric-entity",
          messageParams: {"charAsInt": charAsInt}));
    } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF) ||
        (charAsInt > 0x10FFFF)) {
      // Surrogate halves and out-of-range values are replaced entirely.
      char = "\uFFFD";
      _addToken(new ParseErrorToken("illegal-codepoint-for-numeric-entity",
          messageParams: {"charAsInt": charAsInt}));
    } else {
      // These code points are kept in the output but still flagged as
      // parse errors.
      // Should speed up this check somehow (e.g. move the set to a constant)
      if ((0x0001 <= charAsInt && charAsInt <= 0x0008) ||
          (0x000E <= charAsInt && charAsInt <= 0x001F) ||
          (0x007F <= charAsInt && charAsInt <= 0x009F) ||
          (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) ||
          const [
        0x000B,
        0xFFFE,
        0xFFFF,
        0x1FFFE,
        0x1FFFF,
        0x2FFFE,
        0x2FFFF,
        0x3FFFE,
        0x3FFFF,
        0x4FFFE,
        0x4FFFF,
        0x5FFFE,
        0x5FFFF,
        0x6FFFE,
        0x6FFFF,
        0x7FFFE,
        0x7FFFF,
        0x8FFFE,
        0x8FFFF,
        0x9FFFE,
        0x9FFFF,
        0xAFFFE,
        0xAFFFF,
        0xBFFFE,
        0xBFFFF,
        0xCFFFE,
        0xCFFFF,
        0xDFFFE,
        0xDFFFF,
        0xEFFFE,
        0xEFFFF,
        0xFFFFE,
        0xFFFFF,
        0x10FFFE,
        0x10FFFF
      ].contains(charAsInt)) {
        _addToken(new ParseErrorToken("illegal-codepoint-for-numeric-entity",
            messageParams: {"charAsInt": charAsInt}));
      }
      char = new String.fromCharCodes([charAsInt]);
    }

    // Discard the ; if present. Otherwise, put it back on the queue and
    // invoke parseError on parser.
    if (c != ";") {
      _addToken(new ParseErrorToken("numeric-entity-without-semicolon"));
      stream.unget(c);
    }
    return char;
  }
253 | |
  /// Consumes a character reference after an already-consumed "&".
  ///
  /// Numeric references delegate to [consumeNumberEntity]; named references
  /// are resolved to the longest matching entity name. The result is appended
  /// to the pending attribute value when [fromAttribute] is true, otherwise
  /// emitted as a characters/space token. When the character following the
  /// "&" equals [allowedChar], the "&" is treated as literal text (used for
  /// the extra allowed character in attribute-value states).
  void consumeEntity({String allowedChar, bool fromAttribute: false}) {
    // Initialise to the default output for when no entity is matched
    var output = "&";

    var charStack = [stream.char()];
    if (isWhitespace(charStack[0]) ||
        charStack[0] == '<' ||
        charStack[0] == '&' ||
        charStack[0] == EOF ||
        allowedChar == charStack[0]) {
      // Not a character reference; leave the "&" as-is.
      stream.unget(charStack[0]);
    } else if (charStack[0] == "#") {
      // Read the next character to see if it's hex or decimal
      bool hex = false;
      charStack.add(stream.char());
      if (charStack.last == 'x' || charStack.last == 'X') {
        hex = true;
        charStack.add(stream.char());
      }

      // charStack.last should be the first digit
      if (hex && isHexDigit(charStack.last) ||
          (!hex && isDigit(charStack.last))) {
        // At least one digit found, so consume the whole number
        stream.unget(charStack.last);
        output = consumeNumberEntity(hex);
      } else {
        // No digits found
        _addToken(new ParseErrorToken("expected-numeric-entity"));
        stream.unget(charStack.removeLast());
        output = "&${charStack.join()}";
      }
    } else {
      // At this point in the process might have named entity. Entities
      // are stored in the global variable "entities".
      //
      // Consume characters and compare to these to a substring of the
      // entity names in the list until the substring no longer matches.
      var filteredEntityList = entitiesByFirstChar[charStack[0]];
      if (filteredEntityList == null) filteredEntityList = const [];

      while (charStack.last != EOF) {
        var name = charStack.join();
        filteredEntityList =
            filteredEntityList.where((e) => e.startsWith(name)).toList();

        if (filteredEntityList.length == 0) {
          break;
        }
        charStack.add(stream.char());
      }

      // At this point we have a string that starts with some characters
      // that may match an entity
      String entityName = null;

      // Try to find the longest entity the string will match to take care
      // of &noti for instance.

      int entityLen;
      for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) {
        var possibleEntityName = charStack.sublist(0, entityLen).join();
        if (entities.containsKey(possibleEntityName)) {
          entityName = possibleEntityName;
          break;
        }
      }

      if (entityName != null) {
        var lastChar = entityName[entityName.length - 1];
        if (lastChar != ";") {
          _addToken(new ParseErrorToken("named-entity-without-semicolon"));
        }
        // In attributes, a semicolon-less entity followed by an
        // alphanumeric or "=" is treated as literal text.
        if (lastChar != ";" &&
            fromAttribute &&
            (isLetterOrDigit(charStack[entityLen]) ||
                charStack[entityLen] == '=')) {
          stream.unget(charStack.removeLast());
          output = "&${charStack.join()}";
        } else {
          // Replace the matched name and keep any trailing characters
          // that were consumed past it.
          output = entities[entityName];
          stream.unget(charStack.removeLast());
          output = '${output}${slice(charStack, entityLen).join()}';
        }
      } else {
        _addToken(new ParseErrorToken("expected-named-entity"));
        stream.unget(charStack.removeLast());
        output = "&${charStack.join()}";
      }
    }
    if (fromAttribute) {
      _attributeValue.write(output);
    } else {
      var token;
      if (isWhitespace(output)) {
        token = new SpaceCharactersToken(output);
      } else {
        token = new CharactersToken(output);
      }
      _addToken(token);
    }
  }
356 | |
357 /// This method replaces the need for "entityInAttributeValueState". | |
358 void processEntityInAttribute(String allowedChar) { | |
359 consumeEntity(allowedChar: allowedChar, fromAttribute: true); | |
360 } | |
361 | |
362 /// This method is a generic handler for emitting the tags. It also sets | |
363 /// the state to "data" because that's what's needed after a token has been | |
364 /// emitted. | |
365 void emitCurrentToken() { | |
366 var token = currentToken; | |
367 // Add token to the queue to be yielded | |
368 if (token is TagToken) { | |
369 if (lowercaseElementName) { | |
370 token.name = asciiUpper2Lower(token.name); | |
371 } | |
372 if (token is EndTagToken) { | |
373 if (_attributes != null) { | |
374 _addToken(new ParseErrorToken("attributes-in-end-tag")); | |
375 } | |
376 if (token.selfClosing) { | |
377 _addToken(new ParseErrorToken("this-closing-flag-on-end-tag")); | |
378 } | |
379 } else if (token is StartTagToken) { | |
380 // HTML5 specific normalizations to the token stream. | |
381 // Convert the list into a map where first key wins. | |
382 token.data = new LinkedHashMap<Object, String>(); | |
383 if (_attributes != null) { | |
384 for (var attr in _attributes) { | |
385 token.data.putIfAbsent(attr.name, () => attr.value); | |
386 } | |
387 if (attributeSpans) token.attributeSpans = _attributes; | |
388 } | |
389 } | |
390 _attributes = null; | |
391 _attributeNames = null; | |
392 } | |
393 _addToken(token); | |
394 state = dataState; | |
395 } | |
396 | |
  // Below are the various tokenizer states worked out.

  /// The "data" state: ordinary character data between markup.
  bool dataState() {
    var data = stream.char();
    if (data == "&") {
      state = entityDataState;
    } else if (data == "<") {
      state = tagOpenState;
    } else if (data == "\u0000") {
      // NULL is a parse error but is emitted unchanged in this state.
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\u0000"));
    } else if (data == EOF) {
      // Tokenization ends.
      return false;
    } else if (isWhitespace(data)) {
      // Directly after emitting a token you switch back to the "data
      // state". At that point spaceCharacters are important so they are
      // emitted separately.
      _addToken(new SpaceCharactersToken(
          '${data}${stream.charsUntil(spaceCharacters, true)}'));
      // No need to update lastFourChars here, since the first space will
      // have already been appended to lastFourChars and will have broken
      // any <!-- or --> sequences
    } else {
      // Batch up a run of plain characters into one token.
      var chars = stream.charsUntil("&<\u0000");
      _addToken(new CharactersToken('${data}${chars}'));
    }
    return true;
  }

  /// The "character reference in data" state: a "&" was seen in data.
  bool entityDataState() {
    consumeEntity();
    state = dataState;
    return true;
  }

  /// The "RCDATA" state: character data where entities are recognized but
  /// tags (other than the matching end tag) are not.
  bool rcdataState() {
    var data = stream.char();
    if (data == "&") {
      state = characterReferenceInRcdata;
    } else if (data == "<") {
      state = rcdataLessThanSignState;
    } else if (data == EOF) {
      // Tokenization ends.
      return false;
    } else if (data == "\u0000") {
      // Unlike dataState, NULL is replaced with U+FFFD here.
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\uFFFD"));
    } else if (isWhitespace(data)) {
      // Directly after emitting a token you switch back to the "data
      // state". At that point spaceCharacters are important so they are
      // emitted separately.
      _addToken(new SpaceCharactersToken(
          '${data}${stream.charsUntil(spaceCharacters, true)}'));
    } else {
      var chars = stream.charsUntil("&<");
      _addToken(new CharactersToken('${data}${chars}'));
    }
    return true;
  }

  /// The "character reference in RCDATA" state.
  bool characterReferenceInRcdata() {
    consumeEntity();
    state = rcdataState;
    return true;
  }
463 | |
  /// The "RAWTEXT" state: no entities; only "<" can start markup.
  bool rawtextState() {
    var data = stream.char();
    if (data == "<") {
      state = rawtextLessThanSignState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\uFFFD"));
    } else if (data == EOF) {
      // Tokenization ends.
      return false;
    } else {
      var chars = stream.charsUntil("<\u0000");
      _addToken(new CharactersToken("${data}${chars}"));
    }
    return true;
  }

  /// The "script data" state: like RAWTEXT but with escape handling.
  bool scriptDataState() {
    var data = stream.char();
    if (data == "<") {
      state = scriptDataLessThanSignState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\uFFFD"));
    } else if (data == EOF) {
      // Tokenization ends.
      return false;
    } else {
      var chars = stream.charsUntil("<\u0000");
      _addToken(new CharactersToken("${data}${chars}"));
    }
    return true;
  }

  /// The "PLAINTEXT" state: everything up to EOF is character data.
  bool plaintextState() {
    var data = stream.char();
    if (data == EOF) {
      // Tokenization ends.
      return false;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\uFFFD"));
    } else {
      _addToken(new CharactersToken('${data}${stream.charsUntil("\u0000")}'));
    }
    return true;
  }
511 | |
  /// The "tag open" state: a "<" was seen in data.
  bool tagOpenState() {
    var data = stream.char();
    if (data == "!") {
      state = markupDeclarationOpenState;
    } else if (data == "/") {
      state = closeTagOpenState;
    } else if (isLetter(data)) {
      currentToken = new StartTagToken(data);
      state = tagNameState;
    } else if (data == ">") {
      // XXX In theory it could be something besides a tag name. But
      // do we really care?
      _addToken(new ParseErrorToken("expected-tag-name-but-got-right-bracket"));
      _addToken(new CharactersToken("<>"));
      state = dataState;
    } else if (data == "?") {
      // XXX In theory it could be something besides a tag name. But
      // do we really care?
      _addToken(new ParseErrorToken("expected-tag-name-but-got-question-mark"));
      stream.unget(data);
      state = bogusCommentState;
    } else {
      // XXX
      _addToken(new ParseErrorToken("expected-tag-name"));
      _addToken(new CharactersToken("<"));
      stream.unget(data);
      state = dataState;
    }
    return true;
  }

  /// The "end tag open" state: "</" was seen.
  bool closeTagOpenState() {
    var data = stream.char();
    if (isLetter(data)) {
      currentToken = new EndTagToken(data);
      state = tagNameState;
    } else if (data == ">") {
      _addToken(
          new ParseErrorToken("expected-closing-tag-but-got-right-bracket"));
      state = dataState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("expected-closing-tag-but-got-eof"));
      _addToken(new CharactersToken("</"));
      state = dataState;
    } else {
      // XXX data can be _'_...
      _addToken(new ParseErrorToken("expected-closing-tag-but-got-char",
          messageParams: {"data": data}));
      stream.unget(data);
      state = bogusCommentState;
    }
    return true;
  }

  /// The "tag name" state: accumulates the tag name onto [currentToken].
  bool tagNameState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      state = beforeAttributeNameState;
    } else if (data == ">") {
      emitCurrentToken();
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-tag-name"));
      state = dataState;
    } else if (data == "/") {
      state = selfClosingStartTagState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      currentTagToken.name = '${currentTagToken.name}\uFFFD';
    } else {
      currentTagToken.name = '${currentTagToken.name}$data';
      // (Don't use charsUntil here, because tag names are
      // very short and it's faster to not do anything fancy)
    }
    return true;
  }
587 | |
  /// The "RCDATA less-than sign" state: "<" was seen inside RCDATA.
  bool rcdataLessThanSignState() {
    var data = stream.char();
    if (data == "/") {
      _buffer.clear();
      state = rcdataEndTagOpenState;
    } else {
      // Not an end tag; re-emit the "<" as text.
      _addToken(new CharactersToken("<"));
      stream.unget(data);
      state = rcdataState;
    }
    return true;
  }

  /// The "RCDATA end tag open" state: "</" was seen inside RCDATA.
  bool rcdataEndTagOpenState() {
    var data = stream.char();
    if (isLetter(data)) {
      _buffer.write(data);
      state = rcdataEndTagNameState;
    } else {
      _addToken(new CharactersToken("</"));
      stream.unget(data);
      state = rcdataState;
    }
    return true;
  }

  // Whether the pending end tag name in [_buffer] matches the current tag
  // token's name (the "appropriate end tag token" check).
  bool _tokenIsAppropriate() {
    // TODO(jmesserly): this should use case insensitive compare instead.
    return currentToken is TagToken &&
        currentTagToken.name.toLowerCase() == '$_buffer'.toLowerCase();
  }

  /// The "RCDATA end tag name" state: accumulates a possible end tag name
  /// and emits it only if it matches the open element; otherwise the text
  /// is re-emitted as characters.
  bool rcdataEndTagNameState() {
    var appropriate = _tokenIsAppropriate();
    var data = stream.char();
    if (isWhitespace(data) && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      state = beforeAttributeNameState;
    } else if (data == "/" && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      state = selfClosingStartTagState;
    } else if (data == ">" && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      emitCurrentToken();
      state = dataState;
    } else if (isLetter(data)) {
      _buffer.write(data);
    } else {
      _addToken(new CharactersToken("</$_buffer"));
      stream.unget(data);
      state = rcdataState;
    }
    return true;
  }
642 | |
  /// The "RAWTEXT less-than sign" state.
  bool rawtextLessThanSignState() {
    var data = stream.char();
    if (data == "/") {
      _buffer.clear();
      state = rawtextEndTagOpenState;
    } else {
      _addToken(new CharactersToken("<"));
      stream.unget(data);
      state = rawtextState;
    }
    return true;
  }

  /// The "RAWTEXT end tag open" state.
  bool rawtextEndTagOpenState() {
    var data = stream.char();
    if (isLetter(data)) {
      _buffer.write(data);
      state = rawtextEndTagNameState;
    } else {
      _addToken(new CharactersToken("</"));
      stream.unget(data);
      state = rawtextState;
    }
    return true;
  }

  /// The "RAWTEXT end tag name" state; same shape as
  /// [rcdataEndTagNameState] but returns to [rawtextState] on mismatch.
  bool rawtextEndTagNameState() {
    var appropriate = _tokenIsAppropriate();
    var data = stream.char();
    if (isWhitespace(data) && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      state = beforeAttributeNameState;
    } else if (data == "/" && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      state = selfClosingStartTagState;
    } else if (data == ">" && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      emitCurrentToken();
      state = dataState;
    } else if (isLetter(data)) {
      _buffer.write(data);
    } else {
      _addToken(new CharactersToken("</$_buffer"));
      stream.unget(data);
      state = rawtextState;
    }
    return true;
  }
691 | |
  /// The "script data less-than sign" state: "<" seen in script data; "!"
  /// begins the escaped variant ("<!--" handling).
  bool scriptDataLessThanSignState() {
    var data = stream.char();
    if (data == "/") {
      _buffer.clear();
      state = scriptDataEndTagOpenState;
    } else if (data == "!") {
      _addToken(new CharactersToken("<!"));
      state = scriptDataEscapeStartState;
    } else {
      _addToken(new CharactersToken("<"));
      stream.unget(data);
      state = scriptDataState;
    }
    return true;
  }

  /// The "script data end tag open" state.
  bool scriptDataEndTagOpenState() {
    var data = stream.char();
    if (isLetter(data)) {
      _buffer.write(data);
      state = scriptDataEndTagNameState;
    } else {
      _addToken(new CharactersToken("</"));
      stream.unget(data);
      state = scriptDataState;
    }
    return true;
  }

  /// The "script data end tag name" state; same shape as
  /// [rcdataEndTagNameState] but returns to [scriptDataState] on mismatch.
  bool scriptDataEndTagNameState() {
    var appropriate = _tokenIsAppropriate();
    var data = stream.char();
    if (isWhitespace(data) && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      state = beforeAttributeNameState;
    } else if (data == "/" && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      state = selfClosingStartTagState;
    } else if (data == ">" && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      emitCurrentToken();
      state = dataState;
    } else if (isLetter(data)) {
      _buffer.write(data);
    } else {
      _addToken(new CharactersToken("</$_buffer"));
      stream.unget(data);
      state = scriptDataState;
    }
    return true;
  }
743 | |
  /// The "script data escape start" state: after "<!", looking for "-".
  bool scriptDataEscapeStartState() {
    var data = stream.char();
    if (data == "-") {
      _addToken(new CharactersToken("-"));
      state = scriptDataEscapeStartDashState;
    } else {
      stream.unget(data);
      state = scriptDataState;
    }
    return true;
  }

  /// The "script data escape start dash" state: after "<!-", looking for
  /// the second "-" that completes "<!--".
  bool scriptDataEscapeStartDashState() {
    var data = stream.char();
    if (data == "-") {
      _addToken(new CharactersToken("-"));
      state = scriptDataEscapedDashDashState;
    } else {
      stream.unget(data);
      state = scriptDataState;
    }
    return true;
  }

  /// The "script data escaped" state: inside a "<!-- ... -->" section of
  /// script data.
  bool scriptDataEscapedState() {
    var data = stream.char();
    if (data == "-") {
      _addToken(new CharactersToken("-"));
      state = scriptDataEscapedDashState;
    } else if (data == "<") {
      state = scriptDataEscapedLessThanSignState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\uFFFD"));
    } else if (data == EOF) {
      state = dataState;
    } else {
      var chars = stream.charsUntil("<-\u0000");
      _addToken(new CharactersToken("${data}${chars}"));
    }
    return true;
  }

  /// The "script data escaped dash" state: one "-" seen while escaped.
  bool scriptDataEscapedDashState() {
    var data = stream.char();
    if (data == "-") {
      _addToken(new CharactersToken("-"));
      state = scriptDataEscapedDashDashState;
    } else if (data == "<") {
      state = scriptDataEscapedLessThanSignState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\uFFFD"));
      state = scriptDataEscapedState;
    } else if (data == EOF) {
      state = dataState;
    } else {
      _addToken(new CharactersToken(data));
      state = scriptDataEscapedState;
    }
    return true;
  }

  /// The "script data escaped dash dash" state: "--" seen; ">" ends the
  /// escaped section and returns to plain script data.
  bool scriptDataEscapedDashDashState() {
    var data = stream.char();
    if (data == "-") {
      _addToken(new CharactersToken("-"));
    } else if (data == "<") {
      state = scriptDataEscapedLessThanSignState;
    } else if (data == ">") {
      _addToken(new CharactersToken(">"));
      state = scriptDataState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\uFFFD"));
      state = scriptDataEscapedState;
    } else if (data == EOF) {
      state = dataState;
    } else {
      _addToken(new CharactersToken(data));
      state = scriptDataEscapedState;
    }
    return true;
  }
828 | |
  /// The "script data escaped less-than sign" state: "<" seen while
  /// escaped; a letter may begin a nested "<script" (double escape).
  bool scriptDataEscapedLessThanSignState() {
    var data = stream.char();
    if (data == "/") {
      _buffer.clear();
      state = scriptDataEscapedEndTagOpenState;
    } else if (isLetter(data)) {
      _addToken(new CharactersToken("<$data"));
      _buffer.clear();
      _buffer.write(data);
      state = scriptDataDoubleEscapeStartState;
    } else {
      _addToken(new CharactersToken("<"));
      stream.unget(data);
      state = scriptDataEscapedState;
    }
    return true;
  }

  /// The "script data escaped end tag open" state.
  bool scriptDataEscapedEndTagOpenState() {
    var data = stream.char();
    if (isLetter(data)) {
      _buffer.clear();
      _buffer.write(data);
      state = scriptDataEscapedEndTagNameState;
    } else {
      _addToken(new CharactersToken("</"));
      stream.unget(data);
      state = scriptDataEscapedState;
    }
    return true;
  }

  /// The "script data escaped end tag name" state; same shape as
  /// [rcdataEndTagNameState] but returns to [scriptDataEscapedState].
  bool scriptDataEscapedEndTagNameState() {
    var appropriate = _tokenIsAppropriate();
    var data = stream.char();
    if (isWhitespace(data) && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      state = beforeAttributeNameState;
    } else if (data == "/" && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      state = selfClosingStartTagState;
    } else if (data == ">" && appropriate) {
      currentToken = new EndTagToken('$_buffer');
      emitCurrentToken();
      state = dataState;
    } else if (isLetter(data)) {
      _buffer.write(data);
    } else {
      _addToken(new CharactersToken("</$_buffer"));
      stream.unget(data);
      state = scriptDataEscapedState;
    }
    return true;
  }
883 | |
  /// The "script data double escape start" state: decides whether the tag
  /// name accumulated in [_buffer] is "script", entering double escape.
  bool scriptDataDoubleEscapeStartState() {
    var data = stream.char();
    if (isWhitespace(data) || data == "/" || data == ">") {
      _addToken(new CharactersToken(data));
      if ('$_buffer'.toLowerCase() == "script") {
        state = scriptDataDoubleEscapedState;
      } else {
        state = scriptDataEscapedState;
      }
    } else if (isLetter(data)) {
      _addToken(new CharactersToken(data));
      _buffer.write(data);
    } else {
      stream.unget(data);
      state = scriptDataEscapedState;
    }
    return true;
  }

  /// The "script data double escaped" state: inside a nested <script>
  /// within an escaped section; everything is emitted as characters.
  bool scriptDataDoubleEscapedState() {
    var data = stream.char();
    if (data == "-") {
      _addToken(new CharactersToken("-"));
      state = scriptDataDoubleEscapedDashState;
    } else if (data == "<") {
      _addToken(new CharactersToken("<"));
      state = scriptDataDoubleEscapedLessThanSignState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\uFFFD"));
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-script-in-script"));
      state = dataState;
    } else {
      _addToken(new CharactersToken(data));
    }
    return true;
  }

  /// The "script data double escaped dash" state.
  bool scriptDataDoubleEscapedDashState() {
    var data = stream.char();
    if (data == "-") {
      _addToken(new CharactersToken("-"));
      state = scriptDataDoubleEscapedDashDashState;
    } else if (data == "<") {
      _addToken(new CharactersToken("<"));
      state = scriptDataDoubleEscapedLessThanSignState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addToken(new CharactersToken("\uFFFD"));
      state = scriptDataDoubleEscapedState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-script-in-script"));
      state = dataState;
    } else {
      _addToken(new CharactersToken(data));
      state = scriptDataDoubleEscapedState;
    }
    return true;
  }
944 | |
945 // TODO(jmesserly): report bug in original code | |
946 // (was "Dash" instead of "DashDash") | |
947 bool scriptDataDoubleEscapedDashDashState() { | |
948 var data = stream.char(); | |
949 if (data == "-") { | |
950 _addToken(new CharactersToken("-")); | |
951 } else if (data == "<") { | |
952 _addToken(new CharactersToken("<")); | |
953 state = scriptDataDoubleEscapedLessThanSignState; | |
954 } else if (data == ">") { | |
955 _addToken(new CharactersToken(">")); | |
956 state = scriptDataState; | |
957 } else if (data == "\u0000") { | |
958 _addToken(new ParseErrorToken("invalid-codepoint")); | |
959 _addToken(new CharactersToken("\uFFFD")); | |
960 state = scriptDataDoubleEscapedState; | |
961 } else if (data == EOF) { | |
962 _addToken(new ParseErrorToken("eof-in-script-in-script")); | |
963 state = dataState; | |
964 } else { | |
965 _addToken(new CharactersToken(data)); | |
966 state = scriptDataDoubleEscapedState; | |
967 } | |
968 return true; | |
969 } | |
970 | |
971 bool scriptDataDoubleEscapedLessThanSignState() { | |
972 var data = stream.char(); | |
973 if (data == "/") { | |
974 _addToken(new CharactersToken("/")); | |
975 _buffer.clear(); | |
976 state = scriptDataDoubleEscapeEndState; | |
977 } else { | |
978 stream.unget(data); | |
979 state = scriptDataDoubleEscapedState; | |
980 } | |
981 return true; | |
982 } | |
983 | |
984 bool scriptDataDoubleEscapeEndState() { | |
985 var data = stream.char(); | |
986 if (isWhitespace(data) || data == "/" || data == ">") { | |
987 _addToken(new CharactersToken(data)); | |
988 if ('$_buffer'.toLowerCase() == "script") { | |
989 state = scriptDataEscapedState; | |
990 } else { | |
991 state = scriptDataDoubleEscapedState; | |
992 } | |
993 } else if (isLetter(data)) { | |
994 _addToken(new CharactersToken(data)); | |
995 _buffer.write(data); | |
996 } else { | |
997 stream.unget(data); | |
998 state = scriptDataDoubleEscapedState; | |
999 } | |
1000 return true; | |
1001 } | |
1002 | |
  /// "Before attribute name" state: skips whitespace, opens a new attribute
  /// on a letter (or most other characters), or finishes the tag on ">"/"/".
  bool beforeAttributeNameState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      // Consume the whole whitespace run in one call.
      stream.charsUntil(spaceCharacters, true);
    } else if (isLetter(data)) {
      _addAttribute(data);
      state = attributeNameState;
    } else if (data == ">") {
      emitCurrentToken();
    } else if (data == "/") {
      state = selfClosingStartTagState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));
      state = dataState;
    } else if ("'\"=<".contains(data)) {
      // Parse error, but the character still begins an attribute name.
      // NOTE: this check runs after the EOF check, so `data` is non-null here.
      _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
      _addAttribute(data);
      state = attributeNameState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addAttribute("\uFFFD");
      state = attributeNameState;
    } else {
      _addAttribute(data);
      state = attributeNameState;
    }
    return true;
  }
1031 | |
  /// "Attribute name" state: accumulates the current attribute's name in
  /// [_attributeName].
  ///
  /// On leaving the state (`leavingThisState`), the buffered name is
  /// finalized: optionally lowercased, stored on the last entry of
  /// [_attributes], and checked against [_attributeNames] to report
  /// duplicate attributes. `emitToken` defers the token emission for ">"
  /// until after that finalization has run.
  bool attributeNameState() {
    var data = stream.char();
    bool leavingThisState = true;
    bool emitToken = false;
    if (data == "=") {
      state = beforeAttributeValueState;
    } else if (isLetter(data)) {
      _attributeName.write(data);
      // Batch-consume the following run of ASCII letters.
      _attributeName.write(stream.charsUntil(asciiLetters, true));
      leavingThisState = false;
    } else if (data == ">") {
      // XXX If we emit here the attributes are converted to a dict
      // without being checked and when the code below runs we error
      // because data is a dict not a list
      emitToken = true;
    } else if (isWhitespace(data)) {
      state = afterAttributeNameState;
    } else if (data == "/") {
      state = selfClosingStartTagState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _attributeName.write('\uFFFD');
      leavingThisState = false;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-attribute-name"));
      state = dataState;
    } else if ("'\"<".contains(data)) {
      _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
      _attributeName.write(data);
      leavingThisState = false;
    } else {
      _attributeName.write(data);
      leavingThisState = false;
    }

    if (leavingThisState) {
      _markAttributeNameEnd(-1);

      // Attributes are not dropped at this stage. That happens when the
      // start tag token is emitted so values can still be safely appended
      // to attributes, but we do want to report the parse error in time.
      var attrName = _attributeName.toString();
      if (lowercaseAttrName) {
        attrName = asciiUpper2Lower(attrName);
      }
      _attributes.last.name = attrName;
      if (_attributeNames == null) _attributeNames = new Set();
      if (_attributeNames.contains(attrName)) {
        _addToken(new ParseErrorToken("duplicate-attribute"));
      }
      _attributeNames.add(attrName);

      // XXX Fix for above XXX
      if (emitToken) {
        emitCurrentToken();
      }
    }
    return true;
  }
1091 | |
1092 bool afterAttributeNameState() { | |
1093 var data = stream.char(); | |
1094 if (isWhitespace(data)) { | |
1095 stream.charsUntil(spaceCharacters, true); | |
1096 } else if (data == "=") { | |
1097 state = beforeAttributeValueState; | |
1098 } else if (data == ">") { | |
1099 emitCurrentToken(); | |
1100 } else if (isLetter(data)) { | |
1101 _addAttribute(data); | |
1102 state = attributeNameState; | |
1103 } else if (data == "/") { | |
1104 state = selfClosingStartTagState; | |
1105 } else if (data == "\u0000") { | |
1106 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1107 _addAttribute("\uFFFD"); | |
1108 state = attributeNameState; | |
1109 } else if (data == EOF) { | |
1110 _addToken(new ParseErrorToken("expected-end-of-tag-but-got-eof")); | |
1111 state = dataState; | |
1112 } else if ("'\"<".contains(data)) { | |
1113 _addToken(new ParseErrorToken("invalid-character-after-attribute-name")); | |
1114 _addAttribute(data); | |
1115 state = attributeNameState; | |
1116 } else { | |
1117 _addAttribute(data); | |
1118 state = attributeNameState; | |
1119 } | |
1120 return true; | |
1121 } | |
1122 | |
  /// "Before attribute value" state: dispatches on the quoting style (double,
  /// single, or unquoted) and records where the value's source span starts.
  bool beforeAttributeValueState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      stream.charsUntil(spaceCharacters, true);
    } else if (data == "\"") {
      _markAttributeValueStart(0);
      state = attributeValueDoubleQuotedState;
    } else if (data == "&") {
      // An entity begins an unquoted value; reprocess it in that state.
      state = attributeValueUnQuotedState;
      stream.unget(data);
      _markAttributeValueStart(0);
    } else if (data == "'") {
      _markAttributeValueStart(0);
      state = attributeValueSingleQuotedState;
    } else if (data == ">") {
      _addToken(new ParseErrorToken(
          "expected-attribute-value-but-got-right-bracket"));
      emitCurrentToken();
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      // -1: the consumed character itself is part of the value span.
      _markAttributeValueStart(-1);
      _attributeValue.write('\uFFFD');
      state = attributeValueUnQuotedState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("expected-attribute-value-but-got-eof"));
      state = dataState;
    } else if ("=<`".contains(data)) {
      // Parse error, but the character still starts an unquoted value.
      _addToken(new ParseErrorToken("equals-in-unquoted-attribute-value"));
      _markAttributeValueStart(-1);
      _attributeValue.write(data);
      state = attributeValueUnQuotedState;
    } else {
      _markAttributeValueStart(-1);
      _attributeValue.write(data);
      state = attributeValueUnQuotedState;
    }
    return true;
  }
1161 | |
  /// Accumulates a double-quoted attribute value until the closing '"'.
  bool attributeValueDoubleQuotedState() {
    var data = stream.char();
    if (data == "\"") {
      _markAttributeValueEnd(-1);
      _markAttributeEnd(0);
      state = afterAttributeValueState;
    } else if (data == "&") {
      // '"' is the additional allowed character for entity parsing here.
      processEntityInAttribute('"');
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _attributeValue.write('\uFFFD');
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-attribute-value-double-quote"));
      _markAttributeValueEnd(-1);
      state = dataState;
    } else {
      _attributeValue.write(data);
      // Batch-consume until the next character of interest.
      _attributeValue.write(stream.charsUntil("\"&"));
    }
    return true;
  }
1183 | |
  /// Accumulates a single-quoted attribute value until the closing "'".
  /// Mirror image of [attributeValueDoubleQuotedState].
  bool attributeValueSingleQuotedState() {
    var data = stream.char();
    if (data == "'") {
      _markAttributeValueEnd(-1);
      _markAttributeEnd(0);
      state = afterAttributeValueState;
    } else if (data == "&") {
      // "'" is the additional allowed character for entity parsing here.
      processEntityInAttribute("'");
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _attributeValue.write('\uFFFD');
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-attribute-value-single-quote"));
      _markAttributeValueEnd(-1);
      state = dataState;
    } else {
      _attributeValue.write(data);
      // Batch-consume until the next character of interest.
      _attributeValue.write(stream.charsUntil("\'&"));
    }
    return true;
  }
1205 | |
  /// Accumulates an unquoted attribute value, terminated by whitespace or
  /// the end of the tag.
  bool attributeValueUnQuotedState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      _markAttributeValueEnd(-1);
      state = beforeAttributeNameState;
    } else if (data == "&") {
      // ">" is the additional allowed character for entity parsing here.
      processEntityInAttribute(">");
    } else if (data == ">") {
      _markAttributeValueEnd(-1);
      emitCurrentToken();
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-attribute-value-no-quotes"));
      _markAttributeValueEnd(-1);
      state = dataState;
    } else if ('"\'=<`'.contains(data)) {
      // Parse error, but the character still joins the value.
      _addToken(new ParseErrorToken(
          "unexpected-character-in-unquoted-attribute-value"));
      _attributeValue.write(data);
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _attributeValue.write('\uFFFD');
    } else {
      _attributeValue.write(data);
      // Batch-consume until any character this state treats specially.
      _attributeValue.write(stream.charsUntil("&>\"\'=<`$spaceCharacters"));
    }
    return true;
  }
1233 | |
1234 bool afterAttributeValueState() { | |
1235 var data = stream.char(); | |
1236 if (isWhitespace(data)) { | |
1237 state = beforeAttributeNameState; | |
1238 } else if (data == ">") { | |
1239 emitCurrentToken(); | |
1240 } else if (data == "/") { | |
1241 state = selfClosingStartTagState; | |
1242 } else if (data == EOF) { | |
1243 _addToken(new ParseErrorToken("unexpected-EOF-after-attribute-value")); | |
1244 stream.unget(data); | |
1245 state = dataState; | |
1246 } else { | |
1247 _addToken( | |
1248 new ParseErrorToken("unexpected-character-after-attribute-value")); | |
1249 stream.unget(data); | |
1250 state = beforeAttributeNameState; | |
1251 } | |
1252 return true; | |
1253 } | |
1254 | |
  /// State after "/" inside a tag; ">" completes a self-closing tag.
  bool selfClosingStartTagState() {
    var data = stream.char();
    if (data == ">") {
      currentTagToken.selfClosing = true;
      emitCurrentToken();
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("unexpected-EOF-after-solidus-in-tag"));
      stream.unget(data);
      state = dataState;
    } else {
      // NOTE(review): "soldius" looks like a typo of "solidus" -- verify the
      // key against the message table in constants.dart before changing it.
      _addToken(
          new ParseErrorToken("unexpected-character-after-soldius-in-tag"));
      stream.unget(data);
      state = beforeAttributeNameState;
    }
    return true;
  }
1272 | |
1273 bool bogusCommentState() { | |
1274 // Make a new comment token and give it as value all the characters | |
1275 // until the first > or EOF (charsUntil checks for EOF automatically) | |
1276 // and emit it. | |
1277 var data = stream.charsUntil(">"); | |
1278 data = data.replaceAll("\u0000", "\uFFFD"); | |
1279 _addToken(new CommentToken(data)); | |
1280 | |
1281 // Eat the character directly after the bogus comment which is either a | |
1282 // ">" or an EOF. | |
1283 stream.char(); | |
1284 state = dataState; | |
1285 return true; | |
1286 } | |
1287 | |
  /// State after "<!": looks ahead for "--" (comment), "DOCTYPE", or
  /// "[CDATA[" (only honored inside foreign content).
  ///
  /// Characters consumed during a failed lookahead are pushed back via
  /// [charStack] so the bogus-comment fallback sees them again.
  bool markupDeclarationOpenState() {
    var charStack = [stream.char()];
    if (charStack.last == "-") {
      charStack.add(stream.char());
      if (charStack.last == "-") {
        currentToken = new CommentToken();
        state = commentStartState;
        return true;
      }
    } else if (charStack.last == 'd' || charStack.last == 'D') {
      // Case-insensitive match of the remaining letters of "DOCTYPE".
      var matched = true;
      for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) {
        var char = stream.char();
        charStack.add(char);
        if (char == EOF || !expected.contains(char)) {
          matched = false;
          break;
        }
      }
      if (matched) {
        currentToken = new DoctypeToken(correct: true);
        state = doctypeState;
        return true;
      }
    } else if (charStack.last == "[" &&
        parser != null &&
        parser.tree.openElements.length > 0 &&
        parser.tree.openElements.last.namespaceUri !=
            parser.tree.defaultNamespace) {
      // "[CDATA[" is only recognized in foreign (SVG/MathML) content; this
      // is why the tokenizer needs a reference back to the parser.
      var matched = true;
      for (var expected in const ["C", "D", "A", "T", "A", "["]) {
        charStack.add(stream.char());
        if (charStack.last != expected) {
          matched = false;
          break;
        }
      }
      if (matched) {
        state = cdataSectionState;
        return true;
      }
    }

    _addToken(new ParseErrorToken("expected-dashes-or-doctype"));

    // Push back everything consumed so bogusCommentState re-reads it.
    while (charStack.length > 0) {
      stream.unget(charStack.removeLast());
    }
    state = bogusCommentState;
    return true;
  }
1339 | |
  /// State just after "<!--"; handles abrupt "-->"/EOF endings specially.
  bool commentStartState() {
    var data = stream.char();
    if (data == "-") {
      state = commentStartDashState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      currentStringToken.add('\uFFFD');
    } else if (data == ">") {
      // "<!-->" -- an empty, incorrect comment.
      _addToken(new ParseErrorToken("incorrect-comment"));
      _addToken(currentToken);
      state = dataState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-comment"));
      _addToken(currentToken);
      state = dataState;
    } else {
      currentStringToken.add(data);
      state = commentState;
    }
    return true;
  }
1361 | |
  /// State after "<!---" (a single dash into the comment body).
  bool commentStartDashState() {
    var data = stream.char();
    if (data == "-") {
      state = commentEndState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      // The pending dash joins the comment text along with the replacement.
      currentStringToken.add('-\uFFFD');
    } else if (data == ">") {
      // "<!--->" -- an incorrect comment.
      _addToken(new ParseErrorToken("incorrect-comment"));
      _addToken(currentToken);
      state = dataState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-comment"));
      _addToken(currentToken);
      state = dataState;
    } else {
      // Flush the pending dash followed by the current character.
      currentStringToken.add('-').add(data);
      state = commentState;
    }
    return true;
  }
1383 | |
  /// Main comment-body state: accumulates text until a "-" that may start
  /// the "-->" terminator.
  bool commentState() {
    var data = stream.char();
    if (data == "-") {
      state = commentEndDashState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      currentStringToken.add('\uFFFD');
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-comment"));
      _addToken(currentToken);
      state = dataState;
    } else {
      // Batch-consume ordinary comment text up to the next "-" or NUL.
      currentStringToken.add(data).add(stream.charsUntil("-\u0000"));
    }
    return true;
  }
1400 | |
  /// State after a single "-" in a comment body; a second "-" may end it.
  bool commentEndDashState() {
    var data = stream.char();
    if (data == "-") {
      state = commentEndState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      // The pending dash joins the comment text along with the replacement.
      currentStringToken.add('-\uFFFD');
      state = commentState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-comment-end-dash"));
      _addToken(currentToken);
      state = dataState;
    } else {
      // Flush the pending dash followed by the current character.
      currentStringToken.add('-').add(data);
      state = commentState;
    }
    return true;
  }
1419 | |
  /// State after "--" in a comment; ">" finishes it, anything else is a
  /// parse error whose characters rejoin the comment text.
  bool commentEndState() {
    var data = stream.char();
    if (data == ">") {
      _addToken(currentToken);
      state = dataState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      // Both pending dashes join the text along with the replacement.
      currentStringToken.add('--\uFFFD');
      state = commentState;
    } else if (data == "!") {
      _addToken(
          new ParseErrorToken("unexpected-bang-after-double-dash-in-comment"));
      state = commentEndBangState;
    } else if (data == "-") {
      // Extra dash: report it but stay here awaiting ">".
      _addToken(
          new ParseErrorToken("unexpected-dash-after-double-dash-in-comment"));
      currentStringToken.add(data);
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-comment-double-dash"));
      _addToken(currentToken);
      state = dataState;
    } else {
      // XXX
      _addToken(new ParseErrorToken("unexpected-char-in-comment"));
      currentStringToken.add('--').add(data);
      state = commentState;
    }
    return true;
  }
1449 | |
  /// State after "--!" in a comment; ">" still closes it, otherwise the
  /// literal "--!" rejoins the comment text.
  bool commentEndBangState() {
    var data = stream.char();
    if (data == ">") {
      _addToken(currentToken);
      state = dataState;
    } else if (data == "-") {
      currentStringToken.add('--!');
      state = commentEndDashState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      currentStringToken.add('--!\uFFFD');
      state = commentState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-comment-end-bang-state"));
      _addToken(currentToken);
      state = dataState;
    } else {
      currentStringToken.add('--!').add(data);
      state = commentState;
    }
    return true;
  }
1472 | |
1473 bool doctypeState() { | |
1474 var data = stream.char(); | |
1475 if (isWhitespace(data)) { | |
1476 state = beforeDoctypeNameState; | |
1477 } else if (data == EOF) { | |
1478 _addToken(new ParseErrorToken("expected-doctype-name-but-got-eof")); | |
1479 currentDoctypeToken.correct = false; | |
1480 _addToken(currentToken); | |
1481 state = dataState; | |
1482 } else { | |
1483 _addToken(new ParseErrorToken("need-space-after-doctype")); | |
1484 stream.unget(data); | |
1485 state = beforeDoctypeNameState; | |
1486 } | |
1487 return true; | |
1488 } | |
1489 | |
1490 bool beforeDoctypeNameState() { | |
1491 var data = stream.char(); | |
1492 if (isWhitespace(data)) { | |
1493 return true; | |
1494 } else if (data == ">") { | |
1495 _addToken( | |
1496 new ParseErrorToken("expected-doctype-name-but-got-right-bracket")); | |
1497 currentDoctypeToken.correct = false; | |
1498 _addToken(currentToken); | |
1499 state = dataState; | |
1500 } else if (data == "\u0000") { | |
1501 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1502 currentDoctypeToken.name = "\uFFFD"; | |
1503 state = doctypeNameState; | |
1504 } else if (data == EOF) { | |
1505 _addToken(new ParseErrorToken("expected-doctype-name-but-got-eof")); | |
1506 currentDoctypeToken.correct = false; | |
1507 _addToken(currentToken); | |
1508 state = dataState; | |
1509 } else { | |
1510 currentDoctypeToken.name = data; | |
1511 state = doctypeNameState; | |
1512 } | |
1513 return true; | |
1514 } | |
1515 | |
  /// Accumulates the doctype name; the finished name is ASCII-lowercased on
  /// every exit path.
  bool doctypeNameState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
      state = afterDoctypeNameState;
    } else if (data == ">") {
      currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
      _addToken(currentToken);
      state = dataState;
    } else if (data == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      currentDoctypeToken.name = "${currentDoctypeToken.name}\uFFFD";
      state = doctypeNameState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-doctype-name"));
      currentDoctypeToken.correct = false;
      currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
      _addToken(currentToken);
      state = dataState;
    } else {
      currentDoctypeToken.name = '${currentDoctypeToken.name}$data';
    }
    return true;
  }
1540 | |
  /// State after the doctype name: looks ahead case-insensitively for the
  /// "PUBLIC" or "SYSTEM" keyword; anything else makes the doctype bogus.
  ///
  /// Note: the lookahead loops reassign `data`, so after a failed match
  /// `data` holds the first non-matching character.
  bool afterDoctypeNameState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      return true;
    } else if (data == ">") {
      _addToken(currentToken);
      state = dataState;
    } else if (data == EOF) {
      currentDoctypeToken.correct = false;
      stream.unget(data);
      _addToken(new ParseErrorToken("eof-in-doctype"));
      _addToken(currentToken);
      state = dataState;
    } else {
      if (data == "p" || data == "P") {
        // TODO(jmesserly): would be nice to have a helper for this.
        var matched = true;
        for (var expected in const ["uU", "bB", "lL", "iI", "cC"]) {
          data = stream.char();
          if (data == EOF || !expected.contains(data)) {
            matched = false;
            break;
          }
        }
        if (matched) {
          state = afterDoctypePublicKeywordState;
          return true;
        }
      } else if (data == "s" || data == "S") {
        var matched = true;
        for (var expected in const ["yY", "sS", "tT", "eE", "mM"]) {
          data = stream.char();
          if (data == EOF || !expected.contains(data)) {
            matched = false;
            break;
          }
        }
        if (matched) {
          state = afterDoctypeSystemKeywordState;
          return true;
        }
      }

      // All the characters read before the current 'data' will be
      // [a-zA-Z], so they're garbage in the bogus doctype and can be
      // discarded; only the latest character might be '>' or EOF
      // and needs to be ungetted
      stream.unget(data);
      _addToken(new ParseErrorToken(
          "expected-space-or-right-bracket-in-doctype",
          messageParams: {"data": data}));
      currentDoctypeToken.correct = false;
      state = bogusDoctypeState;
    }
    return true;
  }
1597 | |
1598 bool afterDoctypePublicKeywordState() { | |
1599 var data = stream.char(); | |
1600 if (isWhitespace(data)) { | |
1601 state = beforeDoctypePublicIdentifierState; | |
1602 } else if (data == "'" || data == '"') { | |
1603 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1604 stream.unget(data); | |
1605 state = beforeDoctypePublicIdentifierState; | |
1606 } else if (data == EOF) { | |
1607 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1608 currentDoctypeToken.correct = false; | |
1609 _addToken(currentToken); | |
1610 state = dataState; | |
1611 } else { | |
1612 stream.unget(data); | |
1613 state = beforeDoctypePublicIdentifierState; | |
1614 } | |
1615 return true; | |
1616 } | |
1617 | |
  /// Skips whitespace before the public identifier; a quote opens it, and
  /// anything else makes the doctype bogus.
  bool beforeDoctypePublicIdentifierState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      return true;
    } else if (data == "\"") {
      currentDoctypeToken.publicId = "";
      state = doctypePublicIdentifierDoubleQuotedState;
    } else if (data == "'") {
      currentDoctypeToken.publicId = "";
      state = doctypePublicIdentifierSingleQuotedState;
    } else if (data == ">") {
      _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
      currentDoctypeToken.correct = false;
      _addToken(currentToken);
      state = dataState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-doctype"));
      currentDoctypeToken.correct = false;
      _addToken(currentToken);
      state = dataState;
    } else {
      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
      currentDoctypeToken.correct = false;
      state = bogusDoctypeState;
    }
    return true;
  }
1645 | |
1646 bool doctypePublicIdentifierDoubleQuotedState() { | |
1647 var data = stream.char(); | |
1648 if (data == '"') { | |
1649 state = afterDoctypePublicIdentifierState; | |
1650 } else if (data == "\u0000") { | |
1651 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1652 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD"; | |
1653 } else if (data == ">") { | |
1654 _addToken(new ParseErrorToken("unexpected-end-of-doctype")); | |
1655 currentDoctypeToken.correct = false; | |
1656 _addToken(currentToken); | |
1657 state = dataState; | |
1658 } else if (data == EOF) { | |
1659 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1660 currentDoctypeToken.correct = false; | |
1661 _addToken(currentToken); | |
1662 state = dataState; | |
1663 } else { | |
1664 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data'; | |
1665 } | |
1666 return true; | |
1667 } | |
1668 | |
1669 bool doctypePublicIdentifierSingleQuotedState() { | |
1670 var data = stream.char(); | |
1671 if (data == "'") { | |
1672 state = afterDoctypePublicIdentifierState; | |
1673 } else if (data == "\u0000") { | |
1674 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1675 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD"; | |
1676 } else if (data == ">") { | |
1677 _addToken(new ParseErrorToken("unexpected-end-of-doctype")); | |
1678 currentDoctypeToken.correct = false; | |
1679 _addToken(currentToken); | |
1680 state = dataState; | |
1681 } else if (data == EOF) { | |
1682 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1683 currentDoctypeToken.correct = false; | |
1684 _addToken(currentToken); | |
1685 state = dataState; | |
1686 } else { | |
1687 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data'; | |
1688 } | |
1689 return true; | |
1690 } | |
1691 | |
  /// State after the public identifier closes; a system identifier may
  /// follow (a quote here without whitespace is an error but still accepted).
  bool afterDoctypePublicIdentifierState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      state = betweenDoctypePublicAndSystemIdentifiersState;
    } else if (data == ">") {
      _addToken(currentToken);
      state = dataState;
    } else if (data == '"') {
      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
      currentDoctypeToken.systemId = "";
      state = doctypeSystemIdentifierDoubleQuotedState;
    } else if (data == "'") {
      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
      currentDoctypeToken.systemId = "";
      state = doctypeSystemIdentifierSingleQuotedState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-doctype"));
      currentDoctypeToken.correct = false;
      _addToken(currentToken);
      state = dataState;
    } else {
      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
      currentDoctypeToken.correct = false;
      state = bogusDoctypeState;
    }
    return true;
  }
1719 | |
  /// Whitespace gap between the public and (optional) system identifiers.
  bool betweenDoctypePublicAndSystemIdentifiersState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      return true;
    } else if (data == ">") {
      _addToken(currentToken);
      state = dataState;
    } else if (data == '"') {
      currentDoctypeToken.systemId = "";
      state = doctypeSystemIdentifierDoubleQuotedState;
    } else if (data == "'") {
      currentDoctypeToken.systemId = "";
      state = doctypeSystemIdentifierSingleQuotedState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-doctype"));
      currentDoctypeToken.correct = false;
      _addToken(currentToken);
      state = dataState;
    } else {
      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
      currentDoctypeToken.correct = false;
      state = bogusDoctypeState;
    }
    return true;
  }
1745 | |
1746 bool afterDoctypeSystemKeywordState() { | |
1747 var data = stream.char(); | |
1748 if (isWhitespace(data)) { | |
1749 state = beforeDoctypeSystemIdentifierState; | |
1750 } else if (data == "'" || data == '"') { | |
1751 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1752 stream.unget(data); | |
1753 state = beforeDoctypeSystemIdentifierState; | |
1754 } else if (data == EOF) { | |
1755 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1756 currentDoctypeToken.correct = false; | |
1757 _addToken(currentToken); | |
1758 state = dataState; | |
1759 } else { | |
1760 stream.unget(data); | |
1761 state = beforeDoctypeSystemIdentifierState; | |
1762 } | |
1763 return true; | |
1764 } | |
1765 | |
  /// Skips whitespace before the system identifier; a quote opens it, and
  /// anything else makes the doctype bogus.
  bool beforeDoctypeSystemIdentifierState() {
    var data = stream.char();
    if (isWhitespace(data)) {
      return true;
    } else if (data == "\"") {
      currentDoctypeToken.systemId = "";
      state = doctypeSystemIdentifierDoubleQuotedState;
    } else if (data == "'") {
      currentDoctypeToken.systemId = "";
      state = doctypeSystemIdentifierSingleQuotedState;
    } else if (data == ">") {
      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
      currentDoctypeToken.correct = false;
      _addToken(currentToken);
      state = dataState;
    } else if (data == EOF) {
      _addToken(new ParseErrorToken("eof-in-doctype"));
      currentDoctypeToken.correct = false;
      _addToken(currentToken);
      state = dataState;
    } else {
      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
      currentDoctypeToken.correct = false;
      state = bogusDoctypeState;
    }
    return true;
  }
1793 | |
1794 bool doctypeSystemIdentifierDoubleQuotedState() { | |
1795 var data = stream.char(); | |
1796 if (data == "\"") { | |
1797 state = afterDoctypeSystemIdentifierState; | |
1798 } else if (data == "\u0000") { | |
1799 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1800 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD"; | |
1801 } else if (data == ">") { | |
1802 _addToken(new ParseErrorToken("unexpected-end-of-doctype")); | |
1803 currentDoctypeToken.correct = false; | |
1804 _addToken(currentToken); | |
1805 state = dataState; | |
1806 } else if (data == EOF) { | |
1807 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1808 currentDoctypeToken.correct = false; | |
1809 _addToken(currentToken); | |
1810 state = dataState; | |
1811 } else { | |
1812 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data'; | |
1813 } | |
1814 return true; | |
1815 } | |
1816 | |
1817 bool doctypeSystemIdentifierSingleQuotedState() { | |
1818 var data = stream.char(); | |
1819 if (data == "'") { | |
1820 state = afterDoctypeSystemIdentifierState; | |
1821 } else if (data == "\u0000") { | |
1822 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1823 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD"; | |
1824 } else if (data == ">") { | |
1825 _addToken(new ParseErrorToken("unexpected-end-of-doctype")); | |
1826 currentDoctypeToken.correct = false; | |
1827 _addToken(currentToken); | |
1828 state = dataState; | |
1829 } else if (data == EOF) { | |
1830 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1831 currentDoctypeToken.correct = false; | |
1832 _addToken(currentToken); | |
1833 state = dataState; | |
1834 } else { | |
1835 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data'; | |
1836 } | |
1837 return true; | |
1838 } | |
1839 | |
1840 bool afterDoctypeSystemIdentifierState() { | |
1841 var data = stream.char(); | |
1842 if (isWhitespace(data)) { | |
1843 return true; | |
1844 } else if (data == ">") { | |
1845 _addToken(currentToken); | |
1846 state = dataState; | |
1847 } else if (data == EOF) { | |
1848 _addToken(new ParseErrorToken("eof-in-doctype")); | |
1849 currentDoctypeToken.correct = false; | |
1850 _addToken(currentToken); | |
1851 state = dataState; | |
1852 } else { | |
1853 _addToken(new ParseErrorToken("unexpected-char-in-doctype")); | |
1854 state = bogusDoctypeState; | |
1855 } | |
1856 return true; | |
1857 } | |
1858 | |
1859 bool bogusDoctypeState() { | |
1860 var data = stream.char(); | |
1861 if (data == ">") { | |
1862 _addToken(currentToken); | |
1863 state = dataState; | |
1864 } else if (data == EOF) { | |
1865 // XXX EMIT | |
1866 stream.unget(data); | |
1867 _addToken(currentToken); | |
1868 state = dataState; | |
1869 } | |
1870 return true; | |
1871 } | |
1872 | |
1873 bool cdataSectionState() { | |
1874 var data = []; | |
1875 int matchedEnd = 0; | |
1876 while (true) { | |
1877 var ch = stream.char(); | |
1878 if (ch == EOF) { | |
1879 break; | |
1880 } | |
1881 // Deal with null here rather than in the parser | |
1882 if (ch == "\u0000") { | |
1883 _addToken(new ParseErrorToken("invalid-codepoint")); | |
1884 ch = "\uFFFD"; | |
1885 } | |
1886 data.add(ch); | |
1887 // TODO(jmesserly): it'd be nice if we had an easier way to match the end, | |
1888 // perhaps with a "peek" API. | |
1889 if (ch == "]" && matchedEnd < 2) { | |
1890 matchedEnd++; | |
1891 } else if (ch == ">" && matchedEnd == 2) { | |
1892 // Remove "]]>" from the end. | |
1893 data.removeLast(); | |
1894 data.removeLast(); | |
1895 data.removeLast(); | |
1896 break; | |
1897 } else { | |
1898 matchedEnd = 0; | |
1899 } | |
1900 } | |
1901 | |
1902 if (data.length > 0) { | |
1903 _addToken(new CharactersToken(data.join())); | |
1904 } | |
1905 state = dataState; | |
1906 return true; | |
1907 } | |
1908 } | |
OLD | NEW |