| OLD | NEW |
| 1 library tokenizer; | 1 library tokenizer; |
| 2 | 2 |
| 3 import 'dart:collection'; | 3 import 'dart:collection'; |
| 4 import 'package:html5lib/parser.dart' show HtmlParser; | 4 import 'package:html5lib/parser.dart' show HtmlParser; |
| 5 import 'package:source_maps/span.dart' show Span, FileSpan; | 5 import 'package:source_maps/span.dart' show Span, FileSpan; |
| 6 import 'constants.dart'; | 6 import 'constants.dart'; |
| 7 import 'inputstream.dart'; | 7 import 'inputstream.dart'; |
| 8 import 'token.dart'; | 8 import 'token.dart'; |
| 9 import 'utils.dart'; | 9 import 'utils.dart'; |
| 10 | 10 |
| 11 // Group entities by their first character, for faster lookups | 11 // Group entities by their first character, for faster lookups |
| 12 | 12 |
| 13 // TODO(jmesserly): we could use a better data structure here like a trie, if | 13 // TODO(jmesserly): we could use a better data structure here like a trie, if |
| 14 // we had it implemented in Dart. | 14 // we had it implemented in Dart. |
| 15 Map<String, List<String>> entitiesByFirstChar = (() { | 15 Map<String, List<String>> entitiesByFirstChar = (() { |
| 16 var result = {}; | 16 var result = {}; |
| 17 for (var k in entities.keys) { | 17 for (var k in entities.keys) { |
| 18 result.putIfAbsent(k[0], () => []).add(k); | 18 result.putIfAbsent(k[0], () => []).add(k); |
| 19 } | 19 } |
| 20 return result; | 20 return result; |
| 21 })(); | 21 })(); |
| 22 | 22 |
| 23 // TODO(jmesserly): lots of ways to make this faster: | 23 // TODO(jmesserly): lots of ways to make this faster: |
| 24 // - use char codes everywhere instead of 1-char strings | 24 // - use char codes everywhere instead of 1-char strings |
| 25 // - use switch instead of contains, indexOf | 25 // - use switch instead of contains, indexOf |
| 26 // - use switch instead of the sequential if tests | 26 // - use switch instead of the sequential if tests |
| 27 // - avoid string concat | 27 // - avoid string concat |
| 28 | 28 |
| 29 /** | 29 /// This class takes care of tokenizing HTML. |
| 30 * This class takes care of tokenizing HTML. | |
| 31 */ | |
| 32 class HtmlTokenizer implements Iterator<Token> { | 30 class HtmlTokenizer implements Iterator<Token> { |
| 33 // TODO(jmesserly): a lot of these could be made private | 31 // TODO(jmesserly): a lot of these could be made private |
| 34 | 32 |
| 35 final HtmlInputStream stream; | 33 final HtmlInputStream stream; |
| 36 | 34 |
| 37 final bool lowercaseElementName; | 35 final bool lowercaseElementName; |
| 38 | 36 |
| 39 final bool lowercaseAttrName; | 37 final bool lowercaseAttrName; |
| 40 | 38 |
| 41 /** True to generate spans for [Token.span]. */ | 39 /// True to generate spans for [Token.span]. |
| 42 final bool generateSpans; | 40 final bool generateSpans; |
| 43 | 41 |
| 44 /** True to generate spans for attributes. */ | 42 /// True to generate spans for attributes. |
| 45 final bool attributeSpans; | 43 final bool attributeSpans; |
| 46 | 44 |
| 47 /** | 45 /// This reference to the parser is used for correct CDATA handling. |
| 48 * This reference to the parser is used for correct CDATA handling. | 46 /// The [HtmlParser] will set this at construction time. |
| 49 * The [HtmlParser] will set this at construction time. | |
| 50 */ | |
| 51 HtmlParser parser; | 47 HtmlParser parser; |
| 52 | 48 |
| 53 final Queue<Token> tokenQueue; | 49 final Queue<Token> tokenQueue; |
| 54 | 50 |
| 55 /** Holds the token that is currently being processed. */ | 51 /// Holds the token that is currently being processed. |
| 56 Token currentToken; | 52 Token currentToken; |
| 57 | 53 |
| 58 /** | 54 /// Holds a reference to the method to be invoked for the next parser state. |
| 59 * Holds a reference to the method to be invoked for the next parser state. | |
| 60 */ | |
| 61 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode | 55 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode |
| 62 // bug prevents us from doing that. See http://dartbug.com/12465 | 56 // bug prevents us from doing that. See http://dartbug.com/12465 |
| 63 Function state; | 57 Function state; |
| 64 | 58 |
| 65 String temporaryBuffer; | 59 String temporaryBuffer; |
| 66 | 60 |
| 67 int _lastOffset; | 61 int _lastOffset; |
| 68 | 62 |
| 69 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add | 63 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add |
| 70 // an item until it's ready. But the code doesn't have a clear notion of when | 64 // an item until it's ready. But the code doesn't have a clear notion of when |
| (...skipping 46 matching lines...) | |
| 117 // Note: we could track the name span here, if we need it. | 111 // Note: we could track the name span here, if we need it. |
| 118 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset); | 112 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset); |
| 119 | 113 |
| 120 void _addAttribute(String name) { | 114 void _addAttribute(String name) { |
| 121 if (_attributes == null) _attributes = []; | 115 if (_attributes == null) _attributes = []; |
| 122 var attr = new TagAttribute(name); | 116 var attr = new TagAttribute(name); |
| 123 _attributes.add(attr); | 117 _attributes.add(attr); |
| 124 if (attributeSpans) attr.start = stream.position - name.length; | 118 if (attributeSpans) attr.start = stream.position - name.length; |
| 125 } | 119 } |
| 126 | 120 |
| 127 /** | 121 /// This is where the magic happens. |
| 128 * This is where the magic happens. | 122 /// |
| 129 * | 123 /// We do our usual processing through the states and when we have a token |
| 130 * We do our usual processing through the states and when we have a token | 124 /// to return we yield the token which pauses processing until the next token |
| 131 * to return we yield the token which pauses processing until the next token | 125 /// is requested. |
| 132 * is requested. | |
| 133 */ | |
| 134 bool moveNext() { | 126 bool moveNext() { |
| 135 // Start processing. When EOF is reached state will return false; | 127 // Start processing. When EOF is reached state will return false; |
| 136 // instead of true and the loop will terminate. | 128 // instead of true and the loop will terminate. |
| 137 while (stream.errors.length == 0 && tokenQueue.length == 0) { | 129 while (stream.errors.length == 0 && tokenQueue.length == 0) { |
| 138 if (!state()) { | 130 if (!state()) { |
| 139 _current = null; | 131 _current = null; |
| 140 return false; | 132 return false; |
| 141 } | 133 } |
| 142 } | 134 } |
| 143 if (stream.errors.length > 0) { | 135 if (stream.errors.length > 0) { |
| 144 _current = new ParseErrorToken(stream.errors.removeFirst()); | 136 _current = new ParseErrorToken(stream.errors.removeFirst()); |
| 145 } else { | 137 } else { |
| 146 assert (tokenQueue.length > 0); | 138 assert (tokenQueue.length > 0); |
| 147 _current = tokenQueue.removeFirst(); | 139 _current = tokenQueue.removeFirst(); |
| 148 } | 140 } |
| 149 return true; | 141 return true; |
| 150 } | 142 } |
| 151 | 143 |
| 152 /** | 144 /// Resets the tokenizer state. Calling this does not reset the [stream] or |
| 153 * Resets the tokenizer state. Calling this does not reset the [stream] or | 145 /// the [parser]. |
| 154 * the [parser]. | |
| 155 */ | |
| 156 void reset() { | 146 void reset() { |
| 157 _lastOffset = 0; | 147 _lastOffset = 0; |
| 158 tokenQueue.clear(); | 148 tokenQueue.clear(); |
| 159 currentToken = null; | 149 currentToken = null; |
| 160 temporaryBuffer = null; | 150 temporaryBuffer = null; |
| 161 _attributes = null; | 151 _attributes = null; |
| 162 _attributeNames = null; | 152 _attributeNames = null; |
| 163 state = dataState; | 153 state = dataState; |
| 164 } | 154 } |
| 165 | 155 |
| 166 /** Adds a token to the queue. Sets the span if needed. */ | 156 /// Adds a token to the queue. Sets the span if needed. |
| 167 void _addToken(Token token) { | 157 void _addToken(Token token) { |
| 168 if (generateSpans && token.span == null) { | 158 if (generateSpans && token.span == null) { |
| 169 int offset = stream.position; | 159 int offset = stream.position; |
| 170 token.span = new FileSpan(stream.fileInfo, _lastOffset, offset); | 160 token.span = new FileSpan(stream.fileInfo, _lastOffset, offset); |
| 171 if (token is! ParseErrorToken) { | 161 if (token is! ParseErrorToken) { |
| 172 _lastOffset = offset; | 162 _lastOffset = offset; |
| 173 } | 163 } |
| 174 } | 164 } |
| 175 tokenQueue.add(token); | 165 tokenQueue.add(token); |
| 176 } | 166 } |
| 177 | 167 |
| 178 /** | 168 /// This function returns either U+FFFD or the character based on the |
| 179 * This function returns either U+FFFD or the character based on the | 169 /// decimal or hexadecimal representation. It also discards ";" if present. |
| 180 * decimal or hexadecimal representation. It also discards ";" if present. | 170 /// If not present it will add a [ParseErrorToken]. |
| 181 * If not present it will add a [ParseErrorToken]. | |
| 182 */ | |
| 183 String consumeNumberEntity(bool isHex) { | 171 String consumeNumberEntity(bool isHex) { |
| 184 var allowed = isDigit; | 172 var allowed = isDigit; |
| 185 var radix = 10; | 173 var radix = 10; |
| 186 if (isHex) { | 174 if (isHex) { |
| 187 allowed = isHexDigit; | 175 allowed = isHexDigit; |
| 188 radix = 16; | 176 radix = 16; |
| 189 } | 177 } |
| 190 | 178 |
| 191 var charStack = []; | 179 var charStack = []; |
| 192 | 180 |
| (...skipping 145 matching lines...) | |
| 338 var token; | 326 var token; |
| 339 if (isWhitespace(output)) { | 327 if (isWhitespace(output)) { |
| 340 token = new SpaceCharactersToken(output); | 328 token = new SpaceCharactersToken(output); |
| 341 } else { | 329 } else { |
| 342 token = new CharactersToken(output); | 330 token = new CharactersToken(output); |
| 343 } | 331 } |
| 344 _addToken(token); | 332 _addToken(token); |
| 345 } | 333 } |
| 346 } | 334 } |
| 347 | 335 |
| 348 /** This method replaces the need for "entityInAttributeValueState". */ | 336 /// This method replaces the need for "entityInAttributeValueState". |
| 349 void processEntityInAttribute(String allowedChar) { | 337 void processEntityInAttribute(String allowedChar) { |
| 350 consumeEntity(allowedChar: allowedChar, fromAttribute: true); | 338 consumeEntity(allowedChar: allowedChar, fromAttribute: true); |
| 351 } | 339 } |
| 352 | 340 |
| 353 /** | 341 /// This method is a generic handler for emitting the tags. It also sets |
| 354 * This method is a generic handler for emitting the tags. It also sets | 342 /// the state to "data" because that's what's needed after a token has been |
| 355 * the state to "data" because that's what's needed after a token has been | 343 /// emitted. |
| 356 * emitted. | |
| 357 */ | |
| 358 void emitCurrentToken() { | 344 void emitCurrentToken() { |
| 359 var token = currentToken; | 345 var token = currentToken; |
| 360 // Add token to the queue to be yielded | 346 // Add token to the queue to be yielded |
| 361 if (token is TagToken) { | 347 if (token is TagToken) { |
| 362 if (lowercaseElementName) { | 348 if (lowercaseElementName) { |
| 363 token.name = asciiUpper2Lower(token.name); | 349 token.name = asciiUpper2Lower(token.name); |
| 364 } | 350 } |
| 365 if (token is EndTagToken) { | 351 if (token is EndTagToken) { |
| 366 if (_attributes != null) { | 352 if (_attributes != null) { |
| 367 _addToken(new ParseErrorToken("attributes-in-end-tag")); | 353 _addToken(new ParseErrorToken("attributes-in-end-tag")); |
| (...skipping 1524 matching lines...) | |
| 1892 } | 1878 } |
| 1893 | 1879 |
| 1894 if (data.length > 0) { | 1880 if (data.length > 0) { |
| 1895 _addToken(new CharactersToken(data.join())); | 1881 _addToken(new CharactersToken(data.join())); |
| 1896 } | 1882 } |
| 1897 state = dataState; | 1883 state = dataState; |
| 1898 return true; | 1884 return true; |
| 1899 } | 1885 } |
| 1900 } | 1886 } |
| 1901 | 1887 |
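
For readers skimming the diff: `HtmlTokenizer` implements `Iterator<Token>`, so the `moveNext()`/`current` pair documented above is the entire pumping interface. Below is a minimal consumption sketch in Dart; the helper name `dumpTokens` and the way the tokenizer instance is obtained are assumptions for illustration, not part of this change.

```dart
// Minimal sketch: pump tokens out of any Iterator<Token>, such as an
// HtmlTokenizer instance constructed elsewhere (its constructor is not
// shown in this diff).
void dumpTokens(Iterator<Token> tokenizer) {
  // moveNext() runs the state machine until a token (or a queued parse
  // error) is available, exposes it via `current`, and returns false once
  // the input stream is exhausted.
  while (tokenizer.moveNext()) {
    var token = tokenizer.current;
    if (token is ParseErrorToken) {
      // token.span is only populated when generateSpans is true.
      print('parse error: ${token.span}');
    } else {
      print(token);
    }
  }
}
```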
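
The `consumeNumberEntity()` doc comment above describes behaviour whose body falls inside a collapsed region of the diff. As a rough, hypothetical sketch of just the digits-to-character step (the real method also consults the HTML replacement-character table, handles surrogate code points, and queues `ParseErrorToken`s):

```dart
// Simplified, hypothetical helper; not part of the library.
String digitsToChar(String digits, bool isHex) {
  var codePoint = int.parse(digits, radix: isHex ? 16 : 10);
  // Out-of-range code points become U+FFFD; the real method also remaps a
  // table of invalid values such as 0x00 and the 0x80-0x9F range.
  if (codePoint <= 0 || codePoint > 0x10FFFF) return '\uFFFD';
  return new String.fromCharCode(codePoint);
}
```

For example, `digitsToChar('41', true)` and `digitsToChar('65', false)` both yield `'A'`, matching what `&#x41;` and `&#65;` decode to.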