OLD | NEW |
1 library tokenizer; | 1 library tokenizer; |
2 | 2 |
3 import 'dart:collection'; | 3 import 'dart:collection'; |
4 import 'package:html5lib/parser.dart' show HtmlParser; | 4 import 'package:html5lib/parser.dart' show HtmlParser; |
5 import 'package:source_maps/span.dart' show Span, FileSpan; | 5 import 'package:source_maps/span.dart' show Span, FileSpan; |
6 import 'constants.dart'; | 6 import 'constants.dart'; |
7 import 'inputstream.dart'; | 7 import 'inputstream.dart'; |
8 import 'token.dart'; | 8 import 'token.dart'; |
9 import 'utils.dart'; | 9 import 'utils.dart'; |
10 | 10 |
11 // Group entities by their first character, for faster lookups | 11 // Group entities by their first character, for faster lookups |
12 | 12 |
13 // TODO(jmesserly): we could use a better data structure here like a trie, if | 13 // TODO(jmesserly): we could use a better data structure here like a trie, if |
14 // we had it implemented in Dart. | 14 // we had it implemented in Dart. |
15 Map<String, List<String>> entitiesByFirstChar = (() { | 15 Map<String, List<String>> entitiesByFirstChar = (() { |
16 var result = {}; | 16 var result = {}; |
17 for (var k in entities.keys) { | 17 for (var k in entities.keys) { |
18 result.putIfAbsent(k[0], () => []).add(k); | 18 result.putIfAbsent(k[0], () => []).add(k); |
19 } | 19 } |
20 return result; | 20 return result; |
21 })(); | 21 })(); |
22 | 22 |
23 // TODO(jmesserly): lots of ways to make this faster: | 23 // TODO(jmesserly): lots of ways to make this faster: |
24 // - use char codes everywhere instead of 1-char strings | 24 // - use char codes everywhere instead of 1-char strings |
25 // - use switch instead of contains, indexOf | 25 // - use switch instead of contains, indexOf |
26 // - use switch instead of the sequential if tests | 26 // - use switch instead of the sequential if tests |
27 // - avoid string concat | 27 // - avoid string concat |
28 | 28 |
29 /** | 29 /// This class takes care of tokenizing HTML. |
30 * This class takes care of tokenizing HTML. | |
31 */ | |
32 class HtmlTokenizer implements Iterator<Token> { | 30 class HtmlTokenizer implements Iterator<Token> { |
33 // TODO(jmesserly): a lot of these could be made private | 31 // TODO(jmesserly): a lot of these could be made private |
34 | 32 |
35 final HtmlInputStream stream; | 33 final HtmlInputStream stream; |
36 | 34 |
37 final bool lowercaseElementName; | 35 final bool lowercaseElementName; |
38 | 36 |
39 final bool lowercaseAttrName; | 37 final bool lowercaseAttrName; |
40 | 38 |
41 /** True to generate spans for [Token.span]. */ | 39 /// True to generate spans for [Token.span]. |
42 final bool generateSpans; | 40 final bool generateSpans; |
43 | 41 |
44 /** True to generate spans for attributes. */ | 42 /// True to generate spans for attributes. |
45 final bool attributeSpans; | 43 final bool attributeSpans; |
46 | 44 |
47 /** | 45 /// This reference to the parser is used for correct CDATA handling. |
48 * This reference to the parser is used for correct CDATA handling. | 46 /// The [HtmlParser] will set this at construction time. |
49 * The [HtmlParser] will set this at construction time. | |
50 */ | |
51 HtmlParser parser; | 47 HtmlParser parser; |
52 | 48 |
53 final Queue<Token> tokenQueue; | 49 final Queue<Token> tokenQueue; |
54 | 50 |
55 /** Holds the token that is currently being processed. */ | 51 /// Holds the token that is currently being processed. |
56 Token currentToken; | 52 Token currentToken; |
57 | 53 |
58 /** | 54 /// Holds a reference to the method to be invoked for the next parser state. |
59 * Holds a reference to the method to be invoked for the next parser state. | |
60 */ | |
61 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode | 55 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode |
62 // bug prevents us from doing that. See http://dartbug.com/12465 | 56 // bug prevents us from doing that. See http://dartbug.com/12465 |
63 Function state; | 57 Function state; |
64 | 58 |
65 String temporaryBuffer; | 59 String temporaryBuffer; |
66 | 60 |
67 int _lastOffset; | 61 int _lastOffset; |
68 | 62 |
69 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add | 63 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add |
70 // an item until it's ready. But the code doesn't have a clear notion of when | 64 // an item until it's ready. But the code doesn't have a clear notion of when |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
117 // Note: we could track the name span here, if we need it. | 111 // Note: we could track the name span here, if we need it. |
118 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset); | 112 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset); |
119 | 113 |
120 void _addAttribute(String name) { | 114 void _addAttribute(String name) { |
121 if (_attributes == null) _attributes = []; | 115 if (_attributes == null) _attributes = []; |
122 var attr = new TagAttribute(name); | 116 var attr = new TagAttribute(name); |
123 _attributes.add(attr); | 117 _attributes.add(attr); |
124 if (attributeSpans) attr.start = stream.position - name.length; | 118 if (attributeSpans) attr.start = stream.position - name.length; |
125 } | 119 } |
126 | 120 |
127 /** | 121 /// This is where the magic happens. |
128 * This is where the magic happens. | 122 /// |
129 * | 123 /// We do our usual processing through the states and when we have a token |
130 * We do our usual processing through the states and when we have a token | 124 /// to return we yield the token which pauses processing until the next token |
131 * to return we yield the token which pauses processing until the next token | 125 /// is requested. |
132 * is requested. | |
133 */ | |
134 bool moveNext() { | 126 bool moveNext() { |
135 // Start processing. When EOF is reached, state will return false | 127 // Start processing. When EOF is reached, state will return false |
136 // instead of true and the loop will terminate. | 128 // instead of true and the loop will terminate. |
137 while (stream.errors.length == 0 && tokenQueue.length == 0) { | 129 while (stream.errors.length == 0 && tokenQueue.length == 0) { |
138 if (!state()) { | 130 if (!state()) { |
139 _current = null; | 131 _current = null; |
140 return false; | 132 return false; |
141 } | 133 } |
142 } | 134 } |
143 if (stream.errors.length > 0) { | 135 if (stream.errors.length > 0) { |
144 _current = new ParseErrorToken(stream.errors.removeFirst()); | 136 _current = new ParseErrorToken(stream.errors.removeFirst()); |
145 } else { | 137 } else { |
146 assert (tokenQueue.length > 0); | 138 assert (tokenQueue.length > 0); |
147 _current = tokenQueue.removeFirst(); | 139 _current = tokenQueue.removeFirst(); |
148 } | 140 } |
149 return true; | 141 return true; |
150 } | 142 } |
151 | 143 |
152 /** | 144 /// Resets the tokenizer state. Calling this does not reset the [stream] or |
153 * Resets the tokenizer state. Calling this does not reset the [stream] or | 145 /// the [parser]. |
154 * the [parser]. | |
155 */ | |
156 void reset() { | 146 void reset() { |
157 _lastOffset = 0; | 147 _lastOffset = 0; |
158 tokenQueue.clear(); | 148 tokenQueue.clear(); |
159 currentToken = null; | 149 currentToken = null; |
160 temporaryBuffer = null; | 150 temporaryBuffer = null; |
161 _attributes = null; | 151 _attributes = null; |
162 _attributeNames = null; | 152 _attributeNames = null; |
163 state = dataState; | 153 state = dataState; |
164 } | 154 } |
165 | 155 |
166 /** Adds a token to the queue. Sets the span if needed. */ | 156 /// Adds a token to the queue. Sets the span if needed. |
167 void _addToken(Token token) { | 157 void _addToken(Token token) { |
168 if (generateSpans && token.span == null) { | 158 if (generateSpans && token.span == null) { |
169 int offset = stream.position; | 159 int offset = stream.position; |
170 token.span = new FileSpan(stream.fileInfo, _lastOffset, offset); | 160 token.span = new FileSpan(stream.fileInfo, _lastOffset, offset); |
171 if (token is! ParseErrorToken) { | 161 if (token is! ParseErrorToken) { |
172 _lastOffset = offset; | 162 _lastOffset = offset; |
173 } | 163 } |
174 } | 164 } |
175 tokenQueue.add(token); | 165 tokenQueue.add(token); |
176 } | 166 } |
177 | 167 |
178 /** | 168 /// This function returns either U+FFFD or the character based on the |
179 * This function returns either U+FFFD or the character based on the | 169 /// decimal or hexadecimal representation. It also discards ";" if present. |
180 * decimal or hexadecimal representation. It also discards ";" if present. | 170 /// If not present it will add a [ParseErrorToken]. |
181 * If not present it will add a [ParseErrorToken]. | |
182 */ | |
183 String consumeNumberEntity(bool isHex) { | 171 String consumeNumberEntity(bool isHex) { |
184 var allowed = isDigit; | 172 var allowed = isDigit; |
185 var radix = 10; | 173 var radix = 10; |
186 if (isHex) { | 174 if (isHex) { |
187 allowed = isHexDigit; | 175 allowed = isHexDigit; |
188 radix = 16; | 176 radix = 16; |
189 } | 177 } |
190 | 178 |
191 var charStack = []; | 179 var charStack = []; |
192 | 180 |
(...skipping 145 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
338 var token; | 326 var token; |
339 if (isWhitespace(output)) { | 327 if (isWhitespace(output)) { |
340 token = new SpaceCharactersToken(output); | 328 token = new SpaceCharactersToken(output); |
341 } else { | 329 } else { |
342 token = new CharactersToken(output); | 330 token = new CharactersToken(output); |
343 } | 331 } |
344 _addToken(token); | 332 _addToken(token); |
345 } | 333 } |
346 } | 334 } |
347 | 335 |
348 /** This method replaces the need for "entityInAttributeValueState". */ | 336 /// This method replaces the need for "entityInAttributeValueState". |
349 void processEntityInAttribute(String allowedChar) { | 337 void processEntityInAttribute(String allowedChar) { |
350 consumeEntity(allowedChar: allowedChar, fromAttribute: true); | 338 consumeEntity(allowedChar: allowedChar, fromAttribute: true); |
351 } | 339 } |
352 | 340 |
353 /** | 341 /// This method is a generic handler for emitting the tags. It also sets |
354 * This method is a generic handler for emitting the tags. It also sets | 342 /// the state to "data" because that's what's needed after a token has been |
355 * the state to "data" because that's what's needed after a token has been | 343 /// emitted. |
356 * emitted. | |
357 */ | |
358 void emitCurrentToken() { | 344 void emitCurrentToken() { |
359 var token = currentToken; | 345 var token = currentToken; |
360 // Add token to the queue to be yielded | 346 // Add token to the queue to be yielded |
361 if (token is TagToken) { | 347 if (token is TagToken) { |
362 if (lowercaseElementName) { | 348 if (lowercaseElementName) { |
363 token.name = asciiUpper2Lower(token.name); | 349 token.name = asciiUpper2Lower(token.name); |
364 } | 350 } |
365 if (token is EndTagToken) { | 351 if (token is EndTagToken) { |
366 if (_attributes != null) { | 352 if (_attributes != null) { |
367 _addToken(new ParseErrorToken("attributes-in-end-tag")); | 353 _addToken(new ParseErrorToken("attributes-in-end-tag")); |
(...skipping 1524 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1892 } | 1878 } |
1893 | 1879 |
1894 if (data.length > 0) { | 1880 if (data.length > 0) { |
1895 _addToken(new CharactersToken(data.join())); | 1881 _addToken(new CharactersToken(data.join())); |
1896 } | 1882 } |
1897 state = dataState; | 1883 state = dataState; |
1898 return true; | 1884 return true; |
1899 } | 1885 } |
1900 } | 1886 } |
1901 | 1887 |
OLD | NEW |