Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1230)

Side by Side Diff: pkg/third_party/html5lib/lib/src/tokenizer.dart

Issue 178843003: [html5lib] triple slash comment style (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: remove extra check Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 library tokenizer; 1 library tokenizer;
2 2
3 import 'dart:collection'; 3 import 'dart:collection';
4 import 'package:html5lib/parser.dart' show HtmlParser; 4 import 'package:html5lib/parser.dart' show HtmlParser;
5 import 'package:source_maps/span.dart' show Span, FileSpan; 5 import 'package:source_maps/span.dart' show Span, FileSpan;
6 import 'constants.dart'; 6 import 'constants.dart';
7 import 'inputstream.dart'; 7 import 'inputstream.dart';
8 import 'token.dart'; 8 import 'token.dart';
9 import 'utils.dart'; 9 import 'utils.dart';
10 10
11 // Group entities by their first character, for faster lookups 11 // Group entities by their first character, for faster lookups
12 12
13 // TODO(jmesserly): we could use a better data structure here like a trie, if 13 // TODO(jmesserly): we could use a better data structure here like a trie, if
14 // we had it implemented in Dart. 14 // we had it implemented in Dart.
15 Map<String, List<String>> entitiesByFirstChar = (() { 15 Map<String, List<String>> entitiesByFirstChar = (() {
16 var result = {}; 16 var result = {};
17 for (var k in entities.keys) { 17 for (var k in entities.keys) {
18 result.putIfAbsent(k[0], () => []).add(k); 18 result.putIfAbsent(k[0], () => []).add(k);
19 } 19 }
20 return result; 20 return result;
21 })(); 21 })();
22 22
23 // TODO(jmesserly): lots of ways to make this faster: 23 // TODO(jmesserly): lots of ways to make this faster:
24 // - use char codes everywhere instead of 1-char strings 24 // - use char codes everywhere instead of 1-char strings
25 // - use switch instead of contains, indexOf 25 // - use switch instead of contains, indexOf
26 // - use switch instead of the sequential if tests 26 // - use switch instead of the sequential if tests
27 // - avoid string concat 27 // - avoid string concat
28 28
29 /** 29 /// This class takes care of tokenizing HTML.
30 * This class takes care of tokenizing HTML.
31 */
32 class HtmlTokenizer implements Iterator<Token> { 30 class HtmlTokenizer implements Iterator<Token> {
33 // TODO(jmesserly): a lot of these could be made private 31 // TODO(jmesserly): a lot of these could be made private
34 32
35 final HtmlInputStream stream; 33 final HtmlInputStream stream;
36 34
37 final bool lowercaseElementName; 35 final bool lowercaseElementName;
38 36
39 final bool lowercaseAttrName; 37 final bool lowercaseAttrName;
40 38
41 /** True to generate spans for [Token.span]. */ 39 /// True to generate spans for [Token.span].
42 final bool generateSpans; 40 final bool generateSpans;
43 41
44 /** True to generate spans for attributes. */ 42 /// True to generate spans for attributes.
45 final bool attributeSpans; 43 final bool attributeSpans;
46 44
47 /** 45 /// This reference to the parser is used for correct CDATA handling.
48 * This reference to the parser is used for correct CDATA handling. 46 /// The [HtmlParser] will set this at construction time.
49 * The [HtmlParser] will set this at construction time.
50 */
51 HtmlParser parser; 47 HtmlParser parser;
52 48
53 final Queue<Token> tokenQueue; 49 final Queue<Token> tokenQueue;
54 50
55 /** Holds the token that is currently being processed. */ 51 /// Holds the token that is currently being processed.
56 Token currentToken; 52 Token currentToken;
57 53
58 /** 54 /// Holds a reference to the method to be invoked for the next parser state.
59 * Holds a reference to the method to be invoked for the next parser state.
60 */
61 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode 55 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode
62 // bug prevents us from doing that. See http://dartbug.com/12465 56 // bug prevents us from doing that. See http://dartbug.com/12465
63 Function state; 57 Function state;
64 58
65 String temporaryBuffer; 59 String temporaryBuffer;
66 60
67 int _lastOffset; 61 int _lastOffset;
68 62
69 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add 63 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add
70 // an item until it's ready. But the code doesn't have a clear notion of when 64 // an item until it's ready. But the code doesn't have a clear notion of when
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
117 // Note: we could track the name span here, if we need it. 111 // Note: we could track the name span here, if we need it.
118 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset); 112 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset);
119 113
120 void _addAttribute(String name) { 114 void _addAttribute(String name) {
121 if (_attributes == null) _attributes = []; 115 if (_attributes == null) _attributes = [];
122 var attr = new TagAttribute(name); 116 var attr = new TagAttribute(name);
123 _attributes.add(attr); 117 _attributes.add(attr);
124 if (attributeSpans) attr.start = stream.position - name.length; 118 if (attributeSpans) attr.start = stream.position - name.length;
125 } 119 }
126 120
127 /** 121 /// This is where the magic happens.
128 * This is where the magic happens. 122 ///
129 * 123 /// We do our usual processing through the states and when we have a token
130 * We do our usual processing through the states and when we have a token 124 /// to return we yield the token which pauses processing until the next token
131 * to return we yield the token which pauses processing until the next token 125 /// is requested.
132 * is requested.
133 */
134 bool moveNext() { 126 bool moveNext() {
135 // Start processing. When EOF is reached, state will return false 127 // Start processing. When EOF is reached, state will return false
136 // instead of true and the loop will terminate. 128 // instead of true and the loop will terminate.
137 while (stream.errors.length == 0 && tokenQueue.length == 0) { 129 while (stream.errors.length == 0 && tokenQueue.length == 0) {
138 if (!state()) { 130 if (!state()) {
139 _current = null; 131 _current = null;
140 return false; 132 return false;
141 } 133 }
142 } 134 }
143 if (stream.errors.length > 0) { 135 if (stream.errors.length > 0) {
144 _current = new ParseErrorToken(stream.errors.removeFirst()); 136 _current = new ParseErrorToken(stream.errors.removeFirst());
145 } else { 137 } else {
146 assert (tokenQueue.length > 0); 138 assert (tokenQueue.length > 0);
147 _current = tokenQueue.removeFirst(); 139 _current = tokenQueue.removeFirst();
148 } 140 }
149 return true; 141 return true;
150 } 142 }
151 143
152 /** 144 /// Resets the tokenizer state. Calling this does not reset the [stream] or
153 * Resets the tokenizer state. Calling this does not reset the [stream] or 145 /// the [parser].
154 * the [parser].
155 */
156 void reset() { 146 void reset() {
157 _lastOffset = 0; 147 _lastOffset = 0;
158 tokenQueue.clear(); 148 tokenQueue.clear();
159 currentToken = null; 149 currentToken = null;
160 temporaryBuffer = null; 150 temporaryBuffer = null;
161 _attributes = null; 151 _attributes = null;
162 _attributeNames = null; 152 _attributeNames = null;
163 state = dataState; 153 state = dataState;
164 } 154 }
165 155
166 /** Adds a token to the queue. Sets the span if needed. */ 156 /// Adds a token to the queue. Sets the span if needed.
167 void _addToken(Token token) { 157 void _addToken(Token token) {
168 if (generateSpans && token.span == null) { 158 if (generateSpans && token.span == null) {
169 int offset = stream.position; 159 int offset = stream.position;
170 token.span = new FileSpan(stream.fileInfo, _lastOffset, offset); 160 token.span = new FileSpan(stream.fileInfo, _lastOffset, offset);
171 if (token is! ParseErrorToken) { 161 if (token is! ParseErrorToken) {
172 _lastOffset = offset; 162 _lastOffset = offset;
173 } 163 }
174 } 164 }
175 tokenQueue.add(token); 165 tokenQueue.add(token);
176 } 166 }
177 167
178 /** 168 /// This function returns either U+FFFD or the character based on the
179 * This function returns either U+FFFD or the character based on the 169 /// decimal or hexadecimal representation. It also discards ";" if present.
180 * decimal or hexadecimal representation. It also discards ";" if present. 170 /// If not present it will add a [ParseErrorToken].
181 * If not present it will add a [ParseErrorToken].
182 */
183 String consumeNumberEntity(bool isHex) { 171 String consumeNumberEntity(bool isHex) {
184 var allowed = isDigit; 172 var allowed = isDigit;
185 var radix = 10; 173 var radix = 10;
186 if (isHex) { 174 if (isHex) {
187 allowed = isHexDigit; 175 allowed = isHexDigit;
188 radix = 16; 176 radix = 16;
189 } 177 }
190 178
191 var charStack = []; 179 var charStack = [];
192 180
(...skipping 145 matching lines...) Expand 10 before | Expand all | Expand 10 after
338 var token; 326 var token;
339 if (isWhitespace(output)) { 327 if (isWhitespace(output)) {
340 token = new SpaceCharactersToken(output); 328 token = new SpaceCharactersToken(output);
341 } else { 329 } else {
342 token = new CharactersToken(output); 330 token = new CharactersToken(output);
343 } 331 }
344 _addToken(token); 332 _addToken(token);
345 } 333 }
346 } 334 }
347 335
348 /** This method replaces the need for "entityInAttributeValueState". */ 336 /// This method replaces the need for "entityInAttributeValueState".
349 void processEntityInAttribute(String allowedChar) { 337 void processEntityInAttribute(String allowedChar) {
350 consumeEntity(allowedChar: allowedChar, fromAttribute: true); 338 consumeEntity(allowedChar: allowedChar, fromAttribute: true);
351 } 339 }
352 340
353 /** 341 /// This method is a generic handler for emitting the tags. It also sets
354 * This method is a generic handler for emitting the tags. It also sets 342 /// the state to "data" because that's what's needed after a token has been
355 * the state to "data" because that's what's needed after a token has been 343 /// emitted.
356 * emitted.
357 */
358 void emitCurrentToken() { 344 void emitCurrentToken() {
359 var token = currentToken; 345 var token = currentToken;
360 // Add token to the queue to be yielded 346 // Add token to the queue to be yielded
361 if (token is TagToken) { 347 if (token is TagToken) {
362 if (lowercaseElementName) { 348 if (lowercaseElementName) {
363 token.name = asciiUpper2Lower(token.name); 349 token.name = asciiUpper2Lower(token.name);
364 } 350 }
365 if (token is EndTagToken) { 351 if (token is EndTagToken) {
366 if (_attributes != null) { 352 if (_attributes != null) {
367 _addToken(new ParseErrorToken("attributes-in-end-tag")); 353 _addToken(new ParseErrorToken("attributes-in-end-tag"));
(...skipping 1524 matching lines...) Expand 10 before | Expand all | Expand 10 after
1892 } 1878 }
1893 1879
1894 if (data.length > 0) { 1880 if (data.length > 0) {
1895 _addToken(new CharactersToken(data.join())); 1881 _addToken(new CharactersToken(data.join()));
1896 } 1882 }
1897 state = dataState; 1883 state = dataState;
1898 return true; 1884 return true;
1899 } 1885 }
1900 } 1886 }
1901 1887
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698