Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(98)

Side by Side Diff: pkg/csslib/lib/src/tokenizer.dart

Issue 268623002: [html5lib] implement querySelector/querySelectorAll (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « pkg/csslib/lib/src/token.dart ('k') | pkg/csslib/lib/src/tokenizer_base.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of csslib.parser; 5 part of csslib.parser;
6 6
7 class Tokenizer extends TokenizerBase { 7 class Tokenizer extends TokenizerBase {
8 /** U+ prefix for unicode characters. */ 8 /** U+ prefix for unicode characters. */
9 final UNICODE_U = 'U'.codeUnitAt(0); 9 final UNICODE_U = 'U'.codeUnitAt(0);
10 final UNICODE_LOWER_U = 'u'.codeUnitAt(0); 10 final UNICODE_LOWER_U = 'u'.codeUnitAt(0);
(...skipping 23 matching lines...) Expand all
34 case TokenChar.END_OF_FILE: 34 case TokenChar.END_OF_FILE:
35 return _finishToken(TokenKind.END_OF_FILE); 35 return _finishToken(TokenKind.END_OF_FILE);
36 case TokenChar.AT: 36 case TokenChar.AT:
37 int peekCh = _peekChar(); 37 int peekCh = _peekChar();
38 if (TokenizerHelpers.isIdentifierStart(peekCh)) { 38 if (TokenizerHelpers.isIdentifierStart(peekCh)) {
39 var oldIndex = _index; 39 var oldIndex = _index;
40 var oldStartIndex = _startIndex; 40 var oldStartIndex = _startIndex;
41 41
42 _startIndex = _index; 42 _startIndex = _index;
43 ch = _nextChar(); 43 ch = _nextChar();
44 Token ident = this.finishIdentifier(ch); 44 Token ident = finishIdentifier();
45 45
46 // Is it a directive? 46 // Is it a directive?
47 int tokId = TokenKind.matchDirectives(_text, _startIndex, 47 int tokId = TokenKind.matchDirectives(_text, _startIndex,
48 _index - _startIndex); 48 _index - _startIndex);
49 if (tokId == -1) { 49 if (tokId == -1) {
50 // No, is it a margin directive? 50 // No, is it a margin directive?
51 tokId = TokenKind.matchMarginDirectives(_text, _startIndex, 51 tokId = TokenKind.matchMarginDirectives(_text, _startIndex,
52 _index - _startIndex); 52 _index - _startIndex);
53 } 53 }
54 54
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
94 // ]]> 94 // ]]>
95 return next(); 95 return next();
96 } 96 }
97 return _finishToken(TokenKind.RBRACK); 97 return _finishToken(TokenKind.RBRACK);
98 case TokenChar.HASH: 98 case TokenChar.HASH:
99 return _finishToken(TokenKind.HASH); 99 return _finishToken(TokenKind.HASH);
100 case TokenChar.PLUS: 100 case TokenChar.PLUS:
101 if (maybeEatDigit()) return finishNumber(); 101 if (maybeEatDigit()) return finishNumber();
102 return _finishToken(TokenKind.PLUS); 102 return _finishToken(TokenKind.PLUS);
103 case TokenChar.MINUS: 103 case TokenChar.MINUS:
104 if (selectorExpression || unicodeRange) { 104 if (inSelectorExpression || unicodeRange) {
105 // If parsing in pseudo function expression then minus is an operator 105 // If parsing in pseudo function expression then minus is an operator
106 // not part of identifier e.g., interval value range (e.g. U+400-4ff) 106 // not part of identifier e.g., interval value range (e.g. U+400-4ff)
107 // or minus operator in selector expression. 107 // or minus operator in selector expression.
108 return _finishToken(TokenKind.MINUS); 108 return _finishToken(TokenKind.MINUS);
109 } else if (maybeEatDigit()) { 109 } else if (maybeEatDigit()) {
110 return finishNumber(); 110 return finishNumber();
111 } else if (TokenizerHelpers.isIdentifierStart(ch)) { 111 } else if (TokenizerHelpers.isIdentifierStart(ch)) {
112 return this.finishIdentifier(ch); 112 return finishIdentifier();
113 } 113 }
114 return _finishToken(TokenKind.MINUS); 114 return _finishToken(TokenKind.MINUS);
115 case TokenChar.GREATER: 115 case TokenChar.GREATER:
116 return _finishToken(TokenKind.GREATER); 116 return _finishToken(TokenKind.GREATER);
117 case TokenChar.TILDE: 117 case TokenChar.TILDE:
118 if (_maybeEatChar(TokenChar.EQUALS)) { 118 if (_maybeEatChar(TokenChar.EQUALS)) {
119 return _finishToken(TokenKind.INCLUDES); // ~= 119 return _finishToken(TokenKind.INCLUDES); // ~=
120 } 120 }
121 return _finishToken(TokenKind.TILDE); 121 return _finishToken(TokenKind.TILDE);
122 case TokenChar.ASTERISK: 122 case TokenChar.ASTERISK:
123 if (_maybeEatChar(TokenChar.EQUALS)) { 123 if (_maybeEatChar(TokenChar.EQUALS)) {
124 return _finishToken(TokenKind.SUBSTRING_MATCH); // *= 124 return _finishToken(TokenKind.SUBSTRING_MATCH); // *=
125 } 125 }
126 return _finishToken(TokenKind.ASTERISK); 126 return _finishToken(TokenKind.ASTERISK);
127 case TokenChar.AMPERSAND: 127 case TokenChar.AMPERSAND:
128 return _finishToken(TokenKind.AMPERSAND); 128 return _finishToken(TokenKind.AMPERSAND);
129 case TokenChar.NAMESPACE: 129 case TokenChar.NAMESPACE:
130 if (_maybeEatChar(TokenChar.EQUALS)) {
131 return _finishToken(TokenKind.DASH_MATCH); // |=
132 }
130 return _finishToken(TokenKind.NAMESPACE); 133 return _finishToken(TokenKind.NAMESPACE);
131 case TokenChar.COLON: 134 case TokenChar.COLON:
132 return _finishToken(TokenKind.COLON); 135 return _finishToken(TokenKind.COLON);
133 case TokenChar.COMMA: 136 case TokenChar.COMMA:
134 return _finishToken(TokenKind.COMMA); 137 return _finishToken(TokenKind.COMMA);
135 case TokenChar.SEMICOLON: 138 case TokenChar.SEMICOLON:
136 return _finishToken(TokenKind.SEMICOLON); 139 return _finishToken(TokenKind.SEMICOLON);
137 case TokenChar.PERCENT: 140 case TokenChar.PERCENT:
138 return _finishToken(TokenKind.PERCENT); 141 return _finishToken(TokenKind.PERCENT);
139 case TokenChar.SINGLE_QUOTE: 142 case TokenChar.SINGLE_QUOTE:
(...skipping 15 matching lines...) Expand all
155 _maybeEatChar(CDATA_NAME[3]) && 158 _maybeEatChar(CDATA_NAME[3]) &&
156 _maybeEatChar(CDATA_NAME[4]) && 159 _maybeEatChar(CDATA_NAME[4]) &&
157 _maybeEatChar(TokenChar.LBRACK)) { 160 _maybeEatChar(TokenChar.LBRACK)) {
158 // <![CDATA[ 161 // <![CDATA[
159 return next(); 162 return next();
160 } 163 }
161 } 164 }
162 return _finishToken(TokenKind.LESS); 165 return _finishToken(TokenKind.LESS);
163 case TokenChar.EQUALS: 166 case TokenChar.EQUALS:
164 return _finishToken(TokenKind.EQUALS); 167 return _finishToken(TokenKind.EQUALS);
165 case TokenChar.OR:
166 if (_maybeEatChar(TokenChar.EQUALS)) {
167 return _finishToken(TokenKind.DASH_MATCH); // |=
168 }
169 return _finishToken(TokenKind.OR);
170 case TokenChar.CARET: 168 case TokenChar.CARET:
171 if (_maybeEatChar(TokenChar.EQUALS)) { 169 if (_maybeEatChar(TokenChar.EQUALS)) {
172 return _finishToken(TokenKind.PREFIX_MATCH); // ^= 170 return _finishToken(TokenKind.PREFIX_MATCH); // ^=
173 } 171 }
174 return _finishToken(TokenKind.CARET); 172 return _finishToken(TokenKind.CARET);
175 case TokenChar.DOLLAR: 173 case TokenChar.DOLLAR:
176 if (_maybeEatChar(TokenChar.EQUALS)) { 174 if (_maybeEatChar(TokenChar.EQUALS)) {
177 return _finishToken(TokenKind.SUFFIX_MATCH); // $= 175 return _finishToken(TokenKind.SUFFIX_MATCH); // $=
178 } 176 }
179 return _finishToken(TokenKind.DOLLAR); 177 return _finishToken(TokenKind.DOLLAR);
180 case TokenChar.BANG: 178 case TokenChar.BANG:
181 Token tok = finishIdentifier(ch); 179 Token tok = finishIdentifier();
182 return (tok == null) ? _finishToken(TokenKind.BANG) : tok; 180 return (tok == null) ? _finishToken(TokenKind.BANG) : tok;
183 case TokenChar.BACKSLASH:
184 return _finishToken(TokenKind.BACKSLASH);
185 default: 181 default:
182 // TODO(jmesserly): this is used for IE8 detection; I'm not sure it's
183 // appropriate outside of a few specific places; certainly shouldn't
184 // be parsed in selectors.
185 if (!inSelector && ch == TokenChar.BACKSLASH) {
186 return _finishToken(TokenKind.BACKSLASH);
187 }
188
186 if (unicodeRange) { 189 if (unicodeRange) {
187 // Three types of unicode ranges: 190 // Three types of unicode ranges:
188 // - single code point (e.g. U+416) 191 // - single code point (e.g. U+416)
189 // - interval value range (e.g. U+400-4ff) 192 // - interval value range (e.g. U+400-4ff)
190 // - range where trailing ‘?’ characters imply ‘any digit value’ 193 // - range where trailing ‘?’ characters imply ‘any digit value’
191 // (e.g. U+4??) 194 // (e.g. U+4??)
192 if (maybeEatHexDigit()) { 195 if (maybeEatHexDigit()) {
193 var t = finishHexNumber(); 196 var t = finishHexNumber();
194 // Any question marks then it's a HEX_RANGE not HEX_NUMBER. 197 // Any question marks then it's a HEX_RANGE not HEX_NUMBER.
195 if (maybeEatQuestionMark()) finishUnicodeRange(); 198 if (maybeEatQuestionMark()) finishUnicodeRange();
196 return t; 199 return t;
197 } else if (maybeEatQuestionMark()) { 200 } else if (maybeEatQuestionMark()) {
198 // HEX_RANGE U+N??? 201 // HEX_RANGE U+N???
199 return finishUnicodeRange(); 202 return finishUnicodeRange();
200 } else { 203 } else {
201 return _errorToken(); 204 return _errorToken();
202 } 205 }
203 } else if ((ch == UNICODE_U || ch == UNICODE_LOWER_U) && 206 } else if ((ch == UNICODE_U || ch == UNICODE_LOWER_U) &&
204 (_peekChar() == UNICODE_PLUS)) { 207 (_peekChar() == UNICODE_PLUS)) {
205 // Unicode range: U+uNumber[-U+uNumber] 208 // Unicode range: U+uNumber[-U+uNumber]
206 // uNumber = 0..10FFFF 209 // uNumber = 0..10FFFF
207 _nextChar(); // Skip + 210 _nextChar(); // Skip +
208 _startIndex = _index; // Starts at the number 211 _startIndex = _index; // Starts at the number
209 return _finishToken(TokenKind.UNICODE_RANGE); 212 return _finishToken(TokenKind.UNICODE_RANGE);
210 } else if (varDef(ch)) { 213 } else if (varDef(ch)) {
211 return _finishToken(TokenKind.VAR_DEFINITION); 214 return _finishToken(TokenKind.VAR_DEFINITION);
212 } else if (varUsage(ch)) { 215 } else if (varUsage(ch)) {
213 return _finishToken(TokenKind.VAR_USAGE); 216 return _finishToken(TokenKind.VAR_USAGE);
214 } else if (TokenizerHelpers.isIdentifierStart(ch)) { 217 } else if (TokenizerHelpers.isIdentifierStart(ch)) {
215 return finishIdentifier(ch); 218 return finishIdentifier();
216 } else if (TokenizerHelpers.isDigit(ch)) { 219 } else if (TokenizerHelpers.isDigit(ch)) {
217 return finishNumber(); 220 return finishNumber();
218 } 221 }
219 return _errorToken(); 222 return _errorToken();
220 } 223 }
221 } 224 }
222 225
223 bool varDef(int ch) { 226 bool varDef(int ch) {
224 return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) && 227 return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) &&
225 _maybeEatChar('r'.codeUnitAt(0)) && _maybeEatChar('-'.codeUnitAt(0)); 228 _maybeEatChar('r'.codeUnitAt(0)) && _maybeEatChar('-'.codeUnitAt(0));
226 } 229 }
227 230
228 bool varUsage(int ch) { 231 bool varUsage(int ch) {
229 return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) && 232 return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) &&
230 _maybeEatChar('r'.codeUnitAt(0)) && (_peekChar() == '-'.codeUnitAt(0)); 233 _maybeEatChar('r'.codeUnitAt(0)) && (_peekChar() == '-'.codeUnitAt(0));
231 } 234 }
232 235
233 Token _errorToken([String message = null]) { 236 Token _errorToken([String message = null]) {
234 return _finishToken(TokenKind.ERROR); 237 return _finishToken(TokenKind.ERROR);
235 } 238 }
236 239
237 int getIdentifierKind() { 240 int getIdentifierKind() {
238 // Is the identifier a unit type? 241 // Is the identifier a unit type?
239 int tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex); 242 int tokId = -1;
243
244 // Don't match units in selectors or selector expressions.
245 if (!inSelectorExpression && !inSelector) {
246 tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex);
247 }
240 if (tokId == -1) { 248 if (tokId == -1) {
241 tokId = (_text.substring(_startIndex, _index) == '!important') ? 249 tokId = (_text.substring(_startIndex, _index) == '!important') ?
242 TokenKind.IMPORTANT : -1; 250 TokenKind.IMPORTANT : -1;
243 } 251 }
244 252
245 return tokId >= 0 ? tokId : TokenKind.IDENTIFIER; 253 return tokId >= 0 ? tokId : TokenKind.IDENTIFIER;
246 } 254 }
247 255
248 // Need to override so CSS version of isIdentifierPart is used. 256 Token finishIdentifier() {
249 Token finishIdentifier(int ch) { 257 // If we encounter an escape sequence, remember it so we can post-process
258 // to unescape.
259 bool hasEscapedChars = false;
260 var chars = [];
261
262 // backup so we can start with the first character
263 int validateFrom = _index;
264 _index = _startIndex;
250 while (_index < _text.length) { 265 while (_index < _text.length) {
251 // If parsing in pseudo function expression then minus is an operator 266 int ch = _text.codeUnitAt(_index);
252 // not part of identifier. 267
253 var isIdentifier = selectorExpression 268 // If the previous character was "\" we need to escape. See:
254 ? TokenizerHelpers.isIdentifierPartExpr(_text.codeUnitAt(_index)) 269 // http://www.w3.org/TR/CSS21/syndata.html#characters
255 : TokenizerHelpers.isIdentifierPart(_text.codeUnitAt(_index)); 270 // if followed by hexadecimal digits, create the appropriate character.
256 if (!isIdentifier) { 271 // otherwise, include the character in the identifier and don't treat it
257 break; 272 // specially.
273 if (ch == 92/*\*/) {
274 int startHex = ++_index;
275 eatHexDigits(startHex + 6);
276 if (_index != startHex) {
277 // Parse the hex digits and add that character.
278 chars.add(int.parse('0x' + _text.substring(startHex, _index)));
279
280 if (_index == _text.length) break;
281
282 // if we stopped the hex because of a whitespace char, skip it
283 ch = _text.codeUnitAt(_index);
284 if (_index - startHex != 6 &&
285 (ch == TokenChar.SPACE || ch == TokenChar.TAB ||
286 ch == TokenChar.RETURN || ch == TokenChar.NEWLINE)) {
287 _index++;
288 }
289 } else {
290 // not a digit, just add the next character literally
291 if (_index == _text.length) break;
292 chars.add(_text.codeUnitAt(_index++));
293 }
294 } else if (_index < validateFrom || (inSelectorExpression
295 ? TokenizerHelpers.isIdentifierPartExpr(ch)
296 : TokenizerHelpers.isIdentifierPart(ch))) {
297 chars.add(ch);
298 _index++;
258 } else { 299 } else {
259 _index += 1; 300 // Not an identifier or escaped character.
301 break;
260 } 302 }
261 } 303 }
262 304
263 int kind = getIdentifierKind(); 305 var span = _file.span(_startIndex, _index);
264 if (kind == TokenKind.IDENTIFIER) { 306 var text = new String.fromCharCodes(chars);
265 return _finishToken(TokenKind.IDENTIFIER);
266 } else {
267 return _finishToken(kind);
268 }
269 }
270 307
271 Token finishImportant() { 308 return new IdentifierToken(text, getIdentifierKind(), span);
272
273 } 309 }
274 310
275 Token finishNumber() { 311 Token finishNumber() {
276 eatDigits(); 312 eatDigits();
277 313
278 if (_peekChar() == 46/*.*/) { 314 if (_peekChar() == 46/*.*/) {
279 // Handle the case of 1.toString(). 315 // Handle the case of 1.toString().
280 _nextChar(); 316 _nextChar();
281 if (TokenizerHelpers.isDigit(_peekChar())) { 317 if (TokenizerHelpers.isDigit(_peekChar())) {
282 eatDigits(); 318 eatDigits();
283 return _finishToken(TokenKind.DOUBLE); 319 return _finishToken(TokenKind.DOUBLE);
284 } else { 320 } else {
285 _index -= 1; 321 _index -= 1;
286 } 322 }
287 } 323 }
288 324
289 return _finishToken(TokenKind.INTEGER); 325 return _finishToken(TokenKind.INTEGER);
290 } 326 }
291 327
292 bool maybeEatDigit() { 328 bool maybeEatDigit() {
293 if (_index < _text.length 329 if (_index < _text.length
294 && TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) { 330 && TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) {
295 _index += 1; 331 _index += 1;
296 return true; 332 return true;
297 } 333 }
298 return false; 334 return false;
299 } 335 }
300 336
301 Token finishHexNumber() { 337 Token finishHexNumber() {
302 eatHexDigits(); 338 eatHexDigits(_text.length);
303 return _finishToken(TokenKind.HEX_INTEGER); 339 return _finishToken(TokenKind.HEX_INTEGER);
304 } 340 }
305 341
306 void eatHexDigits() { 342 void eatHexDigits(int end) {
307 while (_index < _text.length) { 343 end = math.min(end, _text.length);
344 while (_index < end) {
308 if (TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) { 345 if (TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {
309 _index += 1; 346 _index += 1;
310 } else { 347 } else {
311 return; 348 return;
312 } 349 }
313 } 350 }
314 } 351 }
315 352
316 bool maybeEatHexDigit() { 353 bool maybeEatHexDigit() {
317 if (_index < _text.length 354 if (_index < _text.length
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after
392 || (c >= 65/*A*/ && c <= 70/*F*/)); 429 || (c >= 65/*A*/ && c <= 70/*F*/));
393 } 430 }
394 431
395 static bool isIdentifierPart(int c) { 432 static bool isIdentifierPart(int c) {
396 return isIdentifierPartExpr(c) || c == 45 /*-*/; 433 return isIdentifierPartExpr(c) || c == 45 /*-*/;
397 } 434 }
398 435
399 /** Pseudo function expressions identifiers can't have a minus sign. */ 436 /** Pseudo function expressions identifiers can't have a minus sign. */
400 static bool isIdentifierStartExpr(int c) { 437 static bool isIdentifierStartExpr(int c) {
401 return ((c >= 97/*a*/ && c <= 122/*z*/) || (c >= 65/*A*/ && c <= 90/*Z*/) || 438 return ((c >= 97/*a*/ && c <= 122/*z*/) || (c >= 65/*A*/ && c <= 90/*Z*/) ||
402 c == 95/*_*/); 439 // Note: Unicode 10646 chars U+00A0 or higher are allowed, see:
440 // http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier
441 // http://www.w3.org/TR/CSS21/syndata.html#characters
442 // Also, escaped character should be allowed.
443 c == 95/*_*/ || c >= 0xA0 || c == 92/*\*/);
403 } 444 }
404 445
405 /** Pseudo function expressions identifiers can't have a minus sign. */ 446 /** Pseudo function expressions identifiers can't have a minus sign. */
406 static bool isIdentifierPartExpr(int c) { 447 static bool isIdentifierPartExpr(int c) {
407 return (isIdentifierStartExpr(c) || isDigit(c)); 448 return (isIdentifierStartExpr(c) || isDigit(c));
408 } 449 }
409 } 450 }
OLDNEW
« no previous file with comments | « pkg/csslib/lib/src/token.dart ('k') | pkg/csslib/lib/src/tokenizer_base.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698