// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
4 | |
5 part of csslib.parser; | |
6 | |
/**
 * Tokenizer for CSS source text.
 *
 * The scanning state ([_text], [_index], [_startIndex], [_file], [_inString],
 * [inSelector], [inSelectorExpression]) and the low-level helpers used below
 * ([_nextChar], [_peekChar], [_maybeEatChar], [_finishToken],
 * [finishWhitespace], [eatDigits]) are not defined in this class; presumably
 * they are inherited from [TokenizerBase].
 */
class Tokenizer extends TokenizerBase {
  /** U+ prefix for unicode characters. */
  final UNICODE_U = 'U'.codeUnitAt(0);
  final UNICODE_LOWER_U = 'u'.codeUnitAt(0);
  final UNICODE_PLUS = '+'.codeUnitAt(0);

  /** Wildcard character used in unicode ranges (e.g. U+4??). */
  final QUESTION_MARK = '?'.codeUnitAt(0);

  /** CDATA keyword. */
  final List CDATA_NAME = 'CDATA'.codeUnits;

  /** Forwards all arguments unchanged to the [TokenizerBase] constructor. */
  Tokenizer(SourceFile file, String text, bool skipWhitespace, [int index = 0])
      : super(file, text, skipWhitespace, index);

  /**
   * Scans and returns the next token, starting at the current [_index].
   *
   * When [unicodeRange] is true the caller is in the middle of a `U+...`
   * unicode range: '-' is always returned as a MINUS token, and any character
   * that reaches the default branch is scanned as a hex number or hex range
   * instead of an identifier or decimal number.
   *
   * The `]]>` and `<![CDATA[` markers are skipped by recursing into [next];
   * note that those recursive calls do not forward [unicodeRange].
   */
  Token next({unicodeRange: false}) {
    // keep track of our starting position
    _startIndex = _index;

    int ch;
    ch = _nextChar();
    switch (ch) {
      case TokenChar.NEWLINE:
      case TokenChar.RETURN:
      case TokenChar.SPACE:
      case TokenChar.TAB:
        return finishWhitespace();
      case TokenChar.END_OF_FILE:
        return _finishToken(TokenKind.END_OF_FILE);
      case TokenChar.AT:
        int peekCh = _peekChar();
        if (TokenizerHelpers.isIdentifierStart(peekCh)) {
          // Remember where we were so the scan can be undone if the name
          // after '@' turns out not to be a known directive.
          var oldIndex = _index;
          var oldStartIndex = _startIndex;

          // Scan the name that follows the '@' (the '@' itself is excluded).
          _startIndex = _index;
          ch = _nextChar();
          finishIdentifier();

          // Is it a directive?
          int tokId = TokenKind.matchDirectives(
              _text, _startIndex, _index - _startIndex);
          if (tokId == -1) {
            // No, is it a margin directive?
            tokId = TokenKind.matchMarginDirectives(
                _text, _startIndex, _index - _startIndex);
          }

          if (tokId != -1) {
            return _finishToken(tokId);
          } else {
            // Didn't find a CSS directive or margin directive so the @name is
            // probably the Less definition '@name: value_variable_definition'.
            // Rewind so that only the '@' is consumed by this token.
            _startIndex = oldStartIndex;
            _index = oldIndex;
          }
        }
        return _finishToken(TokenKind.AT);
      case TokenChar.DOT:
        int start = _startIndex; // Start where the dot started.
        if (maybeEatDigit()) {
          // looks like a number dot followed by digit(s).
          Token number = finishNumber();
          if (number.kind == TokenKind.INTEGER) {
            // It's a number but it's preceded by a dot, so make it a double.
            _startIndex = start;
            return _finishToken(TokenKind.DOUBLE);
          } else {
            // The digits after the dot themselves scanned as a double
            // (e.g. '.1.2'); a number with two dots is not allowed.
            return _errorToken();
          }
        }
        // It's really a dot.
        return _finishToken(TokenKind.DOT);
      case TokenChar.LPAREN:
        return _finishToken(TokenKind.LPAREN);
      case TokenChar.RPAREN:
        return _finishToken(TokenKind.RPAREN);
      case TokenChar.LBRACE:
        return _finishToken(TokenKind.LBRACE);
      case TokenChar.RBRACE:
        return _finishToken(TokenKind.RBRACE);
      case TokenChar.LBRACK:
        return _finishToken(TokenKind.LBRACK);
      case TokenChar.RBRACK:
        if (_maybeEatChar(TokenChar.RBRACK) &&
            _maybeEatChar(TokenChar.GREATER)) {
          // ]]> : end of a CDATA section, skip it and return what follows.
          return next();
        }
        return _finishToken(TokenKind.RBRACK);
      case TokenChar.HASH:
        return _finishToken(TokenKind.HASH);
      case TokenChar.PLUS:
        if (maybeEatDigit()) return finishNumber();
        return _finishToken(TokenKind.PLUS);
      case TokenChar.MINUS:
        if (inSelectorExpression || unicodeRange) {
          // If parsing in pseudo function expression then minus is an operator
          // not part of identifier e.g., interval value range (e.g. U+400-4ff)
          // or minus operator in selector expression.
          return _finishToken(TokenKind.MINUS);
        } else if (maybeEatDigit()) {
          return finishNumber();
        } else if (TokenizerHelpers.isIdentifierStart(ch)) {
          // NOTE(review): `ch` is the '-' that selected this case, and
          // isIdentifierStart accepts 45 ('-'), so, assuming TokenChar.MINUS
          // is 45, this test is always true and the final return below looks
          // unreachable. Possibly the next character was meant to be tested;
          // confirm against the parser before changing.
          return finishIdentifier();
        }
        return _finishToken(TokenKind.MINUS);
      case TokenChar.GREATER:
        return _finishToken(TokenKind.GREATER);
      case TokenChar.TILDE:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.INCLUDES); // ~=
        }
        return _finishToken(TokenKind.TILDE);
      case TokenChar.ASTERISK:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.SUBSTRING_MATCH); // *=
        }
        return _finishToken(TokenKind.ASTERISK);
      case TokenChar.AMPERSAND:
        return _finishToken(TokenKind.AMPERSAND);
      case TokenChar.NAMESPACE:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.DASH_MATCH); // |=
        }
        return _finishToken(TokenKind.NAMESPACE);
      case TokenChar.COLON:
        return _finishToken(TokenKind.COLON);
      case TokenChar.COMMA:
        return _finishToken(TokenKind.COMMA);
      case TokenChar.SEMICOLON:
        return _finishToken(TokenKind.SEMICOLON);
      case TokenChar.PERCENT:
        return _finishToken(TokenKind.PERCENT);
      case TokenChar.SINGLE_QUOTE:
        return _finishToken(TokenKind.SINGLE_QUOTE);
      case TokenChar.DOUBLE_QUOTE:
        return _finishToken(TokenKind.DOUBLE_QUOTE);
      case TokenChar.SLASH:
        if (_maybeEatChar(TokenChar.ASTERISK)) return finishMultiLineComment();
        return _finishToken(TokenKind.SLASH);
      case TokenChar.LESS: // <!--
        if (_maybeEatChar(TokenChar.BANG)) {
          if (_maybeEatChar(TokenChar.MINUS) &&
              _maybeEatChar(TokenChar.MINUS)) {
            return finishMultiLineComment();
          } else if (_maybeEatChar(TokenChar.LBRACK) &&
              _maybeEatChar(CDATA_NAME[0]) &&
              _maybeEatChar(CDATA_NAME[1]) &&
              _maybeEatChar(CDATA_NAME[2]) &&
              _maybeEatChar(CDATA_NAME[3]) &&
              _maybeEatChar(CDATA_NAME[4]) &&
              _maybeEatChar(TokenChar.LBRACK)) {
            // <![CDATA[ : start of a CDATA section, skip it and return what
            // follows.
            return next();
          }
        }
        return _finishToken(TokenKind.LESS);
      case TokenChar.EQUALS:
        return _finishToken(TokenKind.EQUALS);
      case TokenChar.CARET:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.PREFIX_MATCH); // ^=
        }
        return _finishToken(TokenKind.CARET);
      case TokenChar.DOLLAR:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.SUFFIX_MATCH); // $=
        }
        return _finishToken(TokenKind.DOLLAR);
      case TokenChar.BANG:
        // Scan '!' plus whatever identifier characters follow as one token
        // (e.g. '!important', see getIdentifierKind).
        Token tok = finishIdentifier();
        return (tok == null) ? _finishToken(TokenKind.BANG) : tok;
      default:
        // TODO(jmesserly): this is used for IE8 detection; I'm not sure it's
        // appropriate outside of a few specific places; certainly shouldn't
        // be parsed in selectors.
        if (!inSelector && ch == TokenChar.BACKSLASH) {
          return _finishToken(TokenKind.BACKSLASH);
        }

        if (unicodeRange) {
          // Three types of unicode ranges:
          // - single code point (e.g. U+416)
          // - interval value range (e.g. U+400-4ff)
          // - range where trailing '?' characters imply 'any digit value'
          //   (e.g. U+4??)
          //
          // NOTE(review): `ch` (the first character, already consumed) is not
          // itself checked here; only the characters after it are. A single
          // hex digit with nothing hex or '?' after it therefore ends up in
          // the error branch below - confirm whether that is intended.
          if (maybeEatHexDigit()) {
            var t = finishHexNumber();
            // Any question marks then it's a HEX_RANGE not HEX_NUMBER. Use the
            // HEX_RANGE token so the '?' wildcards are part of the token
            // rather than being consumed and silently dropped.
            if (maybeEatQuestionMark()) t = finishUnicodeRange();
            return t;
          } else if (maybeEatQuestionMark()) {
            // HEX_RANGE U+N???
            return finishUnicodeRange();
          } else {
            return _errorToken();
          }
        } else if ((ch == UNICODE_U || ch == UNICODE_LOWER_U) &&
            (_peekChar() == UNICODE_PLUS)) {
          // Unicode range: U+uNumber[-U+uNumber]
          // uNumber = 0..10FFFF
          _nextChar(); // Skip +
          _startIndex = _index; // Starts at the number
          return _finishToken(TokenKind.UNICODE_RANGE);
        } else if (varDef(ch)) {
          return _finishToken(TokenKind.VAR_DEFINITION);
        } else if (varUsage(ch)) {
          return _finishToken(TokenKind.VAR_USAGE);
        } else if (TokenizerHelpers.isIdentifierStart(ch)) {
          return finishIdentifier();
        } else if (TokenizerHelpers.isDigit(ch)) {
          return finishNumber();
        }
        return _errorToken();
    }
  }

  /**
   * Returns true if [ch] is 'v' and the text that follows is 'ar-'
   * (i.e. the input starts with 'var-').
   *
   * NOTE(review): characters matched by [_maybeEatChar] are not restored when
   * a later character fails to match (assuming [_maybeEatChar] advances on a
   * match, as its use throughout this class suggests).
   */
  bool varDef(int ch) {
    return ch == 'v'.codeUnitAt(0) &&
        _maybeEatChar('a'.codeUnitAt(0)) &&
        _maybeEatChar('r'.codeUnitAt(0)) &&
        _maybeEatChar('-'.codeUnitAt(0));
  }

  /**
   * Returns true if [ch] is 'v', the text that follows is 'ar', and the
   * character after that is '-'; unlike [varDef] the '-' is only peeked.
   *
   * NOTE(review): in [next] this is only called after [varDef] has failed, and
   * [varDef] may already have consumed part of 'ar'; verify the combined
   * behavior is what the parser expects.
   */
  bool varUsage(int ch) {
    return ch == 'v'.codeUnitAt(0) &&
        _maybeEatChar('a'.codeUnitAt(0)) &&
        _maybeEatChar('r'.codeUnitAt(0)) &&
        (_peekChar() == '-'.codeUnitAt(0));
  }

  /**
   * Finishes the current token as a [TokenKind.ERROR] token.
   *
   * The optional [message] is accepted but currently not used.
   */
  Token _errorToken([String message = null]) {
    return _finishToken(TokenKind.ERROR);
  }

  /**
   * Classifies the text between [_startIndex] and [_index].
   *
   * Returns the unit token kind reported by [TokenKind.matchUnits] (only
   * tried outside selectors and selector expressions),
   * [TokenKind.IMPORTANT] for the exact text '!important', and
   * [TokenKind.IDENTIFIER] otherwise.
   */
  int getIdentifierKind() {
    // Is the identifier a unit type?
    int tokId = -1;

    // Don't match units in selectors or selector expressions.
    if (!inSelectorExpression && !inSelector) {
      tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex);
    }
    if (tokId == -1) {
      tokId = (_text.substring(_startIndex, _index) == '!important')
          ? TokenKind.IMPORTANT
          : -1;
    }

    return tokId >= 0 ? tokId : TokenKind.IDENTIFIER;
  }

  /**
   * Scans an identifier beginning at [_startIndex] and returns it as an
   * [IdentifierToken].
   *
   * The scan restarts from [_startIndex]; characters the caller had already
   * consumed are accepted without validation. While [_inString] is set,
   * backslash escapes are decoded, so the token text may differ from the raw
   * source text covered by the token's span.
   */
  Token finishIdentifier() {
    // If we encounter an escape sequence, remember it so we can post-process
    // to unescape.
    var chars = [];

    // backup so we can start with the first character
    int validateFrom = _index;
    _index = _startIndex;
    while (_index < _text.length) {
      int ch = _text.codeUnitAt(_index);

      // If the current character is "\" we need to unescape it, see
      // http://www.w3.org/TR/CSS21/syndata.html#characters
      // if followed by hexadecimal digits, create the appropriate character.
      // otherwise, include the character in the identifier and don't treat it
      // specially.
      if (ch == 92 /*\*/ && _inString) {
        int startHex = ++_index;
        // An escape is made of at most 6 hex digits.
        eatHexDigits(startHex + 6);
        if (_index != startHex) {
          // Parse the hex digits and add that character.
          chars.add(int.parse('0x' + _text.substring(startHex, _index)));

          if (_index == _text.length) break;

          // if we stopped the hex because of a whitespace char, skip it
          ch = _text.codeUnitAt(_index);
          if (_index - startHex != 6 &&
              (ch == TokenChar.SPACE ||
                  ch == TokenChar.TAB ||
                  ch == TokenChar.RETURN ||
                  ch == TokenChar.NEWLINE)) {
            _index++;
          }
        } else {
          // not a digit, just add the next character literally
          if (_index == _text.length) break;
          chars.add(_text.codeUnitAt(_index++));
        }
      } else if (_index < validateFrom ||
          (inSelectorExpression
              ? TokenizerHelpers.isIdentifierPartExpr(ch)
              : TokenizerHelpers.isIdentifierPart(ch))) {
        // Either a character the caller already accepted, or a valid
        // identifier character ('-' is not allowed in selector expressions).
        chars.add(ch);
        _index++;
      } else {
        // Not an identifier or escaped character.
        break;
      }
    }

    var span = _file.span(_startIndex, _index);
    var text = new String.fromCharCodes(chars);

    return new IdentifierToken(text, getIdentifierKind(), span);
  }

  /**
   * Scans the rest of a number whose first digit has already been consumed.
   *
   * Returns a [TokenKind.DOUBLE] token when the digits are followed by '.'
   * and at least one more digit, otherwise a [TokenKind.INTEGER] token.
   */
  Token finishNumber() {
    eatDigits();

    if (_peekChar() == 46 /*.*/) {
      // Tentatively consume the '.'; it only belongs to the number when a
      // digit follows it.
      _nextChar();
      if (TokenizerHelpers.isDigit(_peekChar())) {
        eatDigits();
        return _finishToken(TokenKind.DOUBLE);
      } else {
        // Not a fraction: back up over the '.'.
        _index -= 1;
      }
    }

    return _finishToken(TokenKind.INTEGER);
  }

  /** Consumes one decimal digit if present; returns whether it did. */
  bool maybeEatDigit() {
    if (_index < _text.length &&
        TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) {
      _index += 1;
      return true;
    }
    return false;
  }

  /**
   * Consumes all following hex digits and finishes the current token as
   * [TokenKind.HEX_INTEGER].
   */
  Token finishHexNumber() {
    eatHexDigits(_text.length);
    return _finishToken(TokenKind.HEX_INTEGER);
  }

  /**
   * Consumes consecutive hex digits, stopping at the first non-hex character
   * or at index [end] (clamped to the length of the text), whichever comes
   * first.
   */
  void eatHexDigits(int end) {
    end = math.min(end, _text.length);
    while (_index < end) {
      if (TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {
        _index += 1;
      } else {
        return;
      }
    }
  }

  /** Consumes one hex digit if present; returns whether it did. */
  bool maybeEatHexDigit() {
    if (_index < _text.length &&
        TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Consumes one '?' if present; returns whether it did. */
  bool maybeEatQuestionMark() {
    if (_index < _text.length && _text.codeUnitAt(_index) == QUESTION_MARK) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Consumes all consecutive '?' characters at the current position. */
  void eatQuestionMarks() {
    while (_index < _text.length) {
      if (_text.codeUnitAt(_index) == QUESTION_MARK) {
        _index += 1;
      } else {
        return;
      }
    }
  }

  /**
   * Consumes any remaining '?' wildcards and finishes the current token as
   * [TokenKind.HEX_RANGE].
   */
  Token finishUnicodeRange() {
    eatQuestionMarks();
    return _finishToken(TokenKind.HEX_RANGE);
  }

  /**
   * Scans to the end of a comment whose opening delimiter has already been
   * consumed.
   *
   * A `*` followed by `/` ends the scan as [TokenKind.COMMENT]; `-->` ends it
   * as [TokenKind.HTML_COMMENT]. While [_inString] is set the comment is
   * skipped and the token after it is returned instead. If a character value
   * of 0 is read before a terminator (presumably end of input), the result is
   * [TokenKind.INCOMPLETE_COMMENT].
   *
   * NOTE(review): either terminator is accepted regardless of which opening
   * delimiter started the comment; confirm that is intended.
   */
  Token finishMultiLineComment() {
    while (true) {
      int ch = _nextChar();
      if (ch == 0) {
        return _finishToken(TokenKind.INCOMPLETE_COMMENT);
      } else if (ch == 42 /*'*'*/) {
        if (_maybeEatChar(47 /*'/'*/)) {
          if (_inString) {
            return next();
          } else {
            return _finishToken(TokenKind.COMMENT);
          }
        }
      } else if (ch == TokenChar.MINUS) {
        /* Check if close part of Comment Definition --> (CDC). */
        if (_maybeEatChar(TokenChar.MINUS)) {
          if (_maybeEatChar(TokenChar.GREATER)) {
            if (_inString) {
              return next();
            } else {
              return _finishToken(TokenKind.HTML_COMMENT);
            }
          }
        }
      }
    }
    // Not reached: the loop above only exits through a return.
    return _errorToken();
  }
}
420 | |
/** Character-classification predicates used by the tokenizer. */
class TokenizerHelpers {
  /** True when [c] lies in the inclusive range [low]..[high]. */
  static bool _inRange(int c, int low, int high) => c >= low && c <= high;

  /** True if [c] is '-' or is accepted by [isIdentifierStartExpr]. */
  static bool isIdentifierStart(int c) =>
      isIdentifierStartExpr(c) || c == 45 /*-*/;

  /** True if [c] is an ASCII decimal digit, '0' through '9'. */
  static bool isDigit(int c) => _inRange(c, 48 /*0*/, 57 /*9*/);

  /** True if [c] is an ASCII hex digit: 0-9, a-f or A-F. */
  static bool isHexDigit(int c) =>
      isDigit(c) ||
      _inRange(c, 97 /*a*/, 102 /*f*/) ||
      _inRange(c, 65 /*A*/, 70 /*F*/);

  /** True if [c] is '-' or is accepted by [isIdentifierPartExpr]. */
  static bool isIdentifierPart(int c) =>
      isIdentifierPartExpr(c) || c == 45 /*-*/;

  /**
   * Identifier start test for pseudo function expressions, where identifiers
   * can't have a minus sign.
   *
   * Accepts ASCII letters, '_', '\' (so an escaped character can begin an
   * identifier) and any code unit at or above U+00A0, see:
   * http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier
   * http://www.w3.org/TR/CSS21/syndata.html#characters
   */
  static bool isIdentifierStartExpr(int c) {
    if (_inRange(c, 97 /*a*/, 122 /*z*/)) return true;
    if (_inRange(c, 65 /*A*/, 90 /*Z*/)) return true;
    return c == 95 /*_*/ || c >= 0xA0 || c == 92 /*\*/;
  }

  /**
   * Identifier part test for pseudo function expressions, where identifiers
   * can't have a minus sign: anything [isIdentifierStartExpr] accepts, plus
   * decimal digits.
   */
  static bool isIdentifierPartExpr(int c) =>
      isIdentifierStartExpr(c) || isDigit(c);
}
OLD | NEW |