OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. |
| 4 |
| 5 part of csslib.parser; |
| 6 |
/**
 * CSS tokenizer: each call to [next] scans one token starting at the current
 * position in the source text.
 *
 * The scanning state (`_text`, `_index`, `_startIndex`, `_file`, `_inString`,
 * `inSelector`, `inSelectorExpression`) and the low-level helpers
 * (`_nextChar`, `_peekChar`, `_maybeEatChar`, `_finishToken`,
 * `finishWhitespace`, `eatDigits`) are not defined in this class; they are
 * presumably inherited from [TokenizerBase].
 */
class Tokenizer extends TokenizerBase {
  /** U+ prefix for unicode characters. */
  final UNICODE_U = 'U'.codeUnitAt(0);
  final UNICODE_LOWER_U = 'u'.codeUnitAt(0);
  final UNICODE_PLUS = '+'.codeUnitAt(0);

  final QUESTION_MARK = '?'.codeUnitAt(0);

  /** CDATA keyword. */
  final List CDATA_NAME = 'CDATA'.codeUnits;

  /** Largest valid Unicode code point. */
  static const int _MAX_CODE_POINT = 0x10FFFF;

  /** U+FFFD, substituted for escapes that are not valid code points. */
  static const int _REPLACEMENT_CHAR = 0xFFFD;

  Tokenizer(SourceFile file, String text, bool skipWhitespace, [int index = 0])
      : super(file, text, skipWhitespace, index);

  /**
   * Scans and returns the next token.
   *
   * When [unicodeRange] is true the input is scanned as the number part of a
   * unicode-range (hex digits, '?' wildcards and '-' as a separator) instead
   * of as ordinary CSS.
   */
  Token next({unicodeRange: false}) {
    // keep track of our starting position
    _startIndex = _index;

    int ch = _nextChar();
    switch (ch) {
      case TokenChar.NEWLINE:
      case TokenChar.RETURN:
      case TokenChar.SPACE:
      case TokenChar.TAB:
        return finishWhitespace();
      case TokenChar.END_OF_FILE:
        return _finishToken(TokenKind.END_OF_FILE);
      case TokenChar.AT:
        int peekCh = _peekChar();
        if (TokenizerHelpers.isIdentifierStart(peekCh)) {
          var oldIndex = _index;
          var oldStartIndex = _startIndex;

          // Scan the name after the '@' as an identifier. The first character
          // must be consumed here because finishIdentifier() accepts
          // everything before the current index without validating it.
          _startIndex = _index;
          _nextChar();
          finishIdentifier();

          // Is it a directive?
          int tokId = TokenKind.matchDirectives(
              _text, _startIndex, _index - _startIndex);
          if (tokId == -1) {
            // No, is it a margin directive?
            tokId = TokenKind.matchMarginDirectives(
                _text, _startIndex, _index - _startIndex);
          }

          if (tokId != -1) {
            return _finishToken(tokId);
          } else {
            // Didn't find a CSS directive or margin directive so the @name is
            // probably the Less definition '@name: value_variable_definition'.
            _startIndex = oldStartIndex;
            _index = oldIndex;
          }
        }
        return _finishToken(TokenKind.AT);
      case TokenChar.DOT:
        int start = _startIndex; // Start where the dot started.
        if (maybeEatDigit()) {
          // looks like a number dot followed by digit(s).
          Token number = finishNumber();
          if (number.kind == TokenKind.INTEGER) {
            // It's a number but it's preceded by a dot, so make it a double.
            _startIndex = start;
            return _finishToken(TokenKind.DOUBLE);
          } else {
            // Don't allow dot followed by a double (e.g, '..1').
            return _errorToken();
          }
        }
        // It's really a dot.
        return _finishToken(TokenKind.DOT);
      case TokenChar.LPAREN:
        return _finishToken(TokenKind.LPAREN);
      case TokenChar.RPAREN:
        return _finishToken(TokenKind.RPAREN);
      case TokenChar.LBRACE:
        return _finishToken(TokenKind.LBRACE);
      case TokenChar.RBRACE:
        return _finishToken(TokenKind.RBRACE);
      case TokenChar.LBRACK:
        return _finishToken(TokenKind.LBRACK);
      case TokenChar.RBRACK:
        int afterBracket = _index;
        if (_maybeEatChar(TokenChar.RBRACK) &&
            _maybeEatChar(TokenChar.GREATER)) {
          // ]]>
          return next();
        }
        // Not a CDATA close: give back anything consumed while probing so a
        // second ']' is not swallowed into this token.
        _index = afterBracket;
        return _finishToken(TokenKind.RBRACK);
      case TokenChar.HASH:
        return _finishToken(TokenKind.HASH);
      case TokenChar.PLUS:
        if (maybeEatDigit()) return finishNumber();
        return _finishToken(TokenKind.PLUS);
      case TokenChar.MINUS:
        if (inSelectorExpression || unicodeRange) {
          // If parsing in pseudo function expression then minus is an operator
          // not part of identifier e.g., interval value range (e.g. U+400-4ff)
          // or minus operator in selector expression.
          return _finishToken(TokenKind.MINUS);
        } else if (maybeEatDigit()) {
          return finishNumber();
        } else if (TokenizerHelpers.isIdentifierStart(ch)) {
          // NOTE(review): ch is the '-' that was just read and
          // isIdentifierStart() accepts code 45; if TokenChar.MINUS is 45 this
          // branch is always taken and the return below is unreachable.
          // Confirm whether _peekChar() was intended before changing it.
          return finishIdentifier();
        }
        return _finishToken(TokenKind.MINUS);
      case TokenChar.GREATER:
        return _finishToken(TokenKind.GREATER);
      case TokenChar.TILDE:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.INCLUDES); // ~=
        }
        return _finishToken(TokenKind.TILDE);
      case TokenChar.ASTERISK:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.SUBSTRING_MATCH); // *=
        }
        return _finishToken(TokenKind.ASTERISK);
      case TokenChar.AMPERSAND:
        return _finishToken(TokenKind.AMPERSAND);
      case TokenChar.NAMESPACE:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.DASH_MATCH); // |=
        }
        return _finishToken(TokenKind.NAMESPACE);
      case TokenChar.COLON:
        return _finishToken(TokenKind.COLON);
      case TokenChar.COMMA:
        return _finishToken(TokenKind.COMMA);
      case TokenChar.SEMICOLON:
        return _finishToken(TokenKind.SEMICOLON);
      case TokenChar.PERCENT:
        return _finishToken(TokenKind.PERCENT);
      case TokenChar.SINGLE_QUOTE:
        return _finishToken(TokenKind.SINGLE_QUOTE);
      case TokenChar.DOUBLE_QUOTE:
        return _finishToken(TokenKind.DOUBLE_QUOTE);
      case TokenChar.SLASH:
        if (_maybeEatChar(TokenChar.ASTERISK)) return finishMultiLineComment();
        return _finishToken(TokenKind.SLASH);
      case TokenChar.LESS: // <!--
        int afterLess = _index;
        if (_maybeEatChar(TokenChar.BANG)) {
          int afterBang = _index;
          if (_maybeEatChar(TokenChar.MINUS) &&
              _maybeEatChar(TokenChar.MINUS)) {
            return finishMultiLineComment();
          }
          // A lone '-' may have been consumed; retry from just after '<!'.
          _index = afterBang;
          if (_maybeEatChar(TokenChar.LBRACK) &&
              _maybeEatChar(CDATA_NAME[0]) &&
              _maybeEatChar(CDATA_NAME[1]) &&
              _maybeEatChar(CDATA_NAME[2]) &&
              _maybeEatChar(CDATA_NAME[3]) &&
              _maybeEatChar(CDATA_NAME[4]) &&
              _maybeEatChar(TokenChar.LBRACK)) {
            // <![CDATA[
            return next();
          }
        }
        // Neither '<!--' nor '<![CDATA[': give back everything consumed while
        // probing so the token is just the '<'.
        _index = afterLess;
        return _finishToken(TokenKind.LESS);
      case TokenChar.EQUALS:
        return _finishToken(TokenKind.EQUALS);
      case TokenChar.CARET:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.PREFIX_MATCH); // ^=
        }
        return _finishToken(TokenKind.CARET);
      case TokenChar.DOLLAR:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.SUFFIX_MATCH); // $=
        }
        return _finishToken(TokenKind.DOLLAR);
      case TokenChar.BANG:
        // finishIdentifier() as defined in this class never returns null, so
        // the BANG fallback only matters if a subclass overrides it.
        Token tok = finishIdentifier();
        return (tok == null) ? _finishToken(TokenKind.BANG) : tok;
      default:
        // TODO(jmesserly): this is used for IE8 detection; I'm not sure it's
        // appropriate outside of a few specific places; certainly shouldn't
        // be parsed in selectors.
        if (!inSelector && ch == TokenChar.BACKSLASH) {
          return _finishToken(TokenKind.BACKSLASH);
        }

        if (unicodeRange) {
          // Three types of unicode ranges:
          // - single code point (e.g. U+416)
          // - interval value range (e.g. U+400-4ff)
          // - range where trailing '?' characters imply 'any digit value'
          //   (e.g. U+4??)
          if (maybeEatHexDigit()) {
            var t = finishHexNumber();
            // Any question marks then it's a HEX_RANGE not HEX_NUMBER; return
            // the range token so its kind and span include the '?'s.
            if (maybeEatQuestionMark()) t = finishUnicodeRange();
            return t;
          } else if (maybeEatQuestionMark()) {
            // HEX_RANGE U+N???
            return finishUnicodeRange();
          } else {
            return _errorToken();
          }
        } else if ((ch == UNICODE_U || ch == UNICODE_LOWER_U) &&
            (_peekChar() == UNICODE_PLUS)) {
          // Unicode range: U+uNumber[-U+uNumber]
          // uNumber = 0..10FFFF
          _nextChar(); // Skip +
          _startIndex = _index; // Starts at the number
          return _finishToken(TokenKind.UNICODE_RANGE);
        } else if (varDef(ch)) {
          return _finishToken(TokenKind.VAR_DEFINITION);
        } else if (varUsage(ch)) {
          // NOTE(review): varDef() above has already consumed any "ar" that
          // followed the 'v', so this looks hard to reach for "var" — verify.
          return _finishToken(TokenKind.VAR_USAGE);
        } else if (TokenizerHelpers.isIdentifierStart(ch)) {
          return finishIdentifier();
        } else if (TokenizerHelpers.isDigit(ch)) {
          return finishNumber();
        }
        return _errorToken();
    }
  }

  /**
   * Returns true when [ch] is 'v' and the following characters are "ar-".
   * Characters matched before a mismatch stay consumed.
   */
  bool varDef(int ch) {
    return ch == 'v'.codeUnitAt(0) &&
        _maybeEatChar('a'.codeUnitAt(0)) &&
        _maybeEatChar('r'.codeUnitAt(0)) &&
        _maybeEatChar('-'.codeUnitAt(0));
  }

  /**
   * Returns true when [ch] is 'v', the following characters are "ar" and the
   * character after that is '-' (which is peeked at, not consumed).
   */
  bool varUsage(int ch) {
    return ch == 'v'.codeUnitAt(0) &&
        _maybeEatChar('a'.codeUnitAt(0)) &&
        _maybeEatChar('r'.codeUnitAt(0)) &&
        (_peekChar() == '-'.codeUnitAt(0));
  }

  /** Finishes the current token as an ERROR token; [message] is not used. */
  Token _errorToken([String message = null]) {
    return _finishToken(TokenKind.ERROR);
  }

  /**
   * Classifies the text between `_startIndex` and `_index`: a unit kind (only
   * outside selectors and selector expressions), IMPORTANT for the exact text
   * '!important', otherwise IDENTIFIER.
   */
  int getIdentifierKind() {
    // Is the identifier a unit type?
    int tokId = -1;

    // Don't match units in selectors or selector expressions.
    if (!inSelectorExpression && !inSelector) {
      tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex);
    }
    if (tokId == -1) {
      tokId = (_text.substring(_startIndex, _index) == '!important')
          ? TokenKind.IMPORTANT
          : -1;
    }

    return tokId >= 0 ? tokId : TokenKind.IDENTIFIER;
  }

  /**
   * Scans an identifier beginning at `_startIndex` and returns it as an
   * [IdentifierToken] whose text has backslash escapes resolved.
   *
   * Characters already consumed by the caller (those before the index on
   * entry) are accepted without validation. Escapes are only interpreted
   * while `_inString` is set.
   */
  Token finishIdentifier() {
    // If we encounter an escape sequence, remember it so we can post-process
    // to unescape.
    var chars = [];

    // backup so we can start with the first character
    int validateFrom = _index;
    _index = _startIndex;
    while (_index < _text.length) {
      int ch = _text.codeUnitAt(_index);

      // If the previous character was "\" we need to escape.
      // http://www.w3.org/TR/CSS21/syndata.html#characters
      // if followed by hexadecimal digits, create the appropriate character.
      // otherwise, include the character in the identifier and don't treat it
      // specially.
      if (ch == 92 /*\*/ && _inString) {
        int startHex = ++_index;
        eatHexDigits(startHex + 6);
        if (_index != startHex) {
          // Parse the hex digits and add that character. Up to six digits are
          // read, so the value can exceed the largest code point; such a
          // value cannot be turned into a string below, so substitute U+FFFD.
          int codePoint = int.parse('0x' + _text.substring(startHex, _index));
          chars.add(
              codePoint > _MAX_CODE_POINT ? _REPLACEMENT_CHAR : codePoint);

          if (_index == _text.length) break;

          // if we stopped the hex because of a whitespace char, skip it
          ch = _text.codeUnitAt(_index);
          if (_index - startHex != 6 &&
              (ch == TokenChar.SPACE ||
                  ch == TokenChar.TAB ||
                  ch == TokenChar.RETURN ||
                  ch == TokenChar.NEWLINE)) {
            _index++;
          }
        } else {
          // not a digit, just add the next character literally
          if (_index == _text.length) break;
          chars.add(_text.codeUnitAt(_index++));
        }
      } else if (_index < validateFrom ||
          (inSelectorExpression
              ? TokenizerHelpers.isIdentifierPartExpr(ch)
              : TokenizerHelpers.isIdentifierPart(ch))) {
        chars.add(ch);
        _index++;
      } else {
        // Not an identifier or escaped character.
        break;
      }
    }

    var span = _file.span(_startIndex, _index);
    var text = new String.fromCharCodes(chars);

    return new IdentifierToken(text, getIdentifierKind(), span);
  }

  /**
   * Scans the rest of a number: DOUBLE when the digits are followed by '.'
   * and at least one more digit, INTEGER otherwise.
   */
  Token finishNumber() {
    eatDigits();

    if (_peekChar() == 46 /*.*/) {
      // Handle the case of 1.toString().
      _nextChar();
      if (TokenizerHelpers.isDigit(_peekChar())) {
        eatDigits();
        return _finishToken(TokenKind.DOUBLE);
      } else {
        // The '.' is not part of the number; put it back.
        _index -= 1;
      }
    }

    return _finishToken(TokenKind.INTEGER);
  }

  /** Consumes the next character if it is a decimal digit. */
  bool maybeEatDigit() {
    if (_index < _text.length &&
        TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Consumes all following hex digits and returns a HEX_INTEGER token. */
  Token finishHexNumber() {
    eatHexDigits(_text.length);
    return _finishToken(TokenKind.HEX_INTEGER);
  }

  /**
   * Consumes hex digits until a non-hex character, index [end], or the end
   * of the text is reached, whichever comes first.
   */
  void eatHexDigits(int end) {
    end = math.min(end, _text.length);
    while (_index < end) {
      if (TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {
        _index += 1;
      } else {
        return;
      }
    }
  }

  /** Consumes the next character if it is a hex digit. */
  bool maybeEatHexDigit() {
    if (_index < _text.length &&
        TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Consumes the next character if it is '?'. */
  bool maybeEatQuestionMark() {
    if (_index < _text.length && _text.codeUnitAt(_index) == QUESTION_MARK) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Consumes a run of consecutive '?' characters. */
  void eatQuestionMarks() {
    while (_index < _text.length) {
      if (_text.codeUnitAt(_index) == QUESTION_MARK) {
        _index += 1;
      } else {
        return;
      }
    }
  }

  /** Consumes any remaining '?' wildcards and returns a HEX_RANGE token. */
  Token finishUnicodeRange() {
    eatQuestionMarks();
    return _finishToken(TokenKind.HEX_RANGE);
  }

  /**
   * Scans to the end of a comment whose opener was already consumed.
   *
   * A closing star-slash produces COMMENT and a closing '-->' produces
   * HTML_COMMENT; either closer is accepted whichever opener was used. When
   * `_inString` is set the comment is dropped and the following token is
   * returned instead. Reading character code 0 before a closer (presumably
   * end of input) produces INCOMPLETE_COMMENT.
   */
  Token finishMultiLineComment() {
    while (true) {
      int ch = _nextChar();
      if (ch == 0) {
        return _finishToken(TokenKind.INCOMPLETE_COMMENT);
      } else if (ch == 42 /*'*'*/) {
        if (_maybeEatChar(47 /*'/'*/)) {
          if (_inString) {
            return next();
          } else {
            return _finishToken(TokenKind.COMMENT);
          }
        }
      } else if (ch == TokenChar.MINUS) {
        /* Check if close part of Comment Definition --> (CDC). */
        if (_maybeEatChar(TokenChar.MINUS)) {
          if (_maybeEatChar(TokenChar.GREATER)) {
            if (_inString) {
              return next();
            } else {
              return _finishToken(TokenKind.HTML_COMMENT);
            }
          }
        }
      }
    }
  }
}
| 420 |
| 421 /** Static helper methods. */ |
/**
 * Character-classification predicates used while tokenizing. Every method
 * takes a single character code and has no side effects.
 */
class TokenizerHelpers {
  /** True when [c] lies in the inclusive range [low]..[high]. */
  static bool _between(int c, int low, int high) => low <= c && c <= high;

  /** A '-' (45) or anything accepted by [isIdentifierStartExpr]. */
  static bool isIdentifierStart(int c) =>
      c == 45 /*-*/ || isIdentifierStartExpr(c);

  /** ASCII decimal digit '0'..'9'. */
  static bool isDigit(int c) => _between(c, 48 /*0*/, 57 /*9*/);

  /** ASCII hexadecimal digit: 0-9, a-f or A-F. */
  static bool isHexDigit(int c) {
    if (isDigit(c)) return true;
    if (_between(c, 97 /*a*/, 102 /*f*/)) return true;
    return _between(c, 65 /*A*/, 70 /*F*/);
  }

  /** A '-' (45) or anything accepted by [isIdentifierPartExpr]. */
  static bool isIdentifierPart(int c) =>
      c == 45 /*-*/ || isIdentifierPartExpr(c);

  /**
   * Identifier start inside a pseudo function expression, where a minus sign
   * is not allowed: an ASCII letter, '_', a backslash (so that an escaped
   * character can begin an identifier), or any code at or above U+00A0. See
   * http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier and
   * http://www.w3.org/TR/CSS21/syndata.html#characters
   */
  static bool isIdentifierStartExpr(int c) {
    if (_between(c, 97 /*a*/, 122 /*z*/)) return true;
    if (_between(c, 65 /*A*/, 90 /*Z*/)) return true;
    if (c == 95 /*_*/) return true;
    if (c >= 0xA0) return true;
    return c == 92 /*\*/;
  }

  /**
   * Identifier continuation inside a pseudo function expression (no minus
   * sign): a decimal digit or anything accepted by [isIdentifierStartExpr].
   */
  static bool isIdentifierPartExpr(int c) =>
      isDigit(c) || isIdentifierStartExpr(c);
}
OLD | NEW |