// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
| 4 | |
| 5 part of csslib.parser; | |
| 6 | |
/**
 * CSS tokenizer layered on the cursor state and scanning primitives inherited
 * from [TokenizerBase].
 *
 * Each call to [next] scans one token; the `finish*` and `maybeEat*` helpers
 * below advance the shared scan index while doing so.
 */
class Tokenizer extends TokenizerBase {
  /** U+ prefix for unicode characters. */
  final UNICODE_U = 'U'.codeUnitAt(0);
  final UNICODE_LOWER_U = 'u'.codeUnitAt(0);
  final UNICODE_PLUS = '+'.codeUnitAt(0);

  /** Code unit of '?', the wildcard digit of a unicode range (e.g. U+4??). */
  final QUESTION_MARK = '?'.codeUnitAt(0);

  /** CDATA keyword. */
  final List CDATA_NAME = 'CDATA'.codeUnits;

  /** Forwards every argument unchanged to the [TokenizerBase] constructor. */
  Tokenizer(SourceFile file, String text, bool skipWhitespace, [int index = 0])
      : super(file, text, skipWhitespace, index);

  /**
   * Scans and returns the next token.
   *
   * When [unicodeRange] is true a '-' is always returned as a MINUS token and
   * any character that reaches the default branch must be followed by hex
   * digits (HEX_INTEGER), by hex digits and '?' characters (HEX_RANGE) or by
   * '?' characters alone (HEX_RANGE); anything else yields an ERROR token.
   */
  Token next({unicodeRange: false}) {
    // keep track of our starting position
    _startIndex = _index;

    int ch;
    ch = _nextChar();
    switch (ch) {
      case TokenChar.NEWLINE:
      case TokenChar.RETURN:
      case TokenChar.SPACE:
      case TokenChar.TAB:
        return finishWhitespace();
      case TokenChar.END_OF_FILE:
        return _finishToken(TokenKind.END_OF_FILE);
      case TokenChar.AT:
        int peekCh = _peekChar();
        if (TokenizerHelpers.isIdentifierStart(peekCh)) {
          // Remember where we are so the scan can be rewound if the name
          // after the '@' is not a known directive.
          var oldIndex = _index;
          var oldStartIndex = _startIndex;

          _startIndex = _index;
          ch = _nextChar();
          finishIdentifier();

          // Is it a directive?
          int tokId = TokenKind.matchDirectives(
              _text, _startIndex, _index - _startIndex);
          if (tokId == -1) {
            // No, is it a margin directive?
            tokId = TokenKind.matchMarginDirectives(
                _text, _startIndex, _index - _startIndex);
          }

          if (tokId != -1) {
            return _finishToken(tokId);
          } else {
            // Didn't find a CSS directive or margin directive so the @name is
            // probably the Less definition '@name: value_variable_definition'.
            _startIndex = oldStartIndex;
            _index = oldIndex;
          }
        }
        return _finishToken(TokenKind.AT);
      case TokenChar.DOT:
        int start = _startIndex; // Start where the dot started.
        if (maybeEatDigit()) {
          // looks like a number dot followed by digit(s).
          Token number = finishNumber();
          if (number.kind == TokenKind.INTEGER) {
            // It's a number but it's preceded by a dot, so make it a double.
            _startIndex = start;
            return _finishToken(TokenKind.DOUBLE);
          } else {
            // Don't allow dot followed by a double (e.g, '..1').
            return _errorToken();
          }
        }
        // It's really a dot.
        return _finishToken(TokenKind.DOT);
      case TokenChar.LPAREN:
        return _finishToken(TokenKind.LPAREN);
      case TokenChar.RPAREN:
        return _finishToken(TokenKind.RPAREN);
      case TokenChar.LBRACE:
        return _finishToken(TokenKind.LBRACE);
      case TokenChar.RBRACE:
        return _finishToken(TokenKind.RBRACE);
      case TokenChar.LBRACK:
        return _finishToken(TokenKind.LBRACK);
      case TokenChar.RBRACK:
        // NOTE(review): a second ']' that is not followed by '>' appears to
        // stay consumed and end up inside this RBRACK token - confirm against
        // _maybeEatChar whether that is intended.
        if (_maybeEatChar(TokenChar.RBRACK) &&
            _maybeEatChar(TokenChar.GREATER)) {
          // ]]>
          return next();
        }
        return _finishToken(TokenKind.RBRACK);
      case TokenChar.HASH:
        return _finishToken(TokenKind.HASH);
      case TokenChar.PLUS:
        if (maybeEatDigit()) return finishNumber();
        return _finishToken(TokenKind.PLUS);
      case TokenChar.MINUS:
        if (inSelectorExpression || unicodeRange) {
          // If parsing in pseudo function expression then minus is an operator
          // not part of identifier e.g., interval value range (e.g. U+400-4ff)
          // or minus operator in selector expression.
          return _finishToken(TokenKind.MINUS);
        } else if (maybeEatDigit()) {
          return finishNumber();
        } else if (TokenizerHelpers.isIdentifierStart(ch)) {
          // NOTE(review): ch is the '-' itself here, so provided
          // TokenChar.MINUS is 45 this test always passes and the MINUS
          // return below is never reached - confirm whether _peekChar() was
          // the intended argument.
          return finishIdentifier();
        }
        return _finishToken(TokenKind.MINUS);
      case TokenChar.GREATER:
        return _finishToken(TokenKind.GREATER);
      case TokenChar.TILDE:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.INCLUDES); // ~=
        }
        return _finishToken(TokenKind.TILDE);
      case TokenChar.ASTERISK:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.SUBSTRING_MATCH); // *=
        }
        return _finishToken(TokenKind.ASTERISK);
      case TokenChar.AMPERSAND:
        return _finishToken(TokenKind.AMPERSAND);
      case TokenChar.NAMESPACE:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.DASH_MATCH); // |=
        }
        return _finishToken(TokenKind.NAMESPACE);
      case TokenChar.COLON:
        return _finishToken(TokenKind.COLON);
      case TokenChar.COMMA:
        return _finishToken(TokenKind.COMMA);
      case TokenChar.SEMICOLON:
        return _finishToken(TokenKind.SEMICOLON);
      case TokenChar.PERCENT:
        return _finishToken(TokenKind.PERCENT);
      case TokenChar.SINGLE_QUOTE:
        return _finishToken(TokenKind.SINGLE_QUOTE);
      case TokenChar.DOUBLE_QUOTE:
        return _finishToken(TokenKind.DOUBLE_QUOTE);
      case TokenChar.SLASH:
        if (_maybeEatChar(TokenChar.ASTERISK)) return finishMultiLineComment();
        return _finishToken(TokenKind.SLASH);
      case TokenChar.LESS: // <!--
        // NOTE(review): characters matched before a partial '<!--' or
        // '<![CDATA[' match fails are not handed back here - confirm whether
        // a rewind is wanted.
        if (_maybeEatChar(TokenChar.BANG)) {
          if (_maybeEatChar(TokenChar.MINUS) &&
              _maybeEatChar(TokenChar.MINUS)) {
            return finishMultiLineComment();
          } else if (_maybeEatChar(TokenChar.LBRACK) &&
              _maybeEatChar(CDATA_NAME[0]) &&
              _maybeEatChar(CDATA_NAME[1]) &&
              _maybeEatChar(CDATA_NAME[2]) &&
              _maybeEatChar(CDATA_NAME[3]) &&
              _maybeEatChar(CDATA_NAME[4]) &&
              _maybeEatChar(TokenChar.LBRACK)) {
            // <![CDATA[
            return next();
          }
        }
        return _finishToken(TokenKind.LESS);
      case TokenChar.EQUALS:
        return _finishToken(TokenKind.EQUALS);
      case TokenChar.CARET:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.PREFIX_MATCH); // ^=
        }
        return _finishToken(TokenKind.CARET);
      case TokenChar.DOLLAR:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.SUFFIX_MATCH); // $=
        }
        return _finishToken(TokenKind.DOLLAR);
      case TokenChar.BANG:
        Token tok = finishIdentifier();
        return (tok == null) ? _finishToken(TokenKind.BANG) : tok;
      default:
        // TODO(jmesserly): this is used for IE8 detection; I'm not sure it's
        // appropriate outside of a few specific places; certainly shouldn't
        // be parsed in selectors.
        if (!inSelector && ch == TokenChar.BACKSLASH) {
          return _finishToken(TokenKind.BACKSLASH);
        }

        if (unicodeRange) {
          // Three types of unicode ranges:
          // - single code point (e.g. U+416)
          // - interval value range (e.g. U+400-4ff)
          // - range where trailing '?' characters imply 'any digit value'
          //   (e.g. U+4??)
          if (maybeEatHexDigit()) {
            var t = finishHexNumber();
            // Any question marks then it's a HEX_RANGE not HEX_NUMBER. The
            // range token must replace the hex token: the '?' characters are
            // consumed either way, so returning the hex token would silently
            // drop them from the token stream.
            if (maybeEatQuestionMark()) t = finishUnicodeRange();
            return t;
          } else if (maybeEatQuestionMark()) {
            // HEX_RANGE U+N???
            return finishUnicodeRange();
          } else {
            return _errorToken();
          }
        } else if ((ch == UNICODE_U || ch == UNICODE_LOWER_U) &&
            (_peekChar() == UNICODE_PLUS)) {
          // Unicode range: U+uNumber[-U+uNumber]
          //   uNumber = 0..10FFFF
          _nextChar(); // Skip +
          _startIndex = _index; // Starts at the number
          return _finishToken(TokenKind.UNICODE_RANGE);
        } else if (varDef(ch)) {
          return _finishToken(TokenKind.VAR_DEFINITION);
        } else if (varUsage(ch)) {
          return _finishToken(TokenKind.VAR_USAGE);
        } else if (TokenizerHelpers.isIdentifierStart(ch)) {
          return finishIdentifier();
        } else if (TokenizerHelpers.isDigit(ch)) {
          return finishNumber();
        }
        return _errorToken();
    }
  }

  /**
   * Returns true when [ch] is 'v' and the following characters are 'a', 'r'
   * and '-', consuming each of those that matches along the way.
   */
  bool varDef(int ch) {
    return ch == 'v'.codeUnitAt(0) &&
        _maybeEatChar('a'.codeUnitAt(0)) &&
        _maybeEatChar('r'.codeUnitAt(0)) &&
        _maybeEatChar('-'.codeUnitAt(0));
  }

  /**
   * Returns true when [ch] is 'v', the following characters are 'a' and 'r'
   * (both consumed) and the character after them is '-' (only peeked).
   */
  bool varUsage(int ch) {
    return ch == 'v'.codeUnitAt(0) &&
        _maybeEatChar('a'.codeUnitAt(0)) &&
        _maybeEatChar('r'.codeUnitAt(0)) &&
        (_peekChar() == '-'.codeUnitAt(0));
  }

  /** Finishes the current token as an ERROR token; [message] is unused. */
  Token _errorToken([String message = null]) {
    return _finishToken(TokenKind.ERROR);
  }

  /**
   * Classifies the text between the token start and the current index.
   *
   * Returns the kind reported by [TokenKind.matchUnits] (tried only outside
   * selectors and selector expressions), [TokenKind.IMPORTANT] when the text
   * is exactly '!important', and [TokenKind.IDENTIFIER] otherwise.
   */
  int getIdentifierKind() {
    // Is the identifier a unit type?
    int tokId = -1;

    // Don't match units in selectors or selector expressions.
    if (!inSelectorExpression && !inSelector) {
      tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex);
    }
    if (tokId == -1) {
      tokId = (_text.substring(_startIndex, _index) == '!important')
          ? TokenKind.IMPORTANT
          : -1;
    }

    return tokId >= 0 ? tokId : TokenKind.IDENTIFIER;
  }

  /**
   * Rescans from the token start and returns an [IdentifierToken] holding the
   * collected (and, where applicable, unescaped) characters.
   *
   * Characters located before the index at entry are accepted as-is; later
   * ones must satisfy the identifier-part test. A backslash is treated as an
   * escape only while `_inString` is set.
   */
  Token finishIdentifier() {
    // If we encounter an escape sequence, remember it so we can post-process
    // to unescape.
    var chars = [];

    // backup so we can start with the first character
    int validateFrom = _index;
    _index = _startIndex;
    while (_index < _text.length) {
      int ch = _text.codeUnitAt(_index);

      // If the previous character was "\" we need to escape.
      // http://www.w3.org/TR/CSS21/syndata.html#characters
      // if followed by hexadecimal digits, create the appropriate character.
      // otherwise, include the character in the identifier and don't treat it
      // specially.
      if (ch == 92 /*\*/ && _inString) {
        int startHex = ++_index;
        // An escape carries at most six hex digits.
        eatHexDigits(startHex + 6);
        if (_index != startHex) {
          // Parse the hex digits and add that character.
          chars.add(int.parse('0x' + _text.substring(startHex, _index)));

          if (_index == _text.length) break;

          // if we stopped the hex because of a whitespace char, skip it
          ch = _text.codeUnitAt(_index);
          if (_index - startHex != 6 &&
              (ch == TokenChar.SPACE ||
                  ch == TokenChar.TAB ||
                  ch == TokenChar.RETURN ||
                  ch == TokenChar.NEWLINE)) {
            _index++;
          }
        } else {
          // not a digit, just add the next character literally
          if (_index == _text.length) break;
          chars.add(_text.codeUnitAt(_index++));
        }
      } else if (_index < validateFrom ||
          (inSelectorExpression
              ? TokenizerHelpers.isIdentifierPartExpr(ch)
              : TokenizerHelpers.isIdentifierPart(ch))) {
        chars.add(ch);
        _index++;
      } else {
        // Not an identifier or escaped character.
        break;
      }
    }

    var span = _file.span(_startIndex, _index);
    var text = new String.fromCharCodes(chars);

    return new IdentifierToken(text, getIdentifierKind(), span);
  }

  /**
   * Consumes the rest of a number and returns a DOUBLE token when a '.' is
   * followed by at least one digit, an INTEGER token otherwise.
   */
  Token finishNumber() {
    eatDigits();

    if (_peekChar() == 46 /*.*/) {
      // Handle the case of 1.toString().
      _nextChar();
      if (TokenizerHelpers.isDigit(_peekChar())) {
        eatDigits();
        return _finishToken(TokenKind.DOUBLE);
      } else {
        // The '.' is not part of the number; step back over it.
        _index -= 1;
      }
    }

    return _finishToken(TokenKind.INTEGER);
  }

  /** Consumes one decimal digit at the index, if there is one. */
  bool maybeEatDigit() {
    if (_index < _text.length &&
        TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Consumes every following hex digit and returns a HEX_INTEGER token. */
  Token finishHexNumber() {
    eatHexDigits(_text.length);
    return _finishToken(TokenKind.HEX_INTEGER);
  }

  /**
   * Advances past consecutive hex digits, stopping at the first non-hex
   * character or at [end] (clamped to the text length), whichever is first.
   */
  void eatHexDigits(int end) {
    end = math.min(end, _text.length);
    while (_index < end) {
      if (TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {
        _index += 1;
      } else {
        return;
      }
    }
  }

  /** Consumes one hex digit at the index, if there is one. */
  bool maybeEatHexDigit() {
    if (_index < _text.length &&
        TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Consumes one '?' at the index, if there is one. */
  bool maybeEatQuestionMark() {
    if (_index < _text.length && _text.codeUnitAt(_index) == QUESTION_MARK) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Advances past every consecutive '?' at the index. */
  void eatQuestionMarks() {
    while (_index < _text.length) {
      if (_text.codeUnitAt(_index) == QUESTION_MARK) {
        _index += 1;
      } else {
        return;
      }
    }
  }

  /** Consumes any remaining '?' characters and returns a HEX_RANGE token. */
  Token finishUnicodeRange() {
    eatQuestionMarks();
    return _finishToken(TokenKind.HEX_RANGE);
  }

  /**
   * Consumes a comment body up to a closing `*` `/` pair or `-->`.
   *
   * Either terminator ends the scan regardless of how the comment was opened.
   * Returns INCOMPLETE_COMMENT when `_nextChar` yields 0 first. On a
   * terminator, returns the following token when `_inString` is set, else a
   * COMMENT or HTML_COMMENT token.
   */
  Token finishMultiLineComment() {
    while (true) {
      int ch = _nextChar();
      if (ch == 0) {
        return _finishToken(TokenKind.INCOMPLETE_COMMENT);
      } else if (ch == 42 /*'*'*/) {
        if (_maybeEatChar(47 /*'/'*/)) {
          if (_inString) {
            return next();
          } else {
            return _finishToken(TokenKind.COMMENT);
          }
        }
      } else if (ch == TokenChar.MINUS) {
        /* Check if close part of Comment Definition --> (CDC). */
        if (_maybeEatChar(TokenChar.MINUS)) {
          if (_maybeEatChar(TokenChar.GREATER)) {
            if (_inString) {
              return next();
            } else {
              return _finishToken(TokenKind.HTML_COMMENT);
            }
          }
        }
      }
    }
    // Not reached: the loop above only exits through a return.
    return _errorToken();
  }
}
| 420 | |
/** Stateless character-classification predicates used while tokenizing. */
class TokenizerHelpers {
  /** Whether [c] may open an identifier; unlike the Expr form, '-' counts. */
  static bool isIdentifierStart(int c) =>
      c == 45 /*-*/ || isIdentifierStartExpr(c);

  /** Whether [c] is one of the ASCII decimal digits '0'..'9'. */
  static bool isDigit(int c) => !(c < 48 /*0*/ || c > 57 /*9*/);

  /** Whether [c] is a decimal digit or a letter a-f / A-F. */
  static bool isHexDigit(int c) {
    if (c >= 65 /*A*/ && c <= 70 /*F*/) return true;
    if (c >= 97 /*a*/ && c <= 102 /*f*/) return true;
    return isDigit(c);
  }

  /** Whether [c] may continue an identifier; unlike the Expr form, '-' counts. */
  static bool isIdentifierPart(int c) =>
      c == 45 /*-*/ || isIdentifierPartExpr(c);

  /** Pseudo function expressions identifiers can't have a minus sign. */
  static bool isIdentifierStartExpr(int c) {
    // Underscore, and backslash so that an escaped character is allowed.
    if (c == 95 /*_*/ || c == 92 /*\*/) return true;

    // Note: Unicode 10646 chars U+00A0 or higher are allowed, see:
    // http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier
    // http://www.w3.org/TR/CSS21/syndata.html#characters
    if (c >= 0xA0) return true;

    if (c >= 65 /*A*/ && c <= 90 /*Z*/) return true;
    return c >= 97 /*a*/ && c <= 122 /*z*/;
  }

  /** Pseudo function expressions identifiers can't have a minus sign. */
  static bool isIdentifierPartExpr(int c) =>
      isDigit(c) || isIdentifierStartExpr(c);
}
| OLD | NEW |