OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. |
| 4 |
| 5 part of csslib.parser; |
| 6 |
/**
 * CSS tokenizer.
 *
 * Each call to [next] scans one token starting at the current position.
 * The scanning state used here (`_text`, `_index`, `_startIndex`, `_file`,
 * `_skipWhitespace`, `inSelector`, `inSelectorExpression`) and the low-level
 * helpers (`_nextChar`, `_peekChar`, `_maybeEatChar`, `_finishToken`,
 * `finishWhitespace`, `eatDigits`) are not defined in this class; presumably
 * they are inherited from [TokenizerBase] -- confirm there.
 */
class Tokenizer extends TokenizerBase {
  /** U+ prefix for unicode characters. */
  final UNICODE_U = 'U'.codeUnitAt(0);
  final UNICODE_LOWER_U = 'u'.codeUnitAt(0);
  final UNICODE_PLUS = '+'.codeUnitAt(0);

  /** Wildcard character of a unicode range (e.g. the '?' in U+4??). */
  final QUESTION_MARK = '?'.codeUnitAt(0);

  /** CDATA keyword. */
  final List CDATA_NAME = 'CDATA'.codeUnits;

  /** Forwards all arguments unchanged to the [TokenizerBase] constructor. */
  Tokenizer(SourceFile file, String text, bool skipWhitespace,
      [int index = 0])
      : super(file, text, skipWhitespace, index);

  /**
   * Scans and returns the next token.
   *
   * When [unicodeRange] is true the tokenizer is positioned inside the
   * number part of a unicode range (just after `U+`): a minus sign is always
   * the MINUS operator, and any character that reaches the default branch
   * must start a HEX_INTEGER or a HEX_RANGE, otherwise an ERROR token is
   * returned.
   */
  Token next({unicodeRange: false}) {
    // keep track of our starting position
    _startIndex = _index;

    int ch;
    ch = _nextChar();
    switch (ch) {
      case TokenChar.NEWLINE:
      case TokenChar.RETURN:
      case TokenChar.SPACE:
      case TokenChar.TAB:
        return finishWhitespace();
      case TokenChar.END_OF_FILE:
        return _finishToken(TokenKind.END_OF_FILE);
      case TokenChar.AT:
        int peekCh = _peekChar();
        if (TokenizerHelpers.isIdentifierStart(peekCh)) {
          // Remember where we were so the scan of the name can be undone.
          var oldIndex = _index;
          var oldStartIndex = _startIndex;

          _startIndex = _index;
          _nextChar();
          // Called only to advance _index past the name; the identifier token
          // itself is not needed.
          finishIdentifier();

          // Is it a directive?
          int tokId = TokenKind.matchDirectives(_text, _startIndex,
              _index - _startIndex);
          if (tokId == -1) {
            // No, is it a margin directive?
            tokId = TokenKind.matchMarginDirectives(_text, _startIndex,
                _index - _startIndex);
          }

          if (tokId != -1) {
            return _finishToken(tokId);
          } else {
            // Didn't find a CSS directive or margin directive so the @name is
            // probably the Less definition '@name: value_variable_definition'.
            _startIndex = oldStartIndex;
            _index = oldIndex;
          }
        }
        return _finishToken(TokenKind.AT);
      case TokenChar.DOT:
        int start = _startIndex; // Start where the dot started.
        if (maybeEatDigit()) {
          // looks like a number dot followed by digit(s).
          Token number = finishNumber();
          if (number.kind == TokenKind.INTEGER) {
            // It's a number but it's preceded by a dot, so make it a double.
            _startIndex = start;
            return _finishToken(TokenKind.DOUBLE);
          } else {
            // Don't allow dot followed by a double (e.g, '..1').
            return _errorToken();
          }
        }
        // It's really a dot.
        return _finishToken(TokenKind.DOT);
      case TokenChar.LPAREN:
        return _finishToken(TokenKind.LPAREN);
      case TokenChar.RPAREN:
        return _finishToken(TokenKind.RPAREN);
      case TokenChar.LBRACE:
        return _finishToken(TokenKind.LBRACE);
      case TokenChar.RBRACE:
        return _finishToken(TokenKind.RBRACE);
      case TokenChar.LBRACK:
        return _finishToken(TokenKind.LBRACK);
      case TokenChar.RBRACK:
        if (_maybeEatChar(TokenChar.RBRACK) &&
            _maybeEatChar(TokenChar.GREATER)) {
          // ]]> closes a CDATA section; drop it and return what follows.
          return next();
        }
        return _finishToken(TokenKind.RBRACK);
      case TokenChar.HASH:
        return _finishToken(TokenKind.HASH);
      case TokenChar.PLUS:
        if (maybeEatDigit()) return finishNumber();
        return _finishToken(TokenKind.PLUS);
      case TokenChar.MINUS:
        if (inSelectorExpression || unicodeRange) {
          // If parsing in pseudo function expression then minus is an operator
          // not part of identifier e.g., interval value range (e.g. U+400-4ff)
          // or minus operator in selector expression.
          return _finishToken(TokenKind.MINUS);
        } else if (maybeEatDigit()) {
          return finishNumber();
        } else if (TokenizerHelpers.isIdentifierStart(ch)) {
          return finishIdentifier();
        }
        // NOTE(review): isIdentifierStart accepts code unit 45 ('-'), which is
        // presumably TokenChar.MINUS, so this line looks unreachable -- confirm.
        return _finishToken(TokenKind.MINUS);
      case TokenChar.GREATER:
        return _finishToken(TokenKind.GREATER);
      case TokenChar.TILDE:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.INCLUDES); // ~=
        }
        return _finishToken(TokenKind.TILDE);
      case TokenChar.ASTERISK:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.SUBSTRING_MATCH); // *=
        }
        return _finishToken(TokenKind.ASTERISK);
      case TokenChar.AMPERSAND:
        return _finishToken(TokenKind.AMPERSAND);
      case TokenChar.NAMESPACE:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.DASH_MATCH); // |=
        }
        return _finishToken(TokenKind.NAMESPACE);
      case TokenChar.COLON:
        return _finishToken(TokenKind.COLON);
      case TokenChar.COMMA:
        return _finishToken(TokenKind.COMMA);
      case TokenChar.SEMICOLON:
        return _finishToken(TokenKind.SEMICOLON);
      case TokenChar.PERCENT:
        return _finishToken(TokenKind.PERCENT);
      case TokenChar.SINGLE_QUOTE:
        return _finishToken(TokenKind.SINGLE_QUOTE);
      case TokenChar.DOUBLE_QUOTE:
        return _finishToken(TokenKind.DOUBLE_QUOTE);
      case TokenChar.SLASH:
        if (_maybeEatChar(TokenChar.ASTERISK)) return finishMultiLineComment();
        return _finishToken(TokenKind.SLASH);
      case TokenChar.LESS: // <!--
        if (_maybeEatChar(TokenChar.BANG)) {
          if (_maybeEatChar(TokenChar.MINUS) &&
              _maybeEatChar(TokenChar.MINUS)) {
            return finishMultiLineComment();
          } else if (_maybeEatChar(TokenChar.LBRACK) &&
              _maybeEatChar(CDATA_NAME[0]) &&
              _maybeEatChar(CDATA_NAME[1]) &&
              _maybeEatChar(CDATA_NAME[2]) &&
              _maybeEatChar(CDATA_NAME[3]) &&
              _maybeEatChar(CDATA_NAME[4]) &&
              _maybeEatChar(TokenChar.LBRACK)) {
            // <![CDATA[ opens a CDATA section; drop it and return what follows.
            return next();
          }
        }
        return _finishToken(TokenKind.LESS);
      case TokenChar.EQUALS:
        return _finishToken(TokenKind.EQUALS);
      case TokenChar.CARET:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.PREFIX_MATCH); // ^=
        }
        return _finishToken(TokenKind.CARET);
      case TokenChar.DOLLAR:
        if (_maybeEatChar(TokenChar.EQUALS)) {
          return _finishToken(TokenKind.SUFFIX_MATCH); // $=
        }
        return _finishToken(TokenKind.DOLLAR);
      case TokenChar.BANG:
        Token tok = finishIdentifier();
        // NOTE(review): finishIdentifier always returns a new IdentifierToken,
        // so the BANG fallback below is never taken; a lone '!' comes back as
        // an identifier token -- confirm this is what the parser expects.
        return (tok == null) ? _finishToken(TokenKind.BANG) : tok;
      default:
        // TODO(jmesserly): this is used for IE8 detection; I'm not sure it's
        // appropriate outside of a few specific places; certainly shouldn't
        // be parsed in selectors.
        if (!inSelector && ch == TokenChar.BACKSLASH) {
          return _finishToken(TokenKind.BACKSLASH);
        }

        if (unicodeRange) {
          // Three types of unicode ranges:
          // - single code point (e.g. U+416)
          // - interval value range (e.g. U+400-4ff)
          // - range where trailing '?' characters imply 'any digit value'
          //   (e.g. U+4??)
          //
          // [ch] is the first character of the value and has already been
          // consumed, so it is the character that decides the token kind.
          if (TokenizerHelpers.isHexDigit(ch)) {
            // Consume the remaining hex digits (possibly none, e.g. U+4).
            var hexToken = finishHexNumber();
            // Any question marks then it's a HEX_RANGE not HEX_NUMBER; the
            // range token spans the digits and the question marks.
            if (maybeEatQuestionMark()) return finishUnicodeRange();
            return hexToken;
          } else if (ch == QUESTION_MARK) {
            // HEX_RANGE made only of wildcards, e.g. U+????
            return finishUnicodeRange();
          } else {
            return _errorToken();
          }
        } else if ((ch == UNICODE_U || ch == UNICODE_LOWER_U) &&
            (_peekChar() == UNICODE_PLUS)) {
          // Unicode range: U+uNumber[-U+uNumber]
          //   uNumber = 0..10FFFF
          _nextChar(); // Skip +
          _startIndex = _index; // Starts at the number
          return _finishToken(TokenKind.UNICODE_RANGE);
        } else if (varDef(ch)) {
          return _finishToken(TokenKind.VAR_DEFINITION);
        } else if (varUsage(ch)) {
          return _finishToken(TokenKind.VAR_USAGE);
        } else if (TokenizerHelpers.isIdentifierStart(ch)) {
          return finishIdentifier();
        } else if (TokenizerHelpers.isDigit(ch)) {
          return finishNumber();
        }
        return _errorToken();
    }
  }

  /**
   * Returns true when [ch] is 'v' and the following characters are `ar-`,
   * consuming them. On a partial match the characters matched so far stay
   * consumed.
   */
  bool varDef(int ch) {
    return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) &&
        _maybeEatChar('r'.codeUnitAt(0)) && _maybeEatChar('-'.codeUnitAt(0));
  }

  /**
   * Returns true when [ch] is 'v', the following characters are `ar`
   * (consumed) and the character after them is '-' (only peeked, not
   * consumed).
   */
  bool varUsage(int ch) {
    return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) &&
        _maybeEatChar('r'.codeUnitAt(0)) && (_peekChar() == '-'.codeUnitAt(0));
  }

  /**
   * Finishes the current token as an ERROR token. [message] is accepted but
   * currently ignored.
   */
  Token _errorToken([String message = null]) {
    return _finishToken(TokenKind.ERROR);
  }

  /**
   * Classifies the text between `_startIndex` and `_index`: a unit kind
   * (only outside selectors and selector expressions), IMPORTANT for the
   * exact text `!important`, otherwise IDENTIFIER.
   */
  int getIdentifierKind() {
    // Is the identifier a unit type?
    int tokId = -1;

    // Don't match units in selectors or selector expressions.
    if (!inSelectorExpression && !inSelector) {
      tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex);
    }
    if (tokId == -1) {
      tokId = (_text.substring(_startIndex, _index) == '!important') ?
          TokenKind.IMPORTANT : -1;
    }

    return tokId >= 0 ? tokId : TokenKind.IDENTIFIER;
  }

  /**
   * Scans an identifier starting at `_startIndex` and returns an
   * [IdentifierToken] whose text has backslash escapes resolved.
   *
   * Characters the caller had already consumed (those before the value of
   * `_index` on entry) are accepted without validation; later characters
   * must be identifier parts. Inside a selector expression a minus sign is
   * not an identifier part.
   */
  Token finishIdentifier() {
    var chars = [];

    // backup so we can start with the first character
    int validateFrom = _index;
    _index = _startIndex;
    while (_index < _text.length) {
      int ch = _text.codeUnitAt(_index);

      // A "\" starts an escape sequence, see
      // http://www.w3.org/TR/CSS21/syndata.html#characters
      // if followed by hexadecimal digits, create the appropriate character.
      // otherwise, include the character in the identifier and don't treat it
      // specially.
      if (ch == 92/*\*/) {
        int startHex = ++_index;
        // An escape holds at most six hex digits.
        eatHexDigits(startHex + 6);
        if (_index != startHex) {
          // Parse the hex digits and add that character.
          chars.add(int.parse('0x' + _text.substring(startHex, _index)));

          if (_index == _text.length) break;

          // if we stopped the hex because of a whitespace char, skip it
          ch = _text.codeUnitAt(_index);
          if (_index - startHex != 6 &&
              (ch == TokenChar.SPACE || ch == TokenChar.TAB ||
              ch == TokenChar.RETURN || ch == TokenChar.NEWLINE)) {
            _index++;
          }
        } else {
          // not a digit, just add the next character literally
          if (_index == _text.length) break;
          chars.add(_text.codeUnitAt(_index++));
        }
      } else if (_index < validateFrom || (inSelectorExpression
          ? TokenizerHelpers.isIdentifierPartExpr(ch)
          : TokenizerHelpers.isIdentifierPart(ch))) {
        chars.add(ch);
        _index++;
      } else {
        // Not an identifier or escaped character.
        break;
      }
    }

    var span = _file.span(_startIndex, _index);
    var text = new String.fromCharCodes(chars);

    return new IdentifierToken(text, getIdentifierKind(), span);
  }

  /**
   * Consumes the rest of a number whose first digit was already consumed.
   * Returns a DOUBLE token when the digits are followed by '.' and at least
   * one more digit, otherwise an INTEGER token (a trailing '.' is left
   * unconsumed).
   */
  Token finishNumber() {
    eatDigits();

    if (_peekChar() == 46/*.*/) {
      // Only treat the dot as part of the number when a digit follows it.
      _nextChar();
      if (TokenizerHelpers.isDigit(_peekChar())) {
        eatDigits();
        return _finishToken(TokenKind.DOUBLE);
      } else {
        // Put the dot back.
        _index -= 1;
      }
    }

    return _finishToken(TokenKind.INTEGER);
  }

  /** Consumes the next character if it is a decimal digit. */
  bool maybeEatDigit() {
    if (_index < _text.length
        && TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Consumes any following hex digits and returns a HEX_INTEGER token. */
  Token finishHexNumber() {
    eatHexDigits(_text.length);
    return _finishToken(TokenKind.HEX_INTEGER);
  }

  /**
   * Advances `_index` over hex digits, stopping at the first non-hex
   * character or at index [end] (clamped to the text length).
   */
  void eatHexDigits(int end) {
    end = math.min(end, _text.length);
    while (_index < end) {
      if (TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {
        _index += 1;
      } else {
        return;
      }
    }
  }

  /** Consumes the next character if it is a hex digit. */
  bool maybeEatHexDigit() {
    if (_index < _text.length
        && TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Consumes the next character if it is a '?'. */
  bool maybeEatQuestionMark() {
    if (_index < _text.length &&
        _text.codeUnitAt(_index) == QUESTION_MARK) {
      _index += 1;
      return true;
    }
    return false;
  }

  /** Advances `_index` over a run of '?' characters. */
  void eatQuestionMarks() {
    while (_index < _text.length) {
      if (_text.codeUnitAt(_index) == QUESTION_MARK) {
        _index += 1;
      } else {
        return;
      }
    }
  }

  /** Consumes any following '?' characters and returns a HEX_RANGE token. */
  Token finishUnicodeRange() {
    eatQuestionMarks();
    return _finishToken(TokenKind.HEX_RANGE);
  }

  /**
   * Consumes a comment whose opening delimiter has already been eaten.
   *
   * The comment ends at the first `*` followed by `/` or at the first `-->`,
   * whichever comes first, regardless of which delimiter opened it. When
   * `_skipWhitespace` is set the comment is dropped and the token that
   * follows it is returned; otherwise a COMMENT or HTML_COMMENT token is
   * returned. Reaching the end of input first yields INCOMPLETE_COMMENT.
   */
  Token finishMultiLineComment() {
    while (true) {
      int ch = _nextChar();
      if (ch == 0) {
        // A 0 from _nextChar is treated as end of input.
        return _finishToken(TokenKind.INCOMPLETE_COMMENT);
      } else if (ch == 42/*'*'*/) {
        if (_maybeEatChar(47/*'/'*/)) {
          if (_skipWhitespace) {
            return next();
          } else {
            return _finishToken(TokenKind.COMMENT);
          }
        }
      } else if (ch == TokenChar.MINUS) {
        /* Check if close part of Comment Definition --> (CDC). */
        if (_maybeEatChar(TokenChar.MINUS)) {
          if (_maybeEatChar(TokenChar.GREATER)) {
            if (_skipWhitespace) {
              return next();
            } else {
              return _finishToken(TokenKind.HTML_COMMENT);
            }
          }
        }
      }
    }
  }

}
| 416 |
/**
 * Character-class predicates used while tokenizing. Every method takes a
 * single UTF-16 code unit and has no side effects.
 */
class TokenizerHelpers {
  /**
   * Whether [c] may begin an identifier: anything accepted by
   * [isIdentifierStartExpr], or a minus sign.
   */
  static bool isIdentifierStart(int c) =>
      isIdentifierStartExpr(c) || c == 45 /*-*/;

  /** Whether [c] is one of the decimal digits '0' through '9'. */
  static bool isDigit(int c) => c >= 48 /*0*/ && c <= 57 /*9*/;

  /** Whether [c] is a decimal digit or a letter a-f in either case. */
  static bool isHexDigit(int c) {
    if (isDigit(c)) return true;
    if (c >= 97 /*a*/ && c <= 102 /*f*/) return true;
    return c >= 65 /*A*/ && c <= 70 /*F*/;
  }

  /**
   * Whether [c] may continue an identifier: anything accepted by
   * [isIdentifierPartExpr], or a minus sign.
   */
  static bool isIdentifierPart(int c) =>
      isIdentifierPartExpr(c) || c == 45 /*-*/;

  /**
   * Identifier-start test for pseudo function expressions, where an
   * identifier can't have a minus sign. Accepts ASCII letters, underscore,
   * any code unit at or above U+00A0, and the backslash that introduces an
   * escaped character. See:
   *   http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier
   *   http://www.w3.org/TR/CSS21/syndata.html#characters
   */
  static bool isIdentifierStartExpr(int c) {
    if (c >= 97 /*a*/ && c <= 122 /*z*/) return true;
    if (c >= 65 /*A*/ && c <= 90 /*Z*/) return true;
    return c == 95 /*_*/ || c >= 0xA0 || c == 92 /*\*/;
  }

  /**
   * Identifier-part test for pseudo function expressions, where an
   * identifier can't have a minus sign: an identifier-start character or a
   * decimal digit.
   */
  static bool isIdentifierPartExpr(int c) =>
      isIdentifierStartExpr(c) || isDigit(c);
}
OLD | NEW |