// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
// Generated by scripts/tokenizer_gen.py.

part of csslib.parser;

/** Tokenizer state to support look ahead for Less' nested selectors. */
class TokenizerState {
  final int index;
  final int startIndex;
  final bool inSelectorExpression;
  final bool inSelector;

  TokenizerState(TokenizerBase base) :
      index = base._index,
      startIndex = base._startIndex,
      inSelectorExpression = base.inSelectorExpression,
      inSelector = base.inSelector;
}

/**
 * The base class for our tokenizer. The hand coded parts are in this file,
 * with the generated parts in the subclass Tokenizer.
 */
abstract class TokenizerBase {
  final SourceFile _file;
  final String _text;

  bool _skipWhitespace;

  /**
   * Changes tokenization when in a pseudo function expression. If true then
   * minus signs are handled as operators instead of identifiers.
   */
  bool inSelectorExpression = false;

  /**
   * Changes tokenization when in selectors. If true, identifiers are not
   * treated as units, since treating them as units would break selectors
   * such as ":lang(fr)" or the (unknown) HTML tag name "px", which is legal
   * to use in a selector.
   */
  // TODO(jmesserly): is this a problem elsewhere? "fr" for example will be
  // processed as a "fraction" unit token, preventing it from working in
  // places where an identifier is expected. This was breaking selectors like:
  //     :lang(fr)
  // The assumption that "fr" always means fraction (and similarly for other
  // units) doesn't seem valid. We probably should defer this analysis until
  // we reach places in the parser where units are expected. I'm not sure this
  // is tokenizing as described in the specs:
  // http://dev.w3.org/csswg/css-syntax/
  // http://dev.w3.org/csswg/selectors4/
  bool inSelector = false;

  int _index;
  int _startIndex;

  static const String _CDATA_START = '<![CDATA[';
  static const String _CDATA_END = ']]>';

  TokenizerBase(this._file, this._text, this._skipWhitespace,
      [this._index = 0]);

  Token next();
  int getIdentifierKind();

  /** Snapshot of Tokenizer scanning state. */
  TokenizerState get mark => new TokenizerState(this);

  /** Restore Tokenizer scanning state. */
  void restore(TokenizerState markedData) {
    _index = markedData.index;
    _startIndex = markedData.startIndex;
    inSelectorExpression = markedData.inSelectorExpression;
    inSelector = markedData.inSelector;
  }

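  // A sketch of the look-ahead pattern that mark/restore support (illustrative
  // only; the actual call sites live in the parser, and `tokenizer` below is
  // an assumed local holding a concrete Tokenizer):
  //
  //     var state = tokenizer.mark;
  //     var token = tokenizer.next();
  //     if (token.kind != TokenKind.LBRACE) tokenizer.restore(state);

  /** Returns the next code unit and advances, or 0 at end of input. */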
  int _nextChar() {
    if (_index < _text.length) {
      return _text.codeUnitAt(_index++);
    } else {
      return 0;
    }
  }

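  /** Returns the next code unit without advancing, or 0 at end of input. */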
  int _peekChar() {
    if (_index < _text.length) {
      return _text.codeUnitAt(_index);
    } else {
      return 0;
    }
  }

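  /**
   * If the next code unit is [ch], consumes it and returns true; otherwise
   * leaves the cursor unchanged and returns false.
   */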
  bool _maybeEatChar(int ch) {
    if (_index < _text.length) {
      if (_text.codeUnitAt(_index) == ch) {
        _index++;
        return true;
      } else {
        return false;
      }
    } else {
      return false;
    }
  }

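  /** Source text of the current token, from _startIndex to _index. */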
  String _tokenText() {
    if (_index < _text.length) {
      return _text.substring(_startIndex, _index);
    } else {
      return _text.substring(_startIndex, _text.length);
    }
  }

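  /** Creates a token of [kind] spanning _startIndex to _index. */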
  Token _finishToken(int kind) {
    return new Token(kind, _file.span(_startIndex, _index));
  }

  Token _errorToken([String message = null]) {
    return new ErrorToken(
        TokenKind.ERROR, _file.span(_startIndex, _index), message);
  }

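  /**
   * Consumes a run of whitespace. When [_skipWhitespace] is true the
   * whitespace is discarded and the next non-whitespace token is returned;
   * otherwise a WHITESPACE token is emitted (a newline also ends the run
   * when whitespace is not being skipped).
   */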
  Token finishWhitespace() {
    _index--;
    while (_index < _text.length) {
      final ch = _text.codeUnitAt(_index++);
      if (ch == TokenChar.SPACE ||
          ch == TokenChar.TAB ||
          ch == TokenChar.RETURN) {
        // do nothing
      } else if (ch == TokenChar.NEWLINE) {
        if (!_skipWhitespace) {
          return _finishToken(TokenKind.WHITESPACE); // note the newline?
        }
      } else {
        _index--;
        if (_skipWhitespace) {
          return next();
        } else {
          return _finishToken(TokenKind.WHITESPACE);
        }
      }
    }
    return _finishToken(TokenKind.END_OF_FILE);
  }

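  /**
   * Consumes a block comment, tolerating nested open/close pairs, and then
   * returns either the following token or a COMMENT token depending on
   * [_skipWhitespace]. An unterminated comment yields an error token.
   */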
  Token finishMultiLineComment() {
    int nesting = 1;
    do {
      int ch = _nextChar();
      if (ch == 0) {
        return _errorToken();
      } else if (ch == TokenChar.ASTERISK) {
        if (_maybeEatChar(TokenChar.SLASH)) {
          nesting--;
        }
      } else if (ch == TokenChar.SLASH) {
        if (_maybeEatChar(TokenChar.ASTERISK)) {
          nesting++;
        }
      }
    } while (nesting > 0);

    if (_skipWhitespace) {
      return next();
    } else {
      return _finishToken(TokenKind.COMMENT);
    }
  }

  void eatDigits() {
    while (_index < _text.length) {
      if (TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) {
        _index++;
      } else {
        return;
      }
    }
  }

  static int _hexDigit(int c) {
    if (c >= 48/*0*/ && c <= 57/*9*/) {
      return c - 48;
    } else if (c >= 97/*a*/ && c <= 102/*f*/) {
      return c - 87;
    } else if (c >= 65/*A*/ && c <= 70/*F*/) {
      return c - 55;
    } else {
      return -1;
    }
  }

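  /**
   * Reads hexadecimal digits starting at the cursor. With [hexLength],
   * exactly that many valid digits must be available or -1 is returned;
   * without it, digits are consumed until a non-hex code unit is seen.
   */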
  int readHex([int hexLength]) {
    int maxIndex;
    if (hexLength == null) {
      maxIndex = _text.length - 1;
    } else {
      // TODO(jimhug): What if this is too long?
      maxIndex = _index + hexLength;
      if (maxIndex >= _text.length) return -1;
    }
    var result = 0;
    while (_index < maxIndex) {
      final digit = _hexDigit(_text.codeUnitAt(_index));
      if (digit == -1) {
        if (hexLength == null) {
          return result;
        } else {
          return -1;
        }
      }
      // Multiply by 16 rather than shift by 4 since that will result in a
      // correct value for numbers that exceed the 32 bit precision of JS
      // 'integers'.
      // TODO: Figure out a better solution to integer truncation. Issue 638.
      result = (result * 16) + digit;
      _index++;
    }

    return result;
  }

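  /**
   * Finishes scanning a numeric literal, producing an INTEGER or DOUBLE
   * token; a '.' only extends the number when it is followed by a digit.
   */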
  Token finishNumber() {
    eatDigits();

    if (_peekChar() == TokenChar.DOT) {
      // Handle the case of 1.toString().
      _nextChar();
      if (TokenizerHelpers.isDigit(_peekChar())) {
        eatDigits();
        return finishNumberExtra(TokenKind.DOUBLE);
      } else {
        _index--;
      }
    }

    return finishNumberExtra(TokenKind.INTEGER);
  }

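  /**
   * Consumes an optional exponent part (which forces [kind] to DOUBLE) and
   * rejects an identifier-start character immediately after the number.
   */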
  Token finishNumberExtra(int kind) {
    if (_maybeEatChar(101/*e*/) || _maybeEatChar(69/*E*/)) {
      kind = TokenKind.DOUBLE;
      _maybeEatChar(45/*-*/);
      _maybeEatChar(43/*+*/);
      eatDigits();
    }
    if (_peekChar() != 0 && TokenizerHelpers.isIdentifierStart(_peekChar())) {
      _nextChar();
      return _errorToken("illegal character in number");
    }

    return _finishToken(kind);
  }

  Token _makeStringToken(List<int> buf, bool isPart) {
    final s = new String.fromCharCodes(buf);
    final kind = isPart ? TokenKind.STRING_PART : TokenKind.STRING;
    return new LiteralToken(kind, _file.span(_startIndex, _index), s);
  }

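  /**
   * Wraps the raw source text from [start] to [end] in a single STRING
   * literal token (used for values such as legacy IE filter expressions).
   */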
  Token makeIEFilter(int start, int end) {
    var filter = _text.substring(start, end);
    return new LiteralToken(TokenKind.STRING, _file.span(start, end), filter);
  }

  Token _makeRawStringToken(bool isMultiline) {
    var s;
    if (isMultiline) {
      // Skip initial newline in multiline strings
      int start = _startIndex + 4;
      if (_text[start] == '\n') start++;
      s = _text.substring(start, _index - 3);
    } else {
      s = _text.substring(_startIndex + 2, _index - 1);
    }
    return new LiteralToken(TokenKind.STRING,
        _file.span(_startIndex, _index), s);
  }

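  /**
   * Scans the body of a triple-quoted string; [quote] is the code unit of
   * the quote character that opened it.
   */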
  Token finishMultilineString(int quote) {
    var buf = <int>[];
    while (true) {
      int ch = _nextChar();
      if (ch == 0) {
        return _errorToken();
      } else if (ch == quote) {
        if (_maybeEatChar(quote)) {
          if (_maybeEatChar(quote)) {
            return _makeStringToken(buf, false);
          }
          buf.add(quote);
        }
        buf.add(quote);
      } else if (ch == TokenChar.BACKSLASH) {
        var escapeVal = readEscapeSequence();
        if (escapeVal == -1) {
          return _errorToken("invalid hex escape sequence");
        } else {
          buf.add(escapeVal);
        }
      } else {
        buf.add(ch);
      }
    }
  }

  Token _finishOpenBrace() {
    return _finishToken(TokenKind.LBRACE);
  }

  Token _finishCloseBrace() {
    return _finishToken(TokenKind.RBRACE);
  }

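  /**
   * Scans a string literal opened by [quote]; an immediately repeated quote
   * switches to multiline (triple-quoted) handling or yields an empty string.
   */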
  Token finishString(int quote) {
    if (_maybeEatChar(quote)) {
      if (_maybeEatChar(quote)) {
        // skip an initial newline
        _maybeEatChar(TokenChar.NEWLINE);
        return finishMultilineString(quote);
      } else {
        return _makeStringToken(new List<int>(), false);
      }
    }
    return finishStringBody(quote);
  }

  Token finishRawString(int quote) {
    if (_maybeEatChar(quote)) {
      if (_maybeEatChar(quote)) {
        return finishMultilineRawString(quote);
      } else {
        return _makeStringToken(<int>[], false);
      }
    }
    while (true) {
      int ch = _nextChar();
      if (ch == quote) {
        return _makeRawStringToken(false);
      } else if (ch == 0) {
        return _errorToken();
      }
    }
  }

  Token finishMultilineRawString(int quote) {
    while (true) {
      int ch = _nextChar();
      if (ch == 0) {
        return _errorToken();
      } else if (ch == quote && _maybeEatChar(quote) && _maybeEatChar(quote)) {
        return _makeRawStringToken(true);
      }
    }
  }

  Token finishStringBody(int quote) {
    var buf = new List<int>();
    while (true) {
      int ch = _nextChar();
      if (ch == quote) {
        return _makeStringToken(buf, false);
      } else if (ch == 0) {
        return _errorToken();
      } else if (ch == TokenChar.BACKSLASH) {
        var escapeVal = readEscapeSequence();
        if (escapeVal == -1) {
          return _errorToken("invalid hex escape sequence");
        } else {
          buf.add(escapeVal);
        }
      } else {
        buf.add(ch);
      }
    }
  }

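  /**
   * Decodes one backslash escape sequence and returns its code point:
   * simple escapes such as \n and \t map to their control characters,
   * \xHH and \uHHHH (or \u{...}) are read as hex, any other character
   * maps to itself, and -1 signals an invalid sequence.
   */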
  int readEscapeSequence() {
    final ch = _nextChar();
    int hexValue;
    switch (ch) {
      case 110/*n*/:
        return TokenChar.NEWLINE;
      case 114/*r*/:
        return TokenChar.RETURN;
      case 102/*f*/:
        return TokenChar.FF;
      case 98/*b*/:
        return TokenChar.BACKSPACE;
      case 116/*t*/:
        return TokenChar.TAB;
      case 118/*v*/:
        return 11; // vertical tab
      case 120/*x*/:
        hexValue = readHex(2);
        break;
      case 117/*u*/:
        if (_maybeEatChar(TokenChar.LBRACE)) {
          hexValue = readHex();
          if (!_maybeEatChar(TokenChar.RBRACE)) {
            return -1;
          }
        } else {
          hexValue = readHex(4);
        }
        break;
      default:
        return ch;
    }

    if (hexValue == -1) return -1;

    // According to the Unicode standard the high and low surrogate halves
    // used by UTF-16 (U+D800 through U+DFFF) and values above U+10FFFF
    // are not legal Unicode values.
    if (hexValue < 0xD800 || (hexValue > 0xDFFF && hexValue <= 0xFFFF)) {
      return hexValue;
    } else if (hexValue <= 0x10FFFF) {
      messages.error('unicode values greater than 2 bytes not implemented yet',
          _file.span(_startIndex, _startIndex + 1));
      return -1;
    } else {
      return -1;
    }
  }

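  /**
   * A '.' either starts a fractional number (when a digit follows) or is a
   * plain DOT token.
   */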
  Token finishDot() {
    if (TokenizerHelpers.isDigit(_peekChar())) {
      eatDigits();
      return finishNumberExtra(TokenKind.DOUBLE);
    } else {
      return _finishToken(TokenKind.DOT);
    }
  }
}