// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
// Generated by scripts/tokenizer_gen.py.

part of csslib.parser;

/** Tokenizer state to support lookahead for Less' nested selectors. */
class TokenizerState {
  final int index;
  final int startIndex;
  final bool inSelectorExpression;
  final bool inSelector;

  TokenizerState(TokenizerBase base)
      : index = base._index,
        startIndex = base._startIndex,
        inSelectorExpression = base.inSelectorExpression,
        inSelector = base.inSelector;
}

/**
 * The base class for our tokenizer. The hand-coded parts are in this file,
 * with the generated parts in the subclass Tokenizer.
 */
abstract class TokenizerBase {
  final SourceFile _file;
  final String _text;

  bool _inString;

  /**
   * Changes tokenization when in a pseudo-function expression. If true, minus
   * signs are handled as operators instead of as part of identifiers.
   */
  bool inSelectorExpression = false;

  /**
   * Changes tokenization when in selectors. If true, it prevents identifiers
   * from being treated as units; treating them as units would break things
   * like ":lang(fr)" or the (unknown) HTML tag name "px", which is legal to
   * use in a selector.
   */
  // TODO(jmesserly): is this a problem elsewhere? "fr" for example will be
  // processed as a "fraction" unit token, preventing it from working in
  // places where an identifier is expected. This was breaking selectors like:
  //     :lang(fr)
  // The assumption that "fr" always means fraction (and similar issue with
  // other units) doesn't seem valid. We probably should defer this
  // analysis until we reach places in the parser where units are expected.
  // I'm not sure this is tokenizing as described in the specs:
  //     http://dev.w3.org/csswg/css-syntax/
  //     http://dev.w3.org/csswg/selectors4/
  bool inSelector = false;

  int _index = 0;
  int _startIndex = 0;

  TokenizerBase(this._file, this._text, this._inString,
      [this._index = 0]);

  Token next();
  int getIdentifierKind();

  /** Snapshot of Tokenizer scanning state. */
  TokenizerState get mark => new TokenizerState(this);

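  // `mark` and `restore` work together to support speculative scanning.
  // Illustrative use only (not from the original sources); a hypothetical
  // caller might save the state, look ahead, and rewind on failure:
  //
  //     var saved = tokenizer.mark;
  //     // ... try to scan a nested selector ...
  //     tokenizer.restore(saved);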
  /** Restore Tokenizer scanning state. */
  void restore(TokenizerState markedData) {
    _index = markedData.index;
    _startIndex = markedData.startIndex;
    inSelectorExpression = markedData.inSelectorExpression;
    inSelector = markedData.inSelector;
  }

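  /** Reads the next code unit and advances, or returns 0 at end of input. */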
  int _nextChar() {
    if (_index < _text.length) {
      return _text.codeUnitAt(_index++);
    } else {
      return 0;
    }
  }

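  /** Peeks at the next code unit without advancing, or 0 at end of input. */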
  int _peekChar() {
    if (_index < _text.length) {
      return _text.codeUnitAt(_index);
    } else {
      return 0;
    }
  }

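  /** Consumes [ch] if it is the next code unit; returns whether it matched. */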
  bool _maybeEatChar(int ch) {
    if (_index < _text.length) {
      if (_text.codeUnitAt(_index) == ch) {
        _index++;
        return true;
      } else {
        return false;
      }
    } else {
      return false;
    }
  }

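  /** Creates a token of [kind] spanning [_startIndex] to the current index. */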
  Token _finishToken(int kind) {
    return new Token(kind, _file.span(_startIndex, _index));
  }

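  /** Creates an error token over the current span with an optional [message]. */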
  Token _errorToken([String message]) {
    return new ErrorToken(
        TokenKind.ERROR, _file.span(_startIndex, _index), message);
  }

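  /**
   * Consumes a run of whitespace. Outside of a string this produces a
   * WHITESPACE token (a newline ends the token); inside a string the
   * whitespace is skipped and the following token is returned instead.
   */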
  Token finishWhitespace() {
    _index--;
    while (_index < _text.length) {
      final ch = _text.codeUnitAt(_index++);
      if (ch == TokenChar.SPACE ||
          ch == TokenChar.TAB ||
          ch == TokenChar.RETURN) {
        // do nothing
      } else if (ch == TokenChar.NEWLINE) {
        if (!_inString) {
          return _finishToken(TokenKind.WHITESPACE); // note the newline?
        }
      } else {
        _index--;
        if (_inString) {
          return next();
        } else {
          return _finishToken(TokenKind.WHITESPACE);
        }
      }
    }
    return _finishToken(TokenKind.END_OF_FILE);
  }

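  /**
   * Consumes a (possibly nested) multi-line comment. Outside of a string this
   * produces a COMMENT token; inside a string the comment is skipped and the
   * following token is returned. An unterminated comment is an error.
   */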
  Token finishMultiLineComment() {
    int nesting = 1;
    do {
      int ch = _nextChar();
      if (ch == 0) {
        return _errorToken();
      } else if (ch == TokenChar.ASTERISK) {
        if (_maybeEatChar(TokenChar.SLASH)) {
          nesting--;
        }
      } else if (ch == TokenChar.SLASH) {
        if (_maybeEatChar(TokenChar.ASTERISK)) {
          nesting++;
        }
      }
    } while (nesting > 0);

    if (_inString) {
      return next();
    } else {
      return _finishToken(TokenKind.COMMENT);
    }
  }

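  /** Advances past a run of decimal digits. */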
  void eatDigits() {
    while (_index < _text.length) {
      if (TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) {
        _index++;
      } else {
        return;
      }
    }
  }

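  /** Returns the value of hex digit [c], or -1 if it is not a hex digit. */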
  static int _hexDigit(int c) {
    if (c >= 48 /*0*/ && c <= 57 /*9*/) {
      return c - 48;
    } else if (c >= 97 /*a*/ && c <= 102 /*f*/) {
      return c - 87;
    } else if (c >= 65 /*A*/ && c <= 70 /*F*/) {
      return c - 55;
    } else {
      return -1;
    }
  }

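  /**
   * Reads a hex number of exactly [hexLength] digits, or as many hex digits
   * as are available when [hexLength] is omitted. Returns -1 on malformed
   * input.
   */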
  int readHex([int hexLength]) {
    int maxIndex;
    if (hexLength == null) {
      maxIndex = _text.length - 1;
    } else {
      // TODO(jimhug): What if this is too long?
      maxIndex = _index + hexLength;
      if (maxIndex >= _text.length) return -1;
    }
    var result = 0;
    while (_index < maxIndex) {
      final digit = _hexDigit(_text.codeUnitAt(_index));
      if (digit == -1) {
        if (hexLength == null) {
          return result;
        } else {
          return -1;
        }
      }
      // Multiply by 16 rather than shift by 4 since that will result in a
      // correct value for numbers that exceed the 32 bit precision of JS
      // 'integers'.
      // TODO: Figure out a better solution to integer truncation. Issue 638.
      result = (result * 16) + digit;
      _index++;
    }

    return result;
  }

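  /**
   * Scans the rest of a numeric literal, producing an INTEGER or DOUBLE
   * token.
   */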
  Token finishNumber() {
    eatDigits();

    if (_peekChar() == TokenChar.DOT) {
      // Handle the case of 1.toString().
      _nextChar();
      if (TokenizerHelpers.isDigit(_peekChar())) {
        eatDigits();
        return finishNumberExtra(TokenKind.DOUBLE);
      } else {
        _index--;
      }
    }

    return finishNumberExtra(TokenKind.INTEGER);
  }

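  /**
   * Finishes a numeric literal of [kind], consuming an optional exponent and
   * rejecting numbers that run directly into an identifier character.
   */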
  Token finishNumberExtra(int kind) {
    if (_maybeEatChar(101 /*e*/) || _maybeEatChar(69 /*E*/)) {
      kind = TokenKind.DOUBLE;
      // Consume an optional sign for the exponent.
      _maybeEatChar(45 /*-*/);
      _maybeEatChar(43 /*+*/);
      eatDigits();
    }
    if (_peekChar() != 0 && TokenizerHelpers.isIdentifierStart(_peekChar())) {
      _nextChar();
      return _errorToken("illegal character in number");
    }

    return _finishToken(kind);
  }

  Token _makeStringToken(List<int> buf, bool isPart) {
    final s = new String.fromCharCodes(buf);
    final kind = isPart ? TokenKind.STRING_PART : TokenKind.STRING;
    return new LiteralToken(kind, _file.span(_startIndex, _index), s);
  }

  Token makeIEFilter(int start, int end) {
    var filter = _text.substring(start, end);
    return new LiteralToken(TokenKind.STRING, _file.span(start, end), filter);
  }

  Token _makeRawStringToken(bool isMultiline) {
    var s;
    if (isMultiline) {
      // Skip initial newline in multiline strings
      int start = _startIndex + 4;
      if (_text[start] == '\n') start++;
      s = _text.substring(start, _index - 3);
    } else {
      s = _text.substring(_startIndex + 2, _index - 1);
    }
    return new LiteralToken(
        TokenKind.STRING, _file.span(_startIndex, _index), s);
  }

  Token finishMultilineString(int quote) {
    var buf = <int>[];
    while (true) {
      int ch = _nextChar();
      if (ch == 0) {
        return _errorToken();
      } else if (ch == quote) {
        if (_maybeEatChar(quote)) {
          if (_maybeEatChar(quote)) {
            return _makeStringToken(buf, false);
          }
          buf.add(quote);
        }
        buf.add(quote);
      } else if (ch == TokenChar.BACKSLASH) {
        var escapeVal = readEscapeSequence();
        if (escapeVal == -1) {
          return _errorToken("invalid hex escape sequence");
        } else {
          buf.add(escapeVal);
        }
      } else {
        buf.add(ch);
      }
    }
  }

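  /**
   * Scans a string literal opened by [quote]. Handles the empty string and
   * dispatches triple-quoted strings to [finishMultilineString].
   */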
  Token finishString(int quote) {
    if (_maybeEatChar(quote)) {
      if (_maybeEatChar(quote)) {
        // Skip an initial newline.
        _maybeEatChar(TokenChar.NEWLINE);
        return finishMultilineString(quote);
      } else {
        return _makeStringToken(<int>[], false);
      }
    }
    return finishStringBody(quote);
  }

  Token finishRawString(int quote) {
    if (_maybeEatChar(quote)) {
      if (_maybeEatChar(quote)) {
        return finishMultilineRawString(quote);
      } else {
        return _makeStringToken(<int>[], false);
      }
    }
    while (true) {
      int ch = _nextChar();
      if (ch == quote) {
        return _makeRawStringToken(false);
      } else if (ch == 0) {
        return _errorToken();
      }
    }
  }

  Token finishMultilineRawString(int quote) {
    while (true) {
      int ch = _nextChar();
      if (ch == 0) {
        return _errorToken();
      } else if (ch == quote && _maybeEatChar(quote) && _maybeEatChar(quote)) {
        return _makeRawStringToken(true);
      }
    }
  }

  Token finishStringBody(int quote) {
    var buf = <int>[];
    while (true) {
      int ch = _nextChar();
      if (ch == quote) {
        return _makeStringToken(buf, false);
      } else if (ch == 0) {
        return _errorToken();
      } else if (ch == TokenChar.BACKSLASH) {
        var escapeVal = readEscapeSequence();
        if (escapeVal == -1) {
          return _errorToken("invalid hex escape sequence");
        } else {
          buf.add(escapeVal);
        }
      } else {
        buf.add(ch);
      }
    }
  }

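  /**
   * Decodes a backslash escape sequence (e.g. `\n`, `\x41`, or `\u{1F}`) and
   * returns the resulting code unit, or -1 for an invalid escape.
   */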
  int readEscapeSequence() {
    final ch = _nextChar();
    int hexValue;
    switch (ch) {
      case 110 /*n*/ :
        return TokenChar.NEWLINE;
      case 114 /*r*/ :
        return TokenChar.RETURN;
      case 102 /*f*/ :
        return TokenChar.FF;
      case 98 /*b*/ :
        return TokenChar.BACKSPACE;
      case 116 /*t*/ :
        return TokenChar.TAB;
      case 118 /*v*/ :
        return 0x0b; // vertical tab
      case 120 /*x*/ :
        hexValue = readHex(2);
        break;
      case 117 /*u*/ :
        if (_maybeEatChar(TokenChar.LBRACE)) {
          hexValue = readHex();
          if (!_maybeEatChar(TokenChar.RBRACE)) {
            return -1;
          }
        } else {
          hexValue = readHex(4);
        }
        break;
      default:
        return ch;
    }

    if (hexValue == -1) return -1;

    // According to the Unicode standard the high and low surrogate halves
    // used by UTF-16 (U+D800 through U+DFFF) and values above U+10FFFF
    // are not legal Unicode values.
    if (hexValue < 0xD800 || (hexValue > 0xDFFF && hexValue <= 0xFFFF)) {
      return hexValue;
    } else if (hexValue <= 0x10FFFF) {
      messages.error('unicode values greater than 2 bytes not implemented yet',
          _file.span(_startIndex, _startIndex + 1));
      return -1;
    } else {
      return -1;
    }
  }

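  /**
   * Scans a token that started with '.': either a DOUBLE such as '.5' or a
   * plain DOT.
   */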
  Token finishDot() {
    if (TokenizerHelpers.isDigit(_peekChar())) {
      eatDigits();
      return finishNumberExtra(TokenKind.DOUBLE);
    } else {
      return _finishToken(TokenKind.DOT);
    }
  }
}