Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 part of scanner; | 5 part of scanner; |
| 6 | 6 |
| 7 abstract class Scanner { | 7 abstract class Scanner { |
| 8 Token tokenize(); | 8 Token tokenize(); |
| 9 | |
| 10 factory Scanner(SourceFile file, {bool includeComments: false}) { | |
| 11 if (file is Utf8BytesSourceFile) { | |
| 12 return new Utf8BytesScanner(file, includeComments: includeComments); | |
| 13 } else { | |
| 14 return new StringScanner(file, includeComments: includeComments); | |
| 15 } | |
| 16 } | |
| 9 } | 17 } |
| 10 | 18 |
| 11 /** | 19 abstract class AbstractScanner implements Scanner { |
| 12 * Common base class for a Dart scanner. | 20 final bool includeComments; |
| 13 */ | |
| 14 abstract class AbstractScanner<T extends SourceString> implements Scanner { | |
| 15 int advance(); | |
| 16 int nextByte(); | |
| 17 | 21 |
| 18 /** | 22 /** |
| 19 * Returns the current character or byte depending on the underlying input | 23 * The string offset for the next token that will be created. |
| 20 * kind. For example, [StringScanner] operates on [String] and thus returns | 24 * |
| 21 * characters (Unicode codepoints represented as int) whereas | 25 * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values |
| 22 * [ByteArrayScanner] operates on byte arrays and thus returns bytes. | 26 * are different. One string character can be encoded using multiple UTF-8 |
| 27 * bytes. | |
| 28 */ | |
| 29 int tokenStart = -1; | |
| 30 | |
| 31 /** | |
| 32 * A pointer to the token stream created by this scanner. The first token | |
| 33 * is a special token and not part of the source file. This is an | |
| 34 * implementation detail to avoid special cases in the scanner. This token | |
| 35 * is not exposed to clients of the scanner, which are expected to invoke | |
| 36 * [firstToken] to access the token stream. | |
| 37 */ | |
| 38 final Token tokens = new SymbolToken(EOF_INFO, -1); | |
| 39 | |
| 40 /** | |
| 41 * A pointer to the last scanned token. | |
| 42 */ | |
| 43 Token tail; | |
| 44 | |
| 45 /** | |
| 46 * The source file that is being scanned. This field can be [:null:]. | |
| 47 * If the source file is available, the scanner assigns its [:lineStarts:] and | |
| 48 * [:length:] fields at the end of [tokenize]. | |
| 49 */ | |
| 50 final SourceFile file; | |
| 51 | |
| 52 final List<int> lineStarts = [0]; | |
| 53 | |
| 54 AbstractScanner(this.file, this.includeComments) { | |
| 55 this.tail = this.tokens; | |
| 56 } | |
| 57 | |
| 58 | |
| 59 /** | |
| 60 * Advances and returns the next character. | |
| 61 * | |
| 62 * If the next character is non-ASCII, then the returned value depends on the | |
| 63 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while | |
| 64 * the [StringScanner] returns a UTF-16 code unit. | |
| 65 * | |
| 66 * The scanner ensures that [advance] is not invoked after it returned [$EOF]. | |
| 67 * This allows implementations to omit bound checks if the data structure ends | |
| 68 * with '0'. | |
| 69 */ | |
| 70 int advance(); | |
| 71 | |
| 72 /** | |
| 73 * Returns the current unicode character. | |
| 74 * | |
| 75 * If the current character is ASCII, then it is returned unchanged. | |
| 76 * | |
| 77 * The [Utf8BytesScanner] decodes the next unicode code point starting at the | |
| 78 * current position. Note that every unicode character is returned as a single | |
| 79 * code point, i.e., for '\u{1d11e}' it returns 119070, and the following | |
| 80 * [advance] returns the next character. | |
| 81 * | |
| 82 * The [StringScanner] returns the current character unchanged, which might | |
| 83 * be a surrogate character. In the case of '\u{1d11e}', it returns the first | |
| 84 * code unit 55348, and the following [advance] returns the second code unit | |
| 85 * 56606. | |
| 86 * | |
| 87 * Invoking [currentAsUnicode] multiple times is safe, i.e., | |
| 88 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):]. | |
| 89 */ | |
| 90 int currentAsUnicode(int next); | |
| 91 | |
| 92 /** | |
| 93 * Returns the character at the next position. Like in [advance], the | |
| 94 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns | |
| 95 * a UTF-16 code unit. | |
| 23 */ | 96 */ |
| 24 int peek(); | 97 int peek(); |
| 25 | 98 |
| 26 /** | 99 /** |
| 27 * Appends a fixed token based on whether the current char is [choice] or not. | 100 * Notifies the scanner that unicode characters were detected in either a |
| 28 * If the current char is [choice] a fixed token whose kind and content | 101 * comment or a string literal between [startScanOffset] and the current |
| 29 * is determined by [yes] is appended, otherwise a fixed token whose kind | 102 * scan offset. |
| 30 * and content is determined by [no] is appended. | |
| 31 */ | 103 */ |
| 104 void handleUnicode(int startScanOffset); | |
| 105 | |
| 106 /** | |
| 107 * Returns the current scan offset. | |
| 108 * | |
| 109 * In the [Utf8BytesScanner] this is the offset into the byte list, in the | |
| 110 * [StringScanner] the offset in the source string. | |
| 111 */ | |
| 112 int get scanOffset; | |
| 113 | |
| 114 /** | |
| 115 * Returns the current string offset. | |
| 116 * | |
| 117 * In the [StringScanner] this is identical to the [scanOffset]. In the | |
| 118 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters. | |
| 119 */ | |
| 120 int get stringOffset; | |
| 121 | |
| 122 /** | |
| 123 * Returns the first token scanned by this [Scanner]. | |
| 124 */ | |
| 125 Token firstToken(); | |
| 126 | |
| 127 /** | |
| 128 * Returns the last token scanned by this [Scanner]. | |
| 129 */ | |
| 130 Token previousToken(); | |
| 131 | |
| 132 /** | |
| 133 * Notifies that a new token starts at current offset. | |
| 134 */ | |
| 135 void beginToken() { | |
| 136 tokenStart = stringOffset; | |
| 137 } | |
| 138 | |
| 139 /** | |
| 140 * Appends a substring from the scan offset [:start:] to the current | |
| 141 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current | |
| 142 * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the | |
| 143 * substring [5,9). | |
| 144 * | |
| 145 * Note that [extraOffset] can only be used if the covered character(s) are | |
| 146 * known to be ASCII. | |
| 147 */ | |
| 148 void appendSubstringToken(PrecedenceInfo info, int start, | |
| 149 bool asciiOnly, [int extraOffset]); | |
| 150 | |
| 151 /** Documentation in subclass [ArrayBasedScanner] */ | |
|
kasperl
2013/10/17 08:50:39
Terminate these comments /** Documentation in subc
lukas
2013/10/17 17:49:34
Done.
| |
| 152 void appendStringToken(PrecedenceInfo info, String value); | |
| 153 | |
| 154 /** Documentation in subclass [ArrayBasedScanner] */ | |
| 155 void appendPrecedenceToken(PrecedenceInfo info); | |
| 156 | |
| 157 /** Documentation in subclass [ArrayBasedScanner] */ | |
| 32 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); | 158 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); |
| 33 | 159 |
| 34 /** | 160 /** Documentation in subclass [ArrayBasedScanner] */ |
| 35 * Appends a fixed token whose kind and content is determined by [info]. | 161 void appendKeywordToken(Keyword keyword); |
| 36 */ | |
| 37 void appendPrecedenceToken(PrecedenceInfo info); | |
| 38 | 162 |
| 39 /** | 163 /** Documentation in subclass [ArrayBasedScanner] */ |
| 40 * Appends a token whose kind is determined by [info] and content is [value]. | |
| 41 */ | |
| 42 void appendStringToken(PrecedenceInfo info, String value); | |
| 43 | |
| 44 /** | |
| 45 * Appends a token whose kind is determined by [info] and content is defined | |
| 46 * by the SourceString [value]. | |
| 47 */ | |
| 48 void appendByteStringToken(PrecedenceInfo info, T value); | |
| 49 | |
| 50 /** | |
| 51 * Appends a keyword token whose kind is determined by [keyword]. | |
| 52 */ | |
| 53 void appendKeywordToken(Keyword keyword); | |
| 54 void appendWhiteSpace(int next); | |
| 55 void appendEofToken(); | 164 void appendEofToken(); |
| 56 | 165 |
| 57 /** | 166 /** Documentation in subclass [ArrayBasedScanner] */ |
| 58 * Creates an ASCII SourceString whose content begins at the source byte | 167 void appendWhiteSpace(int next); |
| 59 * offset [start] and ends at [offset] bytes from the current byte offset of | |
| 60 * the scanner. For example, if the current byte offset is 10, | |
| 61 * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found | |
| 62 * at the [0,9[ byte interval of the source text. | |
| 63 */ | |
| 64 T asciiString(int start, int offset); | |
| 65 T utf8String(int start, int offset); | |
| 66 Token firstToken(); | |
| 67 Token previousToken(); | |
| 68 void beginToken(); | |
| 69 void addToCharOffset(int offset); | |
| 70 int get charOffset; | |
| 71 int get byteOffset; | |
| 72 void appendBeginGroup(PrecedenceInfo info, String value); | |
| 73 int appendEndGroup(PrecedenceInfo info, String value, int openKind); | |
| 74 void appendGt(PrecedenceInfo info, String value); | |
| 75 void appendGtGt(PrecedenceInfo info, String value); | |
| 76 void appendGtGtGt(PrecedenceInfo info, String value); | |
| 77 void appendComment(); | |
| 78 | 168 |
| 79 /** | 169 /** Documentation in subclass [ArrayBasedScanner] */ |
| 80 * We call this method to discard '<' from the "grouping" stack | 170 void lineFeedInMultiline(); |
| 81 * (maintained by subclasses). | 171 |
| 82 * | 172 /** Documentation in subclass [ArrayBasedScanner] */ |
| 83 * [PartialParser.skipExpression] relies on the fact that we do not | 173 void appendBeginGroup(PrecedenceInfo info); |
| 84 * create groups for stuff like: | 174 |
| 85 * [:a = b < c, d = e > f:]. | 175 /** Documentation in subclass [ArrayBasedScanner] */ |
| 86 * | 176 int appendEndGroup(PrecedenceInfo info, int openKind); |
| 87 * In other words, this method is called when the scanner recognizes | 177 |
| 88 * something which cannot possibly be part of a type | 178 /** Documentation in subclass [ArrayBasedScanner] */ |
| 89 * parameter/argument list. | 179 void appendGt(PrecedenceInfo info); |
| 90 */ | 180 |
| 181 /** Documentation in subclass [ArrayBasedScanner] */ | |
| 182 void appendGtGt(PrecedenceInfo info); | |
| 183 | |
| 184 /** Documentation in subclass [ArrayBasedScanner] */ | |
| 185 void appendComment(start, bool asciiOnly); | |
| 186 | |
| 187 /** Documentation in subclass [ArrayBasedScanner] */ | |
| 91 void discardOpenLt(); | 188 void discardOpenLt(); |
| 92 | 189 |
| 93 // TODO(ahe): Move this class to implementation. | 190 // TODO(ahe): Move this class to implementation. |
| 94 | 191 |
| 95 Token tokenize() { | 192 Token tokenize() { |
| 96 int next = advance(); | 193 int next = advance(); |
| 97 while (!identical(next, $EOF)) { | 194 while (!identical(next, $EOF)) { |
| 98 next = bigSwitch(next); | 195 next = bigSwitch(next); |
| 99 } | 196 } |
| 100 appendEofToken(); | 197 appendEofToken(); |
| 198 | |
| 199 if (file != null) { | |
| 200 file.length = stringOffset; | |
| 201 // One additional line start at the end, see [SourceFile.lineStarts]. | |
| 202 lineStarts.add(stringOffset + 1); | |
| 203 file.lineStarts = lineStarts; | |
| 204 } | |
| 205 | |
| 101 return firstToken(); | 206 return firstToken(); |
| 102 } | 207 } |
| 103 | 208 |
| 104 int bigSwitch(int next) { | 209 int bigSwitch(int next) { |
| 105 beginToken(); | 210 beginToken(); |
| 106 if (identical(next, $SPACE) || identical(next, $TAB) | 211 if (identical(next, $SPACE) || identical(next, $TAB) |
| 107 || identical(next, $LF) || identical(next, $CR)) { | 212 || identical(next, $LF) || identical(next, $CR)) { |
| 108 appendWhiteSpace(next); | 213 appendWhiteSpace(next); |
| 109 next = advance(); | 214 next = advance(); |
| 215 // Sequences of spaces are common, so advance through them fast. | |
| 110 while (identical(next, $SPACE)) { | 216 while (identical(next, $SPACE)) { |
| 111 appendWhiteSpace(next); | 217 // We don't invoke [:appendWhiteSpace(next):] here for efficiency, |
| 218 // assuming that it does not do anything for space characters. | |
| 112 next = advance(); | 219 next = advance(); |
| 113 } | 220 } |
| 114 return next; | 221 return next; |
| 115 } | 222 } |
| 116 | 223 |
| 117 if ($a <= next && next <= $z) { | 224 if ($a <= next && next <= $z) { |
| 118 if (identical($r, next)) { | 225 if (identical($r, next)) { |
| 119 return tokenizeRawStringKeywordOrIdentifier(next); | 226 return tokenizeRawStringKeywordOrIdentifier(next); |
| 120 } | 227 } |
| 121 return tokenizeKeywordOrIdentifier(next, true); | 228 return tokenizeKeywordOrIdentifier(next, true); |
| 122 } | 229 } |
| 123 | 230 |
| 124 if (($A <= next && next <= $Z) || identical(next, $_) || identical(next, $$) ) { | 231 if (($A <= next && next <= $Z) || |
| 125 return tokenizeIdentifier(next, byteOffset, true); | 232 identical(next, $_) || |
| 233 identical(next, $$)) { | |
| 234 return tokenizeIdentifier(next, scanOffset, true); | |
| 126 } | 235 } |
| 127 | 236 |
| 128 if (identical(next, $LT)) { | 237 if (identical(next, $LT)) { |
| 129 return tokenizeLessThan(next); | 238 return tokenizeLessThan(next); |
| 130 } | 239 } |
| 131 | 240 |
| 132 if (identical(next, $GT)) { | 241 if (identical(next, $GT)) { |
| 133 return tokenizeGreaterThan(next); | 242 return tokenizeGreaterThan(next); |
| 134 } | 243 } |
| 135 | 244 |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 180 if (identical(next, $BACKSLASH)) { | 289 if (identical(next, $BACKSLASH)) { |
| 181 appendPrecedenceToken(BACKSLASH_INFO); | 290 appendPrecedenceToken(BACKSLASH_INFO); |
| 182 return advance(); | 291 return advance(); |
| 183 } | 292 } |
| 184 | 293 |
| 185 if (identical(next, $HASH)) { | 294 if (identical(next, $HASH)) { |
| 186 return tokenizeTag(next); | 295 return tokenizeTag(next); |
| 187 } | 296 } |
| 188 | 297 |
| 189 if (identical(next, $OPEN_PAREN)) { | 298 if (identical(next, $OPEN_PAREN)) { |
| 190 appendBeginGroup(OPEN_PAREN_INFO, "("); | 299 appendBeginGroup(OPEN_PAREN_INFO); |
| 191 return advance(); | 300 return advance(); |
| 192 } | 301 } |
| 193 | 302 |
| 194 if (identical(next, $CLOSE_PAREN)) { | 303 if (identical(next, $CLOSE_PAREN)) { |
| 195 return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN); | 304 return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN); |
| 196 } | 305 } |
| 197 | 306 |
| 198 if (identical(next, $COMMA)) { | 307 if (identical(next, $COMMA)) { |
| 199 appendPrecedenceToken(COMMA_INFO); | 308 appendPrecedenceToken(COMMA_INFO); |
| 200 return advance(); | 309 return advance(); |
| 201 } | 310 } |
| 202 | 311 |
| 203 if (identical(next, $COLON)) { | 312 if (identical(next, $COLON)) { |
| 204 appendPrecedenceToken(COLON_INFO); | 313 appendPrecedenceToken(COLON_INFO); |
| 205 return advance(); | 314 return advance(); |
| 206 } | 315 } |
| 207 | 316 |
| 208 if (identical(next, $SEMICOLON)) { | 317 if (identical(next, $SEMICOLON)) { |
| 209 appendPrecedenceToken(SEMICOLON_INFO); | 318 appendPrecedenceToken(SEMICOLON_INFO); |
| 210 // Type parameters and arguments cannot contain semicolon. | 319 // Type parameters and arguments cannot contain semicolon. |
| 211 discardOpenLt(); | 320 discardOpenLt(); |
| 212 return advance(); | 321 return advance(); |
| 213 } | 322 } |
| 214 | 323 |
| 215 if (identical(next, $QUESTION)) { | 324 if (identical(next, $QUESTION)) { |
| 216 appendPrecedenceToken(QUESTION_INFO); | 325 appendPrecedenceToken(QUESTION_INFO); |
| 217 return advance(); | 326 return advance(); |
| 218 } | 327 } |
| 219 | 328 |
| 220 if (identical(next, $CLOSE_SQUARE_BRACKET)) { | 329 if (identical(next, $CLOSE_SQUARE_BRACKET)) { |
| 221 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]", | 330 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, |
| 222 OPEN_SQUARE_BRACKET_TOKEN); | 331 OPEN_SQUARE_BRACKET_TOKEN); |
| 223 } | 332 } |
| 224 | 333 |
| 225 if (identical(next, $BACKPING)) { | 334 if (identical(next, $BACKPING)) { |
| 226 appendPrecedenceToken(BACKPING_INFO); | 335 appendPrecedenceToken(BACKPING_INFO); |
| 227 return advance(); | 336 return advance(); |
| 228 } | 337 } |
| 229 | 338 |
| 230 if (identical(next, $OPEN_CURLY_BRACKET)) { | 339 if (identical(next, $OPEN_CURLY_BRACKET)) { |
| 231 appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{"); | 340 appendBeginGroup(OPEN_CURLY_BRACKET_INFO); |
| 232 return advance(); | 341 return advance(); |
| 233 } | 342 } |
| 234 | 343 |
| 235 if (identical(next, $CLOSE_CURLY_BRACKET)) { | 344 if (identical(next, $CLOSE_CURLY_BRACKET)) { |
| 236 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}", | 345 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, |
| 237 OPEN_CURLY_BRACKET_TOKEN); | 346 OPEN_CURLY_BRACKET_TOKEN); |
| 238 } | 347 } |
| 239 | 348 |
| 240 if (identical(next, $SLASH)) { | 349 if (identical(next, $SLASH)) { |
| 241 return tokenizeSlashOrComment(next); | 350 return tokenizeSlashOrComment(next); |
| 242 } | 351 } |
| 243 | 352 |
| 244 if (identical(next, $AT)) { | 353 if (identical(next, $AT)) { |
| 245 return tokenizeAt(next); | 354 return tokenizeAt(next); |
| 246 } | 355 } |
| 247 | 356 |
| 248 if (identical(next, $DQ) || identical(next, $SQ)) { | 357 if (identical(next, $DQ) || identical(next, $SQ)) { |
| 249 return tokenizeString(next, byteOffset, false); | 358 return tokenizeString(next, scanOffset, false); |
| 250 } | 359 } |
| 251 | 360 |
| 252 if (identical(next, $PERIOD)) { | 361 if (identical(next, $PERIOD)) { |
| 253 return tokenizeDotsOrNumber(next); | 362 return tokenizeDotsOrNumber(next); |
| 254 } | 363 } |
| 255 | 364 |
| 256 if (identical(next, $0)) { | 365 if (identical(next, $0)) { |
| 257 return tokenizeHexOrNumber(next); | 366 return tokenizeHexOrNumber(next); |
| 258 } | 367 } |
| 259 | 368 |
| 260 // TODO(ahe): Would a range check be faster? | 369 // TODO(ahe): Would a range check be faster? |
| 261 if (identical(next, $1) || identical(next, $2) || identical(next, $3) | 370 if (identical(next, $1) || identical(next, $2) || identical(next, $3) |
| 262 || identical(next, $4) || identical(next, $5) || identical(next, $6) | 371 || identical(next, $4) || identical(next, $5) || identical(next, $6) |
| 263 || identical(next, $7) || identical(next, $8) || identical(next, $9)) { | 372 || identical(next, $7) || identical(next, $8) || identical(next, $9)) { |
| 264 return tokenizeNumber(next); | 373 return tokenizeNumber(next); |
| 265 } | 374 } |
| 266 | 375 |
| 267 if (identical(next, $EOF)) { | 376 if (identical(next, $EOF)) { |
| 268 return $EOF; | 377 return $EOF; |
| 269 } | 378 } |
| 270 if (next < 0x1f) { | 379 if (next < 0x1f) { |
| 271 return error(new SourceString("unexpected character $next")); | 380 return error("unexpected character $next"); |
| 381 } | |
| 382 | |
| 383 if (next >= 128) { | |
| 384 next = currentAsUnicode(next); | |
| 272 } | 385 } |
| 273 | 386 |
| 274 // The following are non-ASCII characters. | 387 // The following are non-ASCII characters. |
|
kasperl
2013/10/17 08:50:39
Can the check for $NBSP be guarded by the next >=
lukas
2013/10/17 17:49:34
Actually we can just remove the check for >= 128.
| |
| 275 | 388 |
| 276 if (identical(next, $NBSP)) { | 389 if (identical(next, $NBSP)) { |
| 277 appendWhiteSpace(next); | 390 appendWhiteSpace(next); |
| 278 return advance(); | 391 return advance(); |
| 279 } | 392 } |
| 280 | 393 |
| 281 return tokenizeIdentifier(next, byteOffset, true); | 394 return error("unexpected unicode character $next"); |
| 282 } | 395 } |
| 283 | 396 |
| 284 int tokenizeTag(int next) { | 397 int tokenizeTag(int next) { |
| 285 // # or #!.*[\n\r] | 398 // # or #!.*[\n\r] |
| 286 if (byteOffset == 0) { | 399 if (scanOffset == 0) { |
| 287 if (identical(peek(), $BANG)) { | 400 if (identical(peek(), $BANG)) { |
| 401 int start = scanOffset + 1; | |
| 402 bool asciiOnly = true; | |
| 288 do { | 403 do { |
| 289 next = advance(); | 404 next = advance(); |
| 290 } while (!identical(next, $LF) && !identical(next, $CR) && !identical(next, $EOF)); | 405 if (next > 127) asciiOnly = false; |
| 406 } while (!identical(next, $LF) && | |
| 407 !identical(next, $CR) && | |
| 408 !identical(next, $EOF)); | |
| 409 if (!asciiOnly) handleUnicode(start); | |
| 291 return next; | 410 return next; |
| 292 } | 411 } |
| 293 } | 412 } |
| 294 appendPrecedenceToken(HASH_INFO); | 413 appendPrecedenceToken(HASH_INFO); |
| 295 return advance(); | 414 return advance(); |
| 296 } | 415 } |
| 297 | 416 |
| 298 int tokenizeTilde(int next) { | 417 int tokenizeTilde(int next) { |
| 299 // ~ ~/ ~/= | 418 // ~ ~/ ~/= |
| 300 next = advance(); | 419 next = advance(); |
| 301 if (identical(next, $SLASH)) { | 420 if (identical(next, $SLASH)) { |
| 302 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); | 421 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); |
| 303 } else { | 422 } else { |
| 304 appendPrecedenceToken(TILDE_INFO); | 423 appendPrecedenceToken(TILDE_INFO); |
| 305 return next; | 424 return next; |
| 306 } | 425 } |
| 307 } | 426 } |
| 308 | 427 |
| 309 int tokenizeOpenSquareBracket(int next) { | 428 int tokenizeOpenSquareBracket(int next) { |
| 310 // [ [] []= | 429 // [ [] []= |
| 311 next = advance(); | 430 next = advance(); |
| 312 if (identical(next, $CLOSE_SQUARE_BRACKET)) { | 431 if (identical(next, $CLOSE_SQUARE_BRACKET)) { |
| 313 Token token = previousToken(); | 432 Token token = previousToken(); |
| 314 if (token is KeywordToken && identical(token.value.stringValue, 'operator')) { | 433 if (token is KeywordToken && |
| 434 identical((token as KeywordToken).keyword.syntax, 'operator')) { | |
| 315 return select($EQ, INDEX_EQ_INFO, INDEX_INFO); | 435 return select($EQ, INDEX_EQ_INFO, INDEX_INFO); |
| 316 } | 436 } |
| 317 } | 437 } |
| 318 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "["); | 438 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO); |
| 319 return next; | 439 return next; |
| 320 } | 440 } |
| 321 | 441 |
| 322 int tokenizeCaret(int next) { | 442 int tokenizeCaret(int next) { |
| 323 // ^ ^= | 443 // ^ ^= |
| 324 return select($EQ, CARET_EQ_INFO, CARET_INFO); | 444 return select($EQ, CARET_EQ_INFO, CARET_INFO); |
| 325 } | 445 } |
| 326 | 446 |
| 327 int tokenizeBar(int next) { | 447 int tokenizeBar(int next) { |
| 328 // | || |= | 448 // | || |= |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 372 return advance(); | 492 return advance(); |
| 373 } else if (identical(next, $EQ)) { | 493 } else if (identical(next, $EQ)) { |
| 374 appendPrecedenceToken(MINUS_EQ_INFO); | 494 appendPrecedenceToken(MINUS_EQ_INFO); |
| 375 return advance(); | 495 return advance(); |
| 376 } else { | 496 } else { |
| 377 appendPrecedenceToken(MINUS_INFO); | 497 appendPrecedenceToken(MINUS_INFO); |
| 378 return next; | 498 return next; |
| 379 } | 499 } |
| 380 } | 500 } |
| 381 | 501 |
| 382 | |
| 383 int tokenizePlus(int next) { | 502 int tokenizePlus(int next) { |
| 384 // + ++ += | 503 // + ++ += |
| 385 next = advance(); | 504 next = advance(); |
| 386 if (identical($PLUS, next)) { | 505 if (identical($PLUS, next)) { |
| 387 appendPrecedenceToken(PLUS_PLUS_INFO); | 506 appendPrecedenceToken(PLUS_PLUS_INFO); |
| 388 return advance(); | 507 return advance(); |
| 389 } else if (identical($EQ, next)) { | 508 } else if (identical($EQ, next)) { |
| 390 appendPrecedenceToken(PLUS_EQ_INFO); | 509 appendPrecedenceToken(PLUS_EQ_INFO); |
| 391 return advance(); | 510 return advance(); |
| 392 } else { | 511 } else { |
| 393 appendPrecedenceToken(PLUS_INFO); | 512 appendPrecedenceToken(PLUS_INFO); |
| 394 return next; | 513 return next; |
| 395 } | 514 } |
| 396 } | 515 } |
| 397 | 516 |
| 398 int tokenizeExclamation(int next) { | 517 int tokenizeExclamation(int next) { |
| 399 // ! != !== | 518 // ! != |
| 519 // !== is kept for user-friendly error reporting | |
|
kasperl
2013/10/17 08:50:39
Nit: I'd terminate the ... is kept ... comments wi
lukas
2013/10/17 17:49:34
Done.
| |
| 520 | |
| 400 next = advance(); | 521 next = advance(); |
| 401 if (identical(next, $EQ)) { | 522 if (identical(next, $EQ)) { |
| 402 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); | 523 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); |
| 403 } | 524 } |
| 404 appendPrecedenceToken(BANG_INFO); | 525 appendPrecedenceToken(BANG_INFO); |
| 405 return next; | 526 return next; |
| 406 } | 527 } |
| 407 | 528 |
| 408 int tokenizeEquals(int next) { | 529 int tokenizeEquals(int next) { |
| 409 // = == === | 530 // = == => |
| 531 // === is kept for user-friendly error reporting | |
| 410 | 532 |
| 411 // Type parameters and arguments cannot contain any token that | 533 // Type parameters and arguments cannot contain any token that |
| 412 // starts with '='. | 534 // starts with '='. |
| 413 discardOpenLt(); | 535 discardOpenLt(); |
| 414 | 536 |
| 415 next = advance(); | 537 next = advance(); |
| 416 if (identical(next, $EQ)) { | 538 if (identical(next, $EQ)) { |
| 417 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); | 539 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); |
| 418 } else if (identical(next, $GT)) { | 540 } else if (identical(next, $GT)) { |
| 419 appendPrecedenceToken(FUNCTION_INFO); | 541 appendPrecedenceToken(FUNCTION_INFO); |
| 420 return advance(); | 542 return advance(); |
| 421 } | 543 } |
| 422 appendPrecedenceToken(EQ_INFO); | 544 appendPrecedenceToken(EQ_INFO); |
| 423 return next; | 545 return next; |
| 424 } | 546 } |
| 425 | 547 |
| 426 int tokenizeGreaterThan(int next) { | 548 int tokenizeGreaterThan(int next) { |
| 427 // > >= >> >>= >>> >>>= | 549 // > >= >> >>= |
| 428 next = advance(); | 550 next = advance(); |
| 429 if (identical($EQ, next)) { | 551 if (identical($EQ, next)) { |
| 430 appendPrecedenceToken(GT_EQ_INFO); | 552 appendPrecedenceToken(GT_EQ_INFO); |
| 431 return advance(); | 553 return advance(); |
| 432 } else if (identical($GT, next)) { | 554 } else if (identical($GT, next)) { |
| 433 next = advance(); | 555 next = advance(); |
| 434 if (identical($EQ, next)) { | 556 if (identical($EQ, next)) { |
| 435 appendPrecedenceToken(GT_GT_EQ_INFO); | 557 appendPrecedenceToken(GT_GT_EQ_INFO); |
| 436 return advance(); | 558 return advance(); |
| 437 } else { | 559 } else { |
| 438 appendGtGt(GT_GT_INFO, ">>"); | 560 appendGtGt(GT_GT_INFO); |
| 439 return next; | 561 return next; |
| 440 } | 562 } |
| 441 } else { | 563 } else { |
| 442 appendGt(GT_INFO, ">"); | 564 appendGt(GT_INFO); |
| 443 return next; | 565 return next; |
| 444 } | 566 } |
| 445 } | 567 } |
| 446 | 568 |
| 447 int tokenizeLessThan(int next) { | 569 int tokenizeLessThan(int next) { |
| 448 // < <= << <<= | 570 // < <= << <<= |
| 449 next = advance(); | 571 next = advance(); |
| 450 if (identical($EQ, next)) { | 572 if (identical($EQ, next)) { |
| 451 appendPrecedenceToken(LT_EQ_INFO); | 573 appendPrecedenceToken(LT_EQ_INFO); |
| 452 return advance(); | 574 return advance(); |
| 453 } else if (identical($LT, next)) { | 575 } else if (identical($LT, next)) { |
| 454 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); | 576 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); |
| 455 } else { | 577 } else { |
| 456 appendBeginGroup(LT_INFO, "<"); | 578 appendBeginGroup(LT_INFO); |
| 457 return next; | 579 return next; |
| 458 } | 580 } |
| 459 } | 581 } |
| 460 | 582 |
| 461 int tokenizeNumber(int next) { | 583 int tokenizeNumber(int next) { |
| 462 int start = byteOffset; | 584 int start = scanOffset; |
| 463 while (true) { | 585 while (true) { |
| 464 next = advance(); | 586 next = advance(); |
| 465 if ($0 <= next && next <= $9) { | 587 if ($0 <= next && next <= $9) { |
| 466 continue; | 588 continue; |
| 467 } else if (identical(next, $e) || identical(next, $E)) { | 589 } else if (identical(next, $e) || identical(next, $E)) { |
| 468 return tokenizeFractionPart(next, start); | 590 return tokenizeFractionPart(next, start); |
| 469 } else { | 591 } else { |
| 470 if (identical(next, $PERIOD)) { | 592 if (identical(next, $PERIOD)) { |
| 471 int nextnext = peek(); | 593 int nextnext = peek(); |
| 472 if ($0 <= nextnext && nextnext <= $9) { | 594 if ($0 <= nextnext && nextnext <= $9) { |
| 473 return tokenizeFractionPart(advance(), start); | 595 return tokenizeFractionPart(advance(), start); |
| 474 } | 596 } |
| 475 } | 597 } |
| 476 appendByteStringToken(INT_INFO, asciiString(start, 0)); | 598 appendSubstringToken(INT_INFO, start, true); |
| 477 return next; | 599 return next; |
| 478 } | 600 } |
| 479 } | 601 } |
| 480 } | 602 } |
| 481 | 603 |
| 482 int tokenizeHexOrNumber(int next) { | 604 int tokenizeHexOrNumber(int next) { |
| 483 int x = peek(); | 605 int x = peek(); |
| 484 if (identical(x, $x) || identical(x, $X)) { | 606 if (identical(x, $x) || identical(x, $X)) { |
| 485 advance(); | 607 return tokenizeHex(next); |
| 486 return tokenizeHex(x); | |
| 487 } | 608 } |
| 488 return tokenizeNumber(next); | 609 return tokenizeNumber(next); |
| 489 } | 610 } |
| 490 | 611 |
| 491 int tokenizeHex(int next) { | 612 int tokenizeHex(int next) { |
| 492 int start = byteOffset - 1; | 613 int start = scanOffset; |
| 614 next = advance(); // Advance past the $x or $X. | |
| 493 bool hasDigits = false; | 615 bool hasDigits = false; |
| 494 while (true) { | 616 while (true) { |
| 495 next = advance(); | 617 next = advance(); |
| 496 if (($0 <= next && next <= $9) | 618 if (($0 <= next && next <= $9) |
| 497 || ($A <= next && next <= $F) | 619 || ($A <= next && next <= $F) |
| 498 || ($a <= next && next <= $f)) { | 620 || ($a <= next && next <= $f)) { |
| 499 hasDigits = true; | 621 hasDigits = true; |
| 500 } else { | 622 } else { |
| 501 if (!hasDigits) { | 623 if (!hasDigits) { |
| 502 return error(const SourceString("hex digit expected")); | 624 return error("hex digit expected"); |
| 503 } | 625 } |
| 504 appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0)); | 626 appendSubstringToken(HEXADECIMAL_INFO, start, true); |
| 505 return next; | 627 return next; |
| 506 } | 628 } |
| 507 } | 629 } |
| 508 } | 630 } |
| 509 | 631 |
| 510 int tokenizeDotsOrNumber(int next) { | 632 int tokenizeDotsOrNumber(int next) { |
| 511 int start = byteOffset; | 633 int start = scanOffset; |
| 512 next = advance(); | 634 next = advance(); |
| 513 if (($0 <= next && next <= $9)) { | 635 if (($0 <= next && next <= $9)) { |
| 514 return tokenizeFractionPart(next, start); | 636 return tokenizeFractionPart(next, start); |
| 515 } else if (identical($PERIOD, next)) { | 637 } else if (identical($PERIOD, next)) { |
| 516 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); | 638 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); |
| 517 } else { | 639 } else { |
| 518 appendPrecedenceToken(PERIOD_INFO); | 640 appendPrecedenceToken(PERIOD_INFO); |
| 519 return next; | 641 return next; |
| 520 } | 642 } |
| 521 } | 643 } |
| 522 | 644 |
| 523 int tokenizeFractionPart(int next, int start) { | 645 int tokenizeFractionPart(int next, int start) { |
| 524 bool done = false; | 646 bool done = false; |
| 525 bool hasDigit = false; | 647 bool hasDigit = false; |
| 526 LOOP: while (!done) { | 648 LOOP: while (!done) { |
| 527 if ($0 <= next && next <= $9) { | 649 if ($0 <= next && next <= $9) { |
| 528 hasDigit = true; | 650 hasDigit = true; |
| 529 } else if (identical($e, next) || identical($E, next)) { | 651 } else if (identical($e, next) || identical($E, next)) { |
| 530 hasDigit = true; | 652 hasDigit = true; |
| 531 next = tokenizeExponent(advance()); | 653 next = tokenizeExponent(advance()); |
| 532 done = true; | 654 done = true; |
| 533 continue LOOP; | 655 continue LOOP; |
| 534 } else { | 656 } else { |
| 535 done = true; | 657 done = true; |
| 536 continue LOOP; | 658 continue LOOP; |
| 537 } | 659 } |
| 538 next = advance(); | 660 next = advance(); |
| 539 } | 661 } |
| 540 if (!hasDigit) { | 662 if (!hasDigit) { |
| 541 appendByteStringToken(INT_INFO, asciiString(start, -1)); | 663 // Reduce offset, we already advanced to the token past the period. |
| 664 appendSubstringToken(INT_INFO, start, true, -1); | |
| 665 | |
| 666 // TODO(ahe): Wrong offset for the period. Cannot call beginToken because | |
| 667 // the scanner already advanced past the period. | |
| 542 if (identical($PERIOD, next)) { | 668 if (identical($PERIOD, next)) { |
| 543 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); | 669 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); |
| 544 } | 670 } |
| 545 // TODO(ahe): Wrong offset for the period. | |
| 546 appendPrecedenceToken(PERIOD_INFO); | 671 appendPrecedenceToken(PERIOD_INFO); |
| 547 return bigSwitch(next); | 672 return next; |
| 548 } | 673 } |
| 549 appendByteStringToken(DOUBLE_INFO, asciiString(start, 0)); | 674 appendSubstringToken(DOUBLE_INFO, start, true); |
| 550 return next; | 675 return next; |
| 551 } | 676 } |
| 552 | 677 |
| 553 int tokenizeExponent(int next) { | 678 int tokenizeExponent(int next) { |
| 554 if (identical(next, $PLUS) || identical(next, $MINUS)) { | 679 if (identical(next, $PLUS) || identical(next, $MINUS)) { |
| 555 next = advance(); | 680 next = advance(); |
| 556 } | 681 } |
| 557 bool hasDigits = false; | 682 bool hasDigits = false; |
| 558 while (true) { | 683 while (true) { |
| 559 if ($0 <= next && next <= $9) { | 684 if ($0 <= next && next <= $9) { |
| 560 hasDigits = true; | 685 hasDigits = true; |
| 561 } else { | 686 } else { |
| 562 if (!hasDigits) { | 687 if (!hasDigits) { |
| 563 return error(const SourceString("digit expected")); | 688 return error("digit expected"); |
| 564 } | 689 } |
| 565 return next; | 690 return next; |
| 566 } | 691 } |
| 567 next = advance(); | 692 next = advance(); |
| 568 } | 693 } |
| 569 } | 694 } |
| 570 | 695 |
| 571 int tokenizeSlashOrComment(int next) { | 696 int tokenizeSlashOrComment(int next) { |
| 697 int start = scanOffset; | |
| 572 next = advance(); | 698 next = advance(); |
| 573 if (identical($STAR, next)) { | 699 if (identical($STAR, next)) { |
| 574 return tokenizeMultiLineComment(next); | 700 return tokenizeMultiLineComment(next, start); |
| 575 } else if (identical($SLASH, next)) { | 701 } else if (identical($SLASH, next)) { |
| 576 return tokenizeSingleLineComment(next); | 702 return tokenizeSingleLineComment(next, start); |
| 577 } else if (identical($EQ, next)) { | 703 } else if (identical($EQ, next)) { |
| 578 appendPrecedenceToken(SLASH_EQ_INFO); | 704 appendPrecedenceToken(SLASH_EQ_INFO); |
| 579 return advance(); | 705 return advance(); |
| 580 } else { | 706 } else { |
| 581 appendPrecedenceToken(SLASH_INFO); | 707 appendPrecedenceToken(SLASH_INFO); |
| 582 return next; | 708 return next; |
| 583 } | 709 } |
| 584 } | 710 } |
| 585 | 711 |
| 586 int tokenizeSingleLineComment(int next) { | 712 int tokenizeSingleLineComment(int next, int start) { |
| 713 bool asciiOnly = true; | |
| 587 while (true) { | 714 while (true) { |
| 588 next = advance(); | 715 next = advance(); |
| 589 if (identical($LF, next) || identical($CR, next) || identical($EOF, next)) { | 716 if (next > 127) asciiOnly = false; |
| 590 appendComment(); | 717 if (identical($LF, next) || |
| 718 identical($CR, next) || | |
| 719 identical($EOF, next)) { | |
| 720 if (!asciiOnly) handleUnicode(start); | |
| 721 appendComment(start, asciiOnly); | |
| 591 return next; | 722 return next; |
| 592 } | 723 } |
| 593 } | 724 } |
| 594 } | 725 } |
| 595 | 726 |
| 596 int tokenizeMultiLineComment(int next) { | 727 |
| 728 int tokenizeMultiLineComment(int next, int start) { | |
| 729 bool asciiOnlyComment = true; // Track if the entire comment is ASCII. | |
| 730 bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode. | |
| 731 int unicodeStart = start; | |
| 597 int nesting = 1; | 732 int nesting = 1; |
| 598 next = advance(); | 733 next = advance(); |
| 599 while (true) { | 734 while (true) { |
| 600 if (identical($EOF, next)) { | 735 if (identical($EOF, next)) { |
| 601 // TODO(ahe): Report error. | 736 if (!asciiOnlyLines) handleUnicode(unicodeStart); |
| 737 appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment"); | |
| 602 return next; | 738 return next; |
| 603 } else if (identical($STAR, next)) { | 739 } else if (identical($STAR, next)) { |
| 604 next = advance(); | 740 next = advance(); |
| 605 if (identical($SLASH, next)) { | 741 if (identical($SLASH, next)) { |
| 606 --nesting; | 742 --nesting; |
| 607 if (0 == nesting) { | 743 if (0 == nesting) { |
| 744 if (!asciiOnlyLines) handleUnicode(unicodeStart); | |
| 608 next = advance(); | 745 next = advance(); |
| 609 appendComment(); | 746 appendComment(start, asciiOnlyComment); |
| 610 return next; | 747 return next; |
| 611 } else { | 748 } else { |
| 612 next = advance(); | 749 next = advance(); |
| 613 } | 750 } |
| 614 } | 751 } |
| 615 } else if (identical($SLASH, next)) { | 752 } else if (identical($SLASH, next)) { |
| 616 next = advance(); | 753 next = advance(); |
| 617 if (identical($STAR, next)) { | 754 if (identical($STAR, next)) { |
| 618 next = advance(); | 755 next = advance(); |
| 619 ++nesting; | 756 ++nesting; |
| 620 } | 757 } |
| 758 } else if (identical(next, $LF)) { | |
| 759 if (!asciiOnlyLines) { | |
| 760 // Synchronize the string offset in the utf8 scanner. | |
| 761 handleUnicode(unicodeStart); | |
| 762 asciiOnlyLines = true; | |
| 763 unicodeStart = scanOffset; | |
| 764 } | |
| 765 lineFeedInMultiline(); | |
| 766 next = advance(); | |
| 621 } else { | 767 } else { |
| 768 if (next > 127) { | |
| 769 asciiOnlyLines = false; | |
| 770 asciiOnlyComment = false; | |
| 771 } | |
| 622 next = advance(); | 772 next = advance(); |
| 623 } | 773 } |
| 624 } | 774 } |
| 625 } | 775 } |
| 626 | 776 |
| 627 int tokenizeRawStringKeywordOrIdentifier(int next) { | 777 int tokenizeRawStringKeywordOrIdentifier(int next) { |
| 778 // [next] is $r. | |
| 628 int nextnext = peek(); | 779 int nextnext = peek(); |
| 629 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) { | 780 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) { |
| 630 int start = byteOffset; | 781 int start = scanOffset; |
| 631 next = advance(); | 782 next = advance(); |
| 632 return tokenizeString(next, start, true); | 783 return tokenizeString(next, start, true); |
| 633 } | 784 } |
| 634 return tokenizeKeywordOrIdentifier(next, true); | 785 return tokenizeKeywordOrIdentifier(next, true); |
| 635 } | 786 } |
| 636 | 787 |
| 637 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { | 788 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { |
| 638 KeywordState state = KeywordState.KEYWORD_STATE; | 789 KeywordState state = KeywordState.KEYWORD_STATE; |
| 639 int start = byteOffset; | 790 int start = scanOffset; |
| 640 while (state != null && $a <= next && next <= $z) { | 791 while (state != null && $a <= next && next <= $z) { |
| 641 state = state.next(next); | 792 state = state.next(next); |
| 642 next = advance(); | 793 next = advance(); |
| 643 } | 794 } |
| 644 if (state == null || state.keyword == null) { | 795 if (state == null || state.keyword == null) { |
| 645 return tokenizeIdentifier(next, start, allowDollar); | 796 return tokenizeIdentifier(next, start, allowDollar); |
| 646 } | 797 } |
| 647 if (($A <= next && next <= $Z) || | 798 if (($A <= next && next <= $Z) || |
| 648 ($0 <= next && next <= $9) || | 799 ($0 <= next && next <= $9) || |
| 649 identical(next, $_) || | 800 identical(next, $_) || |
| 650 identical(next, $$)) { | 801 identical(next, $$)) { |
| 651 return tokenizeIdentifier(next, start, allowDollar); | 802 return tokenizeIdentifier(next, start, allowDollar); |
| 652 } else if (next < 128) { | 803 } else { |
| 653 appendKeywordToken(state.keyword); | 804 appendKeywordToken(state.keyword); |
| 654 return next; | 805 return next; |
| 655 } else { | |
| 656 return tokenizeIdentifier(next, start, allowDollar); | |
| 657 } | 806 } |
| 658 } | 807 } |
| 659 | 808 |
| 809 /** | |
| 810 * [allowDollar] can exclude '$', which is not allowed as part of a string | |
| 811 * interpolation identifier. | |
| 812 */ | |
| 660 int tokenizeIdentifier(int next, int start, bool allowDollar) { | 813 int tokenizeIdentifier(int next, int start, bool allowDollar) { |
| 661 bool isAscii = true; | |
| 662 | |
| 663 while (true) { | 814 while (true) { |
| 664 if (($a <= next && next <= $z) || | 815 if (($a <= next && next <= $z) || |
| 665 ($A <= next && next <= $Z) || | 816 ($A <= next && next <= $Z) || |
| 666 ($0 <= next && next <= $9) || | 817 ($0 <= next && next <= $9) || |
| 667 identical(next, $_) || | 818 identical(next, $_) || |
| 668 (identical(next, $$) && allowDollar)) { | 819 (identical(next, $$) && allowDollar)) { |
| 669 next = advance(); | 820 next = advance(); |
| 670 } else if ((next < 128) || (identical(next, $NBSP))) { | 821 } else { |
| 671 // Identifier ends here. | 822 // Identifier ends here. |
| 672 if (start == byteOffset) { | 823 if (start == scanOffset) { |
| 673 return error(const SourceString("expected identifier")); | 824 return error("expected identifier"); |
| 674 } else if (isAscii) { | |
| 675 appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0)); | |
| 676 } else { | 825 } else { |
| 677 appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1)); | 826 appendSubstringToken(IDENTIFIER_INFO, start, true); |
| 678 } | 827 } |
| 679 return next; | 828 return next; |
| 680 } else { | |
| 681 int nonAsciiStart = byteOffset; | |
| 682 do { | |
| 683 next = nextByte(); | |
| 684 if (identical(next, $NBSP)) break; | |
| 685 } while (next > 127); | |
| 686 String string = utf8String(nonAsciiStart, -1).slowToString(); | |
| 687 isAscii = false; | |
| 688 int byteLength = nonAsciiStart - byteOffset; | |
| 689 addToCharOffset(string.length - byteLength); | |
| 690 } | 829 } |
| 691 } | 830 } |
| 692 } | 831 } |
| 693 | 832 |
| 694 int tokenizeAt(int next) { | 833 int tokenizeAt(int next) { |
| 695 int start = byteOffset; | |
| 696 next = advance(); | |
| 697 appendPrecedenceToken(AT_INFO); | 834 appendPrecedenceToken(AT_INFO); |
| 698 return next; | 835 return advance(); |
| 699 } | 836 } |
| 700 | 837 |
| 701 int tokenizeString(int next, int start, bool raw) { | 838 int tokenizeString(int next, int start, bool raw) { |
| 702 int quoteChar = next; | 839 int quoteChar = next; |
| 703 next = advance(); | 840 next = advance(); |
| 704 if (identical(quoteChar, next)) { | 841 if (identical(quoteChar, next)) { |
| 705 next = advance(); | 842 next = advance(); |
| 706 if (identical(quoteChar, next)) { | 843 if (identical(quoteChar, next)) { |
| 707 // Multiline string. | 844 // Multiline string. |
| 708 return tokenizeMultiLineString(quoteChar, start, raw); | 845 return tokenizeMultiLineString(quoteChar, start, raw); |
| 709 } else { | 846 } else { |
| 710 // Empty string. | 847 // Empty string. |
| 711 appendByteStringToken(STRING_INFO, utf8String(start, -1)); | 848 appendSubstringToken(STRING_INFO, start, true); |
| 712 return next; | 849 return next; |
| 713 } | 850 } |
| 714 } | 851 } |
| 715 if (raw) { | 852 if (raw) { |
| 716 return tokenizeSingleLineRawString(next, quoteChar, start); | 853 return tokenizeSingleLineRawString(next, quoteChar, start); |
| 717 } else { | 854 } else { |
| 718 return tokenizeSingleLineString(next, quoteChar, start); | 855 return tokenizeSingleLineString(next, quoteChar, start); |
| 719 } | 856 } |
| 720 } | 857 } |
| 721 | 858 |
| 722 static bool isHexDigit(int character) { | 859 /** |
| 723 if ($0 <= character && character <= $9) return true; | 860 * [next] is the first character after the quote. |
| 724 character |= 0x20; | 861 * [start] is the scanOffset of the quote. |
| 725 return ($a <= character && character <= $f); | 862 * |
| 726 } | 863 * The token contains a substring of the source file, including the |
| 727 | 864 * string quotes and backslashes for escaping. For interpolated strings, |
| 865 * the parts before and after are separate tokens. | |
| 866 * | |
| 867 * "a $b c" | |
| 868 * | |
| 869 * gives StringToken("a $), StringToken(b) and StringToken( c"). | |
| 870 */ | |
| 728 int tokenizeSingleLineString(int next, int quoteChar, int start) { | 871 int tokenizeSingleLineString(int next, int quoteChar, int start) { |
| 872 bool asciiOnly = true; | |
| 729 while (!identical(next, quoteChar)) { | 873 while (!identical(next, quoteChar)) { |
| 730 if (identical(next, $BACKSLASH)) { | 874 if (identical(next, $BACKSLASH)) { |
| 731 next = advance(); | 875 next = advance(); |
| 732 } else if (identical(next, $$)) { | 876 } else if (identical(next, $$)) { |
| 733 next = tokenizeStringInterpolation(start); | 877 if (!asciiOnly) handleUnicode(start); |
| 734 start = byteOffset; | 878 next = tokenizeStringInterpolation(start, asciiOnly); |
| 879 start = scanOffset; | |
| 880 asciiOnly = true; | |
| 735 continue; | 881 continue; |
| 736 } | 882 } |
| 737 if (next <= $CR | 883 if (next <= $CR |
| 738 && (identical(next, $LF) || identical(next, $CR) || identical(next, $E OF))) { | 884 && (identical(next, $LF) || |
| 739 return error(const SourceString("unterminated string literal")); | 885 identical(next, $CR) || |
| 886 identical(next, $EOF))) { | |
| 887 if (!asciiOnly) handleUnicode(start); | |
| 888 return error("unterminated string literal"); | |
| 740 } | 889 } |
| 890 if (next > 127) asciiOnly = false; | |
| 741 next = advance(); | 891 next = advance(); |
| 742 } | 892 } |
| 743 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 893 if (!asciiOnly) handleUnicode(start); |
| 744 return advance(); | 894 // Advance past the quote character. |
| 895 next = advance(); | |
| 896 appendSubstringToken(STRING_INFO, start, asciiOnly); | |
| 897 return next; | |
| 745 } | 898 } |
| 746 | 899 |
| 747 int tokenizeStringInterpolation(int start) { | 900 int tokenizeStringInterpolation(int start, bool asciiOnly) { |
| 748 appendByteStringToken(STRING_INFO, utf8String(start, -1)); | 901 appendSubstringToken(STRING_INFO, start, asciiOnly); |
| 749 beginToken(); // $ starts here. | 902 beginToken(); // $ starts here. |
| 750 int next = advance(); | 903 int next = advance(); |
| 751 if (identical(next, $OPEN_CURLY_BRACKET)) { | 904 if (identical(next, $OPEN_CURLY_BRACKET)) { |
| 752 return tokenizeInterpolatedExpression(next, start); | 905 return tokenizeInterpolatedExpression(next); |
| 753 } else { | 906 } else { |
| 754 return tokenizeInterpolatedIdentifier(next, start); | 907 return tokenizeInterpolatedIdentifier(next); |
| 755 } | 908 } |
| 756 } | 909 } |
| 757 | 910 |
| 758 int tokenizeInterpolatedExpression(int next, int start) { | 911 int tokenizeInterpolatedExpression(int next) { |
| 759 appendBeginGroup(STRING_INTERPOLATION_INFO, "\${"); | 912 appendBeginGroup(STRING_INTERPOLATION_INFO); |
| 760 beginToken(); // The expression starts here. | 913 beginToken(); // The expression starts here. |
| 761 next = advance(); | 914 next = advance(); // Move past the curly bracket. |
| 762 while (!identical(next, $EOF) && !identical(next, $STX)) { | 915 while (!identical(next, $EOF) && !identical(next, $STX)) { |
| 763 next = bigSwitch(next); | 916 next = bigSwitch(next); |
| 764 } | 917 } |
| 765 if (identical(next, $EOF)) return next; | 918 if (identical(next, $EOF)) return next; |
| 766 next = advance(); | 919 next = advance(); // Move past the $STX. |
| 767 beginToken(); // The string interpolation suffix starts here. | 920 beginToken(); // The string interpolation suffix starts here. |
| 768 return next; | 921 return next; |
| 769 } | 922 } |
| 770 | 923 |
| 771 int tokenizeInterpolatedIdentifier(int next, int start) { | 924 int tokenizeInterpolatedIdentifier(int next) { |
| 772 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO); | 925 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO); |
| 773 beginToken(); // The identifier starts here. | 926 beginToken(); // The identifier starts here. |
| 774 next = tokenizeKeywordOrIdentifier(next, false); | 927 next = tokenizeKeywordOrIdentifier(next, false); |
| 775 beginToken(); // The string interpolation suffix starts here. | 928 beginToken(); // The string interpolation suffix starts here. |
| 776 return next; | 929 return next; |
| 777 } | 930 } |
| 778 | 931 |
| 779 int tokenizeSingleLineRawString(int next, int quoteChar, int start) { | 932 int tokenizeSingleLineRawString(int next, int quoteChar, int start) { |
| 780 next = advance(); | 933 bool asciiOnly = true; |
| 934 next = advance(); // Advance past the quote | |
| 781 while (next != $EOF) { | 935 while (next != $EOF) { |
| 782 if (identical(next, quoteChar)) { | 936 if (identical(next, quoteChar)) { |
| 783 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 937 if (!asciiOnly) handleUnicode(start); |
| 784 return advance(); | 938 next = advance(); |
| 939 appendSubstringToken(STRING_INFO, start, asciiOnly); | |
| 940 return next; | |
| 785 } else if (identical(next, $LF) || identical(next, $CR)) { | 941 } else if (identical(next, $LF) || identical(next, $CR)) { |
| 786 return error(const SourceString("unterminated string literal")); | 942 if (!asciiOnly) handleUnicode(start); |
| 943 return error("unterminated string literal"); | |
| 944 } else if (next > 127) { | |
| 945 asciiOnly = false; | |
| 787 } | 946 } |
| 788 next = advance(); | 947 next = advance(); |
| 789 } | 948 } |
| 790 return error(const SourceString("unterminated string literal")); | 949 if (!asciiOnly) handleUnicode(start); |
| 950 return error("unterminated string literal"); | |
| 791 } | 951 } |
| 792 | 952 |
| 793 int tokenizeMultiLineRawString(int quoteChar, int start) { | 953 int tokenizeMultiLineRawString(int quoteChar, int start) { |
| 794 int next = advance(); | 954 bool asciiOnlyString = true; |
| 955 bool asciiOnlyLine = true; | |
| 956 int unicodeStart = start; | |
| 957 int next = advance(); // Advance past the (last) quote (of three) | |
| 795 outer: while (!identical(next, $EOF)) { | 958 outer: while (!identical(next, $EOF)) { |
| 796 while (!identical(next, quoteChar)) { | 959 while (!identical(next, quoteChar)) { |
| 960 if (identical(next, $LF)) { | |
| 961 if (!asciiOnlyLine) { | |
| 962 // Synchronize the string offset in the utf8 scanner. | |
| 963 handleUnicode(unicodeStart); | |
| 964 asciiOnlyLine = true; | |
| 965 unicodeStart = scanOffset; | |
| 966 } | |
| 967 lineFeedInMultiline(); | |
| 968 } else if (next > 127) { | |
| 969 asciiOnlyLine = false; | |
| 970 asciiOnlyString = false; | |
| 971 } | |
| 797 next = advance(); | 972 next = advance(); |
| 798 if (identical(next, $EOF)) break outer; | 973 if (identical(next, $EOF)) break outer; |
| 799 } | 974 } |
| 800 next = advance(); | 975 next = advance(); |
| 801 if (identical(next, quoteChar)) { | 976 if (identical(next, quoteChar)) { |
| 802 next = advance(); | 977 next = advance(); |
| 803 if (identical(next, quoteChar)) { | 978 if (identical(next, quoteChar)) { |
| 804 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 979 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
| 805 return advance(); | 980 next = advance(); |
| 981 appendSubstringToken(STRING_INFO, start, asciiOnlyString); | |
| 982 return next; | |
| 806 } | 983 } |
| 807 } | 984 } |
| 808 } | 985 } |
| 809 return error(const SourceString("unterminated string literal")); | 986 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
| 987 return error("unterminated string literal"); | |
| 810 } | 988 } |
| 811 | 989 |
| 812 int tokenizeMultiLineString(int quoteChar, int start, bool raw) { | 990 int tokenizeMultiLineString(int quoteChar, int start, bool raw) { |
| 813 if (raw) return tokenizeMultiLineRawString(quoteChar, start); | 991 if (raw) return tokenizeMultiLineRawString(quoteChar, start); |
| 814 int next = advance(); | 992 bool asciiOnlyString = true; |
| 993 bool asciiOnlyLine = true; | |
| 994 int unicodeStart = start; | |
| 995 int next = advance(); // Advance past the (last) quote (of three). | |
| 815 while (!identical(next, $EOF)) { | 996 while (!identical(next, $EOF)) { |
| 816 if (identical(next, $$)) { | 997 if (identical(next, $$)) { |
| 817 next = tokenizeStringInterpolation(start); | 998 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
| 818 start = byteOffset; | 999 next = tokenizeStringInterpolation(start, asciiOnlyString); |
| 1000 start = scanOffset; | |
| 1001 unicodeStart = start; | |
| 1002 asciiOnlyString = true; // A new string token is created for the rest. | |
| 1003 asciiOnlyLine = true; | |
| 819 continue; | 1004 continue; |
| 820 } | 1005 } |
| 821 if (identical(next, quoteChar)) { | 1006 if (identical(next, quoteChar)) { |
| 822 next = advance(); | 1007 next = advance(); |
| 823 if (identical(next, quoteChar)) { | 1008 if (identical(next, quoteChar)) { |
| 824 next = advance(); | 1009 next = advance(); |
| 825 if (identical(next, quoteChar)) { | 1010 if (identical(next, quoteChar)) { |
| 826 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 1011 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
| 827 return advance(); | 1012 next = advance(); |
| 1013 appendSubstringToken(STRING_INFO, start, asciiOnlyString); | |
| 1014 return next; | |
| 828 } | 1015 } |
| 829 } | 1016 } |
| 830 continue; | 1017 continue; |
| 831 } | 1018 } |
| 832 if (identical(next, $BACKSLASH)) { | 1019 if (identical(next, $BACKSLASH)) { |
| 833 next = advance(); | 1020 next = advance(); |
| 834 if (identical(next, $EOF)) break; | 1021 if (identical(next, $EOF)) break; |
| 835 } | 1022 } |
| 1023 if (identical(next, $LF)) { | |
| 1024 if (!asciiOnlyLine) { | |
| 1025 // Synchronize the string offset in the utf8 scanner. | |
| 1026 handleUnicode(unicodeStart); | |
| 1027 asciiOnlyLine = true; | |
| 1028 unicodeStart = scanOffset; | |
| 1029 } | |
| 1030 lineFeedInMultiline(); | |
| 1031 } else if (next > 127) { | |
| 1032 asciiOnlyString = false; | |
| 1033 asciiOnlyLine = false; | |
| 1034 } | |
| 836 next = advance(); | 1035 next = advance(); |
| 837 } | 1036 } |
| 838 return error(const SourceString("unterminated string literal")); | 1037 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
| 1038 return error("unterminated string literal"); | |
| 839 } | 1039 } |
| 840 | 1040 |
| 841 int error(SourceString message) { | 1041 int error(String message) { |
| 842 appendByteStringToken(BAD_INPUT_INFO, message); | 1042 appendStringToken(BAD_INPUT_INFO, message); |
| 843 return advance(); // Ensure progress. | 1043 return advance(); // Ensure progress. |
| 844 } | 1044 } |
| 1045 | |
| 1046 void unmatchedBeginGroup(BeginGroupToken begin) { | |
| 1047 String error = 'unmatched "${begin.stringValue}"'; | |
| 1048 Token close = | |
| 1049 new StringToken.fromString( | |
| 1050 BAD_INPUT_INFO, error, begin.charOffset, true); | |
| 1051 | |
| 1052 // We want to ensure that unmatched BeginGroupTokens are reported | |
| 1053 // as errors. However, the rest of the parser assume the groups | |
| 1054 // are well-balanced and will never look at the endGroup | |
| 1055 // token. This is a nice property that allows us to skip quickly | |
| 1056 // over correct code. By inserting an additional error token in | |
| 1057 // the stream, we can keep ignoring endGroup tokens. | |
| 1058 // | |
| 1059 // [begin] --next--> [tail] | |
| 1060 // [begin] --endG--> [close] --next--> [next] --next--> [tail] | |
| 1061 // | |
| 1062 // This allows the parser to skip from [begin] via endGroup to [close] and | |
| 1063 // ignore the [close] token (assuming it's correct), then the error will be | |
| 1064 // reported when parsing the [next] token. | |
| 1065 | |
| 1066 Token next = new StringToken.fromString( | |
| 1067 BAD_INPUT_INFO, error, begin.charOffset, true); | |
| 1068 begin.endGroup = close; | |
| 1069 close.next = next; | |
| 1070 next.next = begin.next; | |
| 1071 } | |
| 845 } | 1072 } |
| OLD | NEW |