sdk/lib/_internal/compiler/implementation/scanner/scanner.dart - Issue 27510003: Scanner for UTF-8 byte arrays

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« sdk/lib/_internal/compiler/implementation/js_emitter/code_emitter_task.dart ('K') | « sdk/lib/_internal/compiler/implementation/scanner/parser.dart ('k') | sdk/lib/_internal/compiler/implementation/scanner/scanner_task.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of scanner;	5 part of scanner;

6	6

7 abstract class Scanner {	7 abstract class Scanner {

8 Token tokenize();	8 Token tokenize();

	9

	10 factory Scanner(SourceFile file, {bool includeComments: false}) {

	11 if (file is Utf8BytesSourceFile) {

	12 return new Utf8BytesScanner(file, includeComments: includeComments);

	13 } else {

	14 return new StringScanner(file, includeComments: includeComments);

	15 }

	16 }

9 }	17 }

10	18

11 /**	19 abstract class AbstractScanner implements Scanner {

12 * Common base class for a Dart scanner.	20 final bool includeComments;

13 */	21

14 abstract class AbstractScanner<T extends SourceString> implements Scanner {	22 /**

	23 * The string offset for the next token that will be created.

	24 *

	25 * Note that in the [Utf8BytesScanner], string offsets and [scanOffset] values

	26 * are different. One string character can be encoded using multiple UTF-8

	27 * bytes.

	28 */

	29 int tokenStart = -1;

	30

	31 /**

	32 * A pointer to the token stream created by this scanner. The first token

	33 * is a special token and not part of the source file. This is an

	34 * implementation detail to avoid special cases in the scanner. This token

	35 * is not exposed to clients of the scanner, which are expected to invoke

	36 * [firstToken] to access the token stream.

	37 */

	38 final Token tokens = new SymbolToken(EOF_INFO, -1);

	39

	40 /**

	41 * A pointer to the last scanned token.

	42 */

	43 Token tail;

	44

	45 /**

	46 * The stack of open groups, e.g [: { ... ( .. :]

	47 * Each BeginGroupToken has a pointer to the token where the group

	48 * ends. This field is set when scanning the end group token.

	49 */

	50 Link<BeginGroupToken> groupingStack = const Link<BeginGroupToken>();

	51

	52 /**

	53 * The source file that is being scanned. This field can be [:null:].

	54 * If the source file is available, the scanner assigns its [:lineStarts:] and

	55 * [:length:] fields at the end of [tokenize].

	56 */

	57 final SourceFile file;

	58

	59 final List<int> lineStarts = [0];

	60

	61 AbstractScanner(this.file, this.includeComments) {

	62 this.tail = this.tokens;

	63 }

	64

	65

	66 /**

	67 * Advances and returns the next character.

	68 *

	69 * If the next character is non-ASCII, then the returned value depends on the

	70 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while

	71 * the [StringScanner] returns a UTF-16 code unit.

	72 *

	73 * The scanner ensures that [advance] is not invoked after it returned [$EOF].

	74 * This allows implementations to omit bound checks if the data structure ends

	75 * with '0'.

	76 */

15 int advance();	77 int advance();

16 int nextByte();	78

17	79 /**

18 /**	80 * Returns the current unicode character.

19 * Returns the current character or byte depending on the underlying input	81 *

20 * kind. For example, [StringScanner] operates on [String] and thus returns	82 * If the current character is ASCII, then it is returned unchanged.

21 * characters (Unicode codepoints represented as int) whereas	83 *

22 * [ByteArrayScanner] operates on byte arrays and thus returns bytes.	84 * The [Utf8BytesScanner] decodes the next unicode code point starting at the

	85 * current position. Note that every unicode character is returned as a single

	86 * code point, i.e., for '\u{1d11e}' it returns 119070, and the following

	87 * [advance] returns the next character.

	88 *

	89 * The [StringScanner] returns the current character unchanged, which might

	90 * be a surrogate character. In the case of '\u{1d11e}', it returns the first

	91 * code unit 55348, and the following [advance] returns the second code unit

	92 * 56606.

	93 *

	94 * Invoking [currentAsUnicode] multiple times is safe, i.e.,

	95 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].

	96 */

	97 int currentAsUnicode(int next);

	98

	99 /**

	100 * Returns the character at the next position. Like in [advance], the

	101 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns

	102 * a UTF-16 code unit.

23 */	103 */

24 int peek();	104 int peek();

25	105

26 /**	106 /**

	107 * Notifies the scanner that unicode characters were detected in either a

	108 * comment or a string literal between [startScanOffset] and the current

	109 * scan offset.

	110 */

	111 void handleUnicode(int startScanOffset);

	112

	113 /**

	114 * Returns the current scan offset.

	115 *

	116 * In the [Utf8BytesScanner] this is the offset into the byte list, in the

	117 * [StringScanner] the offset in the source string.

	118 */

	119 int get scanOffset;

	120

	121 /**

	122 * Returns the current string offset.

	123 *

	124 * In the [StringScanner] this is identical to the [scanOffset]. In the

	125 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.

	126 */

	127 int get stringOffset;

	128

	129 /**

	130 * Returns the first token scanned by this [Scanner].

	131 */

	132 Token firstToken();

	133

	134 /**

	135 * Returns the last token scanned by this [Scanner].

	136 */

	137 Token previousToken();

	138

	139 /**

	140 * Notifies that a new token starts at current offset.

	141 */

	142 void beginToken() {

	143 tokenStart = stringOffset;

	144 }

	145

	146 /**

	147 * Appends a substring from the scan offset [:start:] to the current

	148 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current

	149 * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the

	150 * substring [5,9).

	151 *

	152 * Note that [extraOffset] can only be used if the covered character(s) are

	153 * known to be ASCII.

	154 */

	155 void appendSubstringToken(PrecedenceInfo info, int start,

	156 bool asciiOnly, [int extraOffset]);

	157

	158 /**

	159 * Appends a token whose kind is determined by [info] and content is defined

	160 * by the String [value].

	161 *

	162 * This method is invoked for class names, field names, method names, types,

	163 * etc.

	164 */

	165 void appendStringToken(PrecedenceInfo info, String value) {

	166 tail.next = new StringToken.fromString(info, value, tokenStart, true);

	167 tail = tail.next;

	168 }

	169

	170 /**

	171 * Appends a fixed token whose kind and content is determined by [info].

	172 * Appends an operator token from [info].

	173 *

	174 * An operator token represents operators like ':', '.', ';', '&&', '==', '--',

	175 * '=>', etc.

	176 */

	177 void appendPrecedenceToken(PrecedenceInfo info) {

	178 tail.next = new SymbolToken(info, tokenStart);

	179 tail = tail.next;

	180 }

	181

	182 /**

27 * Appends a fixed token based on whether the current char is [choice] or not.	183 * Appends a fixed token based on whether the current char is [choice] or not.

28 * If the current char is [choice] a fixed token whose kind and content	184 * If the current char is [choice] a fixed token whose kind and content

29 * is determined by [yes] is appended, otherwise a fixed token whose kind	185 * is determined by [yes] is appended, otherwise a fixed token whose kind

30 * and content is determined by [no] is appended.	186 * and content is determined by [no] is appended.

31 */	187 */

32 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);	188 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no) {

33	189 int next = advance();

34 /**	190 if (identical(next, choice)) {

35 * Appends a fixed token whose kind and content is determined by [info].	191 appendPrecedenceToken(yes);

36 */	192 return advance();

37 void appendPrecedenceToken(PrecedenceInfo info);	193 } else {

38	194 appendPrecedenceToken(no);

39 /**	195 return next;

40 * Appends a token whose kind is determined by [info] and content is [value].	196 }

41 */	197 }

42 void appendStringToken(PrecedenceInfo info, String value);

43

44 /**

45 * Appends a token whose kind is determined by [info] and content is defined

46 * by the SourceString [value].

47 */

48 void appendByteStringToken(PrecedenceInfo info, T value);

49	198

50 /**	199 /**

51 * Appends a keyword token whose kind is determined by [keyword].	200 * Appends a keyword token whose kind is determined by [keyword].

52 */	201 */

53 void appendKeywordToken(Keyword keyword);	202 void appendKeywordToken(Keyword keyword) {

54 void appendWhiteSpace(int next);	203 String syntax = keyword.syntax;

55 void appendEofToken();	204 // Type parameters and arguments cannot contain 'this' or 'super'.

56	205 if (identical(syntax, 'this') \|\| identical(syntax, 'super')) {

57 /**	206 discardOpenLt();

58 * Creates an ASCII SourceString whose content begins at the source byte	207 }

59 * offset [start] and ends at [offset] bytes from the current byte offset of	208 tail.next = new KeywordToken(keyword, tokenStart);

60 * the scanner. For example, if the current byte offset is 10,	209 tail = tail.next;

61 * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found	210 }

62 * at the [0,9[ byte interval of the source text.	211

63 */	212 void appendEofToken() {

64 T asciiString(int start, int offset);	213 beginToken();

65 T utf8String(int start, int offset);	214 tail.next = new SymbolToken(EOF_INFO, tokenStart);

66 Token firstToken();	215 tail = tail.next;

67 Token previousToken();	216 // EOF points to itself so there's always infinite look-ahead.

68 void beginToken();	217 tail.next = tail;

69 void addToCharOffset(int offset);	218 discardOpenLt();

70 int get charOffset;	219 while (!groupingStack.isEmpty) {

71 int get byteOffset;	220 unmatchedBeginGroup(groupingStack.head);

72 void appendBeginGroup(PrecedenceInfo info, String value);	221 groupingStack = groupingStack.tail;

73 int appendEndGroup(PrecedenceInfo info, String value, int openKind);	222 }

74 void appendGt(PrecedenceInfo info, String value);	223 }

75 void appendGtGt(PrecedenceInfo info, String value);	224

76 void appendGtGtGt(PrecedenceInfo info, String value);	225 /**

77 void appendComment();	226 * Notifies scanning a whitespace character. Note that [appendWhiteSpace] is

	227 * not always invoked for [$SPACE] characters.

	228 *

	229 * This method is used by the scanners to track line breaks and create the

	230 * [lineStarts] map.

	231 */

	232 void appendWhiteSpace(int next) {

	233 if (next == $LF && file != null) {

	234 lineStarts.add(stringOffset + 1); // +1, the line starts after the $LF.

	235 }

	236 }

	237

	238 /**

	239 * Notifies on [$LF] characters in multi-line comments or strings.

	240 *

	241 * This method is used by the scanners to track line breaks and create the

	242 * [lineStarts] map.

	243 */

	244 void lineFeedInMultiline() {

	245 if (file != null) {

	246 lineStarts.add(stringOffset + 1);

	247 }

	248 }

	249

	250 /**

	251 * Appends a token that begins a new group, represented by [value].

	252 * Group begin tokens are '{', '(', '[' and '${'.

	253 */

	254 void appendBeginGroup(PrecedenceInfo info) {

	255 Token token = new BeginGroupToken(info, tokenStart);

	256 tail.next = token;

	257 tail = tail.next;

	258

	259 // { ( [ ${ cannot appear inside a type parameters / arguments.

	260 if (!identical(info.kind, LT_TOKEN)) discardOpenLt();

	261 groupingStack = groupingStack.prepend(token);

	262 }

	263

	264 /**

	265 * Appends a token that ends a group, represented by [value].

	266 * It handles the group end tokens '}', ')' and ']'. The tokens '>' and

	267 * '>>' are handled separately by [appendGt] and [appendGtGt].

	268 */

	269 int appendEndGroup(PrecedenceInfo info, int openKind) {

	270 assert(!identical(openKind, LT_TOKEN)); // openKind is < for > and >>

	271 appendPrecedenceToken(info);

	272 // Don't report unmatched errors for <; it is also the less-than operator.

	273 discardOpenLt();

	274 if (groupingStack.isEmpty) {

	275 return advance();

	276 }

	277 BeginGroupToken begin = groupingStack.head;

	278 if (!identical(begin.kind, openKind)) {

	279 if (!identical(openKind, OPEN_CURLY_BRACKET_TOKEN) \|\|

	280 !identical(begin.kind, STRING_INTERPOLATION_TOKEN)) {

	281 // Not ending string interpolation.

	282 unmatchedBeginGroup(begin);

	283 return advance();

	284 }

	285 // We're ending an interpolated expression.

	286 begin.endGroup = tail;

	287 groupingStack = groupingStack.tail;

	288 // Using "start-of-text" to signal that we're back in string

	289 // scanning mode.

	290 return $STX;

	291 }

	292 begin.endGroup = tail;

	293 groupingStack = groupingStack.tail;

	294 return advance();

	295 }

	296

	297 /**

	298 * Appends a token for '>'.

	299 * This method does not issue unmatched errors, because > is also the

	300 * greater-than operator. It does not necessarily have to close a group.

	301 */

	302 void appendGt(PrecedenceInfo info) {

	303 appendPrecedenceToken(info);

	304 if (groupingStack.isEmpty) return;

	305 if (identical(groupingStack.head.kind, LT_TOKEN)) {

	306 groupingStack.head.endGroup = tail;

	307 groupingStack = groupingStack.tail;

	308 }

	309 }

	310

	311 /**

	312 * Appends a token for '>>'.

	313 * This method does not issue unmatched errors, because >> is also the

	314 * shift operator. It does not necessarily have to close a group.

	315 */

	316 void appendGtGt(PrecedenceInfo info) {

	317 appendPrecedenceToken(info);

	318 if (groupingStack.isEmpty) return;

	319 if (identical(groupingStack.head.kind, LT_TOKEN)) {

	320 // Don't assign endGroup: in "T<U<V>>", the '>>' token closes the outer

	321 // '<', the inner '<' is left without endGroup.

	322 groupingStack = groupingStack.tail;

	323 }

	324 if (groupingStack.isEmpty) return;

	325 if (identical(groupingStack.head.kind, LT_TOKEN)) {

	326 groupingStack.head.endGroup = tail;

	327 groupingStack = groupingStack.tail;

	328 }

	329 }

	330

	331 void appendComment(start, bool asciiOnly) {

	332 if (!includeComments) return;

	333 appendSubstringToken(COMMENT_INFO, start, asciiOnly);

	334 }

78	335

79 /**	336 /**

80 * We call this method to discard '<' from the "grouping" stack	337 * We call this method to discard '<' from the "grouping" stack

81 * (maintained by subclasses).	338 * (maintained by subclasses).

82 *	339 *

83 * [PartialParser.skipExpression] relies on the fact that we do not	340 * [PartialParser.skipExpression] relies on the fact that we do not

84 * create groups for stuff like:	341 * create groups for stuff like:

85 * [:a = b < c, d = e > f:].	342 * [:a = b < c, d = e > f:].

86 *	343 *

87 * In other words, this method is called when the scanner recognizes	344 * In other words, this method is called when the scanner recognizes

88 * something which cannot possibly be part of a type	345 * something which cannot possibly be part of a type

89 * parameter/argument list.	346 * parameter/argument list.

90 */	347 */

91 void discardOpenLt();	348 void discardOpenLt() {

	349 while (!groupingStack.isEmpty

	350 && identical(groupingStack.head.kind, LT_TOKEN)) {

	351 groupingStack = groupingStack.tail;

	352 }

	353 }

92	354

93 // TODO(ahe): Move this class to implementation.	355 // TODO(ahe): Move this class to implementation.

94	356

95 Token tokenize() {	357 Token tokenize() {

96 int next = advance();	358 int next = advance();

97 while (!identical(next, $EOF)) {	359 while (!identical(next, $EOF)) {

98 next = bigSwitch(next);	360 next = bigSwitch(next);

99 }	361 }

100 appendEofToken();	362 appendEofToken();

	363

	364 if (file != null) {

	365 file.length = stringOffset;

	366 // One additional line start at the end, see [SourceFile.lineStarts].

	367 lineStarts.add(stringOffset + 1);

	368 file.lineStarts = lineStarts;

	369 }

	370

101 return firstToken();	371 return firstToken();

102 }	372 }

103	373

104 int bigSwitch(int next) {	374 int bigSwitch(int next) {

105 beginToken();	375 beginToken();

106 if (identical(next, $SPACE) \|\| identical(next, $TAB)	376 if (identical(next, $SPACE) \|\| identical(next, $TAB)

107 \|\| identical(next, $LF) \|\| identical(next, $CR)) {	377 \|\| identical(next, $LF) \|\| identical(next, $CR)) {

108 appendWhiteSpace(next);	378 appendWhiteSpace(next);

109 next = advance();	379 next = advance();

	380 // Sequences of spaces are common, so advance through them fast.

110 while (identical(next, $SPACE)) {	381 while (identical(next, $SPACE)) {

111 appendWhiteSpace(next);	382 // We don't invoke [:appendWhiteSpace(next):] here for efficiency,

	383 // assuming that it does not do anything for space characters.

112 next = advance();	384 next = advance();

113 }	385 }

114 return next;	386 return next;

115 }	387 }

116	388

117 if ($a <= next && next <= $z) {	389 if ($a <= next && next <= $z) {

118 if (identical($r, next)) {	390 if (identical($r, next)) {

119 return tokenizeRawStringKeywordOrIdentifier(next);	391 return tokenizeRawStringKeywordOrIdentifier(next);

120 }	392 }

121 return tokenizeKeywordOrIdentifier(next, true);	393 return tokenizeKeywordOrIdentifier(next, true);

122 }	394 }

123	395

124 if (($A <= next && next <= $Z) \|\| identical(next, $_) \|\| identical(next, $$) ) {	396 if (($A <= next && next <= $Z) \|\|

125 return tokenizeIdentifier(next, byteOffset, true);	397 identical(next, $_) \|\|

	398 identical(next, $$)) {

	399 return tokenizeIdentifier(next, scanOffset, true);

126 }	400 }

127	401

128 if (identical(next, $LT)) {	402 if (identical(next, $LT)) {

129 return tokenizeLessThan(next);	403 return tokenizeLessThan(next);

130 }	404 }

131	405

132 if (identical(next, $GT)) {	406 if (identical(next, $GT)) {

133 return tokenizeGreaterThan(next);	407 return tokenizeGreaterThan(next);

134 }	408 }

135	409

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
180 if (identical(next, $BACKSLASH)) {	454 if (identical(next, $BACKSLASH)) {

181 appendPrecedenceToken(BACKSLASH_INFO);	455 appendPrecedenceToken(BACKSLASH_INFO);

182 return advance();	456 return advance();

183 }	457 }

184	458

185 if (identical(next, $HASH)) {	459 if (identical(next, $HASH)) {

186 return tokenizeTag(next);	460 return tokenizeTag(next);

187 }	461 }

188	462

189 if (identical(next, $OPEN_PAREN)) {	463 if (identical(next, $OPEN_PAREN)) {

190 appendBeginGroup(OPEN_PAREN_INFO, "(");	464 appendBeginGroup(OPEN_PAREN_INFO);

191 return advance();	465 return advance();

192 }	466 }

193	467

194 if (identical(next, $CLOSE_PAREN)) {	468 if (identical(next, $CLOSE_PAREN)) {

195 return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN);	469 return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);

196 }	470 }

197	471

198 if (identical(next, $COMMA)) {	472 if (identical(next, $COMMA)) {

199 appendPrecedenceToken(COMMA_INFO);	473 appendPrecedenceToken(COMMA_INFO);

200 return advance();	474 return advance();

201 }	475 }

202	476

203 if (identical(next, $COLON)) {	477 if (identical(next, $COLON)) {

204 appendPrecedenceToken(COLON_INFO);	478 appendPrecedenceToken(COLON_INFO);

205 return advance();	479 return advance();

206 }	480 }

207	481

208 if (identical(next, $SEMICOLON)) {	482 if (identical(next, $SEMICOLON)) {

209 appendPrecedenceToken(SEMICOLON_INFO);	483 appendPrecedenceToken(SEMICOLON_INFO);

210 // Type parameters and arguments cannot contain semicolon.	484 // Type parameters and arguments cannot contain semicolon.

211 discardOpenLt();	485 discardOpenLt();

212 return advance();	486 return advance();

213 }	487 }

214	488

215 if (identical(next, $QUESTION)) {	489 if (identical(next, $QUESTION)) {

216 appendPrecedenceToken(QUESTION_INFO);	490 appendPrecedenceToken(QUESTION_INFO);

217 return advance();	491 return advance();

218 }	492 }

219	493

220 if (identical(next, $CLOSE_SQUARE_BRACKET)) {	494 if (identical(next, $CLOSE_SQUARE_BRACKET)) {

221 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]",	495 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO,

222 OPEN_SQUARE_BRACKET_TOKEN);	496 OPEN_SQUARE_BRACKET_TOKEN);

223 }	497 }

224	498

225 if (identical(next, $BACKPING)) {	499 if (identical(next, $BACKPING)) {

226 appendPrecedenceToken(BACKPING_INFO);	500 appendPrecedenceToken(BACKPING_INFO);

227 return advance();	501 return advance();

228 }	502 }

229	503

230 if (identical(next, $OPEN_CURLY_BRACKET)) {	504 if (identical(next, $OPEN_CURLY_BRACKET)) {

231 appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{");	505 appendBeginGroup(OPEN_CURLY_BRACKET_INFO);

232 return advance();	506 return advance();

233 }	507 }

234	508

235 if (identical(next, $CLOSE_CURLY_BRACKET)) {	509 if (identical(next, $CLOSE_CURLY_BRACKET)) {

236 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}",	510 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO,

237 OPEN_CURLY_BRACKET_TOKEN);	511 OPEN_CURLY_BRACKET_TOKEN);

238 }	512 }

239	513

240 if (identical(next, $SLASH)) {	514 if (identical(next, $SLASH)) {

241 return tokenizeSlashOrComment(next);	515 return tokenizeSlashOrComment(next);

242 }	516 }

243	517

244 if (identical(next, $AT)) {	518 if (identical(next, $AT)) {

245 return tokenizeAt(next);	519 return tokenizeAt(next);

246 }	520 }

247	521

248 if (identical(next, $DQ) \|\| identical(next, $SQ)) {	522 if (identical(next, $DQ) \|\| identical(next, $SQ)) {

249 return tokenizeString(next, byteOffset, false);	523 return tokenizeString(next, scanOffset, false);

250 }	524 }

251	525

252 if (identical(next, $PERIOD)) {	526 if (identical(next, $PERIOD)) {

253 return tokenizeDotsOrNumber(next);	527 return tokenizeDotsOrNumber(next);

254 }	528 }

255	529

256 if (identical(next, $0)) {	530 if (identical(next, $0)) {

257 return tokenizeHexOrNumber(next);	531 return tokenizeHexOrNumber(next);

258 }	532 }

259	533

260 // TODO(ahe): Would a range check be faster?	534 // TODO(ahe): Would a range check be faster?

261 if (identical(next, $1) \|\| identical(next, $2) \|\| identical(next, $3)	535 if (identical(next, $1) \|\| identical(next, $2) \|\| identical(next, $3)

262 \|\| identical(next, $4) \|\| identical(next, $5) \|\| identical(next, $6)	536 \|\| identical(next, $4) \|\| identical(next, $5) \|\| identical(next, $6)

263 \|\| identical(next, $7) \|\| identical(next, $8) \|\| identical(next, $9)) {	537 \|\| identical(next, $7) \|\| identical(next, $8) \|\| identical(next, $9)) {

264 return tokenizeNumber(next);	538 return tokenizeNumber(next);

265 }	539 }

266	540

267 if (identical(next, $EOF)) {	541 if (identical(next, $EOF)) {

268 return $EOF;	542 return $EOF;

269 }	543 }

270 if (next < 0x1f) {	544 if (next < 0x1f) {

271 return error(new SourceString("unexpected character $next"));	545 return error("unexpected character $next");

	546 }

	547

	548 if (next >= 128) {

	549 next = currentAsUnicode(next);

272 }	550 }

273	551

274 // The following are non-ASCII characters.	552 // The following are non-ASCII characters.

275	553

276 if (identical(next, $NBSP)) {	554 if (identical(next, $NBSP)) {

277 appendWhiteSpace(next);	555 appendWhiteSpace(next);

278 return advance();	556 return advance();

279 }	557 }

280	558

281 return tokenizeIdentifier(next, byteOffset, true);	559 return error("unexpected unicode character $next");

282 }	560 }

283	561

284 int tokenizeTag(int next) {	562 int tokenizeTag(int next) {

285 // # or #!.*[\n\r]	563 // # or #!.*[\n\r]

286 if (byteOffset == 0) {	564 if (scanOffset == 0) {

287 if (identical(peek(), $BANG)) {	565 if (identical(peek(), $BANG)) {

	566 int start = scanOffset + 1;

	567 bool asciiOnly = true;

288 do {	568 do {

289 next = advance();	569 next = advance();

290 } while (!identical(next, $LF) && !identical(next, $CR) && !identical(next, $EOF));	570 if (next > 127) asciiOnly = false;

	571 } while (!identical(next, $LF) &&

	572 !identical(next, $CR) &&

	573 !identical(next, $EOF));

	574 if (!asciiOnly) handleUnicode(start);

291 return next;	575 return next;

292 }	576 }

293 }	577 }

294 appendPrecedenceToken(HASH_INFO);	578 appendPrecedenceToken(HASH_INFO);

295 return advance();	579 return advance();

296 }	580 }

297	581

298 int tokenizeTilde(int next) {	582 int tokenizeTilde(int next) {

299 // ~ ~/ ~/=	583 // ~ ~/ ~/=

300 next = advance();	584 next = advance();

301 if (identical(next, $SLASH)) {	585 if (identical(next, $SLASH)) {

302 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);	586 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);

303 } else {	587 } else {

304 appendPrecedenceToken(TILDE_INFO);	588 appendPrecedenceToken(TILDE_INFO);

305 return next;	589 return next;

306 }	590 }

307 }	591 }

308	592

309 int tokenizeOpenSquareBracket(int next) {	593 int tokenizeOpenSquareBracket(int next) {

310 // [ [] []=	594 // [ [] []=

311 next = advance();	595 next = advance();

312 if (identical(next, $CLOSE_SQUARE_BRACKET)) {	596 if (identical(next, $CLOSE_SQUARE_BRACKET)) {

313 Token token = previousToken();	597 Token token = previousToken();

314 if (token is KeywordToken && identical(token.value.stringValue, 'operator')) {	598 if (token is KeywordToken &&

	599 identical((token as KeywordToken).keyword.syntax, 'operator')) {

315 return select($EQ, INDEX_EQ_INFO, INDEX_INFO);	600 return select($EQ, INDEX_EQ_INFO, INDEX_INFO);

316 }	601 }

317 }	602 }

318 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "[");	603 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);

319 return next;	604 return next;

320 }	605 }

321	606

322 int tokenizeCaret(int next) {	607 int tokenizeCaret(int next) {

323 // ^ ^=	608 // ^ ^=

324 return select($EQ, CARET_EQ_INFO, CARET_INFO);	609 return select($EQ, CARET_EQ_INFO, CARET_INFO);

325 }	610 }

326	611

327 int tokenizeBar(int next) {	612 int tokenizeBar(int next) {

328 // \| \|\| \|=	613 // \| \|\| \|=

(...skipping 43 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
372 return advance();	657 return advance();

373 } else if (identical(next, $EQ)) {	658 } else if (identical(next, $EQ)) {

374 appendPrecedenceToken(MINUS_EQ_INFO);	659 appendPrecedenceToken(MINUS_EQ_INFO);

375 return advance();	660 return advance();

376 } else {	661 } else {

377 appendPrecedenceToken(MINUS_INFO);	662 appendPrecedenceToken(MINUS_INFO);

378 return next;	663 return next;

379 }	664 }

380 }	665 }

381	666

382

383 int tokenizePlus(int next) {	667 int tokenizePlus(int next) {

384 // + ++ +=	668 // + ++ +=

385 next = advance();	669 next = advance();

386 if (identical($PLUS, next)) {	670 if (identical($PLUS, next)) {

387 appendPrecedenceToken(PLUS_PLUS_INFO);	671 appendPrecedenceToken(PLUS_PLUS_INFO);

388 return advance();	672 return advance();

389 } else if (identical($EQ, next)) {	673 } else if (identical($EQ, next)) {

390 appendPrecedenceToken(PLUS_EQ_INFO);	674 appendPrecedenceToken(PLUS_EQ_INFO);

391 return advance();	675 return advance();

392 } else {	676 } else {

393 appendPrecedenceToken(PLUS_INFO);	677 appendPrecedenceToken(PLUS_INFO);

394 return next;	678 return next;

395 }	679 }

396 }	680 }

397	681

398 int tokenizeExclamation(int next) {	682 int tokenizeExclamation(int next) {

399 // ! != !==	683 // ! !=

	684 // !== is kept for user-friendly error reporting

	685

400 next = advance();	686 next = advance();

401 if (identical(next, $EQ)) {	687 if (identical(next, $EQ)) {

402 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);	688 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);

403 }	689 }

404 appendPrecedenceToken(BANG_INFO);	690 appendPrecedenceToken(BANG_INFO);

405 return next;	691 return next;

406 }	692 }

407	693

408 int tokenizeEquals(int next) {	694 int tokenizeEquals(int next) {

409 // = == ===	695 // = == =>

	696 // === is kept for user-friendly error reporting

410	697

411 // Type parameters and arguments cannot contain any token that	698 // Type parameters and arguments cannot contain any token that

412 // starts with '='.	699 // starts with '='.

413 discardOpenLt();	700 discardOpenLt();

414	701

415 next = advance();	702 next = advance();

416 if (identical(next, $EQ)) {	703 if (identical(next, $EQ)) {

417 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);	704 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);

418 } else if (identical(next, $GT)) {	705 } else if (identical(next, $GT)) {

419 appendPrecedenceToken(FUNCTION_INFO);	706 appendPrecedenceToken(FUNCTION_INFO);

420 return advance();	707 return advance();

421 }	708 }

422 appendPrecedenceToken(EQ_INFO);	709 appendPrecedenceToken(EQ_INFO);

423 return next;	710 return next;

424 }	711 }

425	712

426 int tokenizeGreaterThan(int next) {	713 int tokenizeGreaterThan(int next) {

427 // > >= >> >>= >>> >>>=	714 // > >= >> >>=

428 next = advance();	715 next = advance();

429 if (identical($EQ, next)) {	716 if (identical($EQ, next)) {

430 appendPrecedenceToken(GT_EQ_INFO);	717 appendPrecedenceToken(GT_EQ_INFO);

431 return advance();	718 return advance();

432 } else if (identical($GT, next)) {	719 } else if (identical($GT, next)) {

433 next = advance();	720 next = advance();

434 if (identical($EQ, next)) {	721 if (identical($EQ, next)) {

435 appendPrecedenceToken(GT_GT_EQ_INFO);	722 appendPrecedenceToken(GT_GT_EQ_INFO);

436 return advance();	723 return advance();

437 } else {	724 } else {

438 appendGtGt(GT_GT_INFO, ">>");	725 appendGtGt(GT_GT_INFO);

439 return next;	726 return next;

440 }	727 }

441 } else {	728 } else {

442 appendGt(GT_INFO, ">");	729 appendGt(GT_INFO);

443 return next;	730 return next;

444 }	731 }

445 }	732 }

446	733

447 int tokenizeLessThan(int next) {	734 int tokenizeLessThan(int next) {

448 // < <= << <<=	735 // < <= << <<=

449 next = advance();	736 next = advance();

450 if (identical($EQ, next)) {	737 if (identical($EQ, next)) {

451 appendPrecedenceToken(LT_EQ_INFO);	738 appendPrecedenceToken(LT_EQ_INFO);

452 return advance();	739 return advance();

453 } else if (identical($LT, next)) {	740 } else if (identical($LT, next)) {

454 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);	741 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);

455 } else {	742 } else {

456 appendBeginGroup(LT_INFO, "<");	743 appendBeginGroup(LT_INFO);

457 return next;	744 return next;

458 }	745 }

459 }	746 }

460	747

461 int tokenizeNumber(int next) {	748 int tokenizeNumber(int next) {

462 int start = byteOffset;	749 int start = scanOffset;

463 while (true) {	750 while (true) {

464 next = advance();	751 next = advance();

465 if ($0 <= next && next <= $9) {	752 if ($0 <= next && next <= $9) {

466 continue;	753 continue;

467 } else if (identical(next, $e) \|\| identical(next, $E)) {	754 } else if (identical(next, $e) \|\| identical(next, $E)) {

468 return tokenizeFractionPart(next, start);	755 return tokenizeFractionPart(next, start);

469 } else {	756 } else {

470 if (identical(next, $PERIOD)) {	757 if (identical(next, $PERIOD)) {

471 int nextnext = peek();	758 int nextnext = peek();

472 if ($0 <= nextnext && nextnext <= $9) {	759 if ($0 <= nextnext && nextnext <= $9) {

473 return tokenizeFractionPart(advance(), start);	760 return tokenizeFractionPart(advance(), start);

474 }	761 }

475 }	762 }

476 appendByteStringToken(INT_INFO, asciiString(start, 0));	763 appendSubstringToken(INT_INFO, start, true);

477 return next;	764 return next;

478 }	765 }

479 }	766 }

480 }	767 }

481	768

482 int tokenizeHexOrNumber(int next) {	769 int tokenizeHexOrNumber(int next) {

483 int x = peek();	770 int x = peek();

484 if (identical(x, $x) \|\| identical(x, $X)) {	771 if (identical(x, $x) \|\| identical(x, $X)) {

485 advance();	772 return tokenizeHex(next);

486 return tokenizeHex(x);

487 }	773 }

488 return tokenizeNumber(next);	774 return tokenizeNumber(next);

489 }	775 }

490	776

491 int tokenizeHex(int next) {	777 int tokenizeHex(int next) {

492 int start = byteOffset - 1;	778 int start = scanOffset;

	779 next = advance(); // Advance past the $x or $X.

493 bool hasDigits = false;	780 bool hasDigits = false;

494 while (true) {	781 while (true) {

495 next = advance();	782 next = advance();

496 if (($0 <= next && next <= $9)	783 if (($0 <= next && next <= $9)

497 \|\| ($A <= next && next <= $F)	784 \|\| ($A <= next && next <= $F)

498 \|\| ($a <= next && next <= $f)) {	785 \|\| ($a <= next && next <= $f)) {

499 hasDigits = true;	786 hasDigits = true;

500 } else {	787 } else {

501 if (!hasDigits) {	788 if (!hasDigits) {

502 return error(const SourceString("hex digit expected"));	789 return error("hex digit expected");

503 }	790 }

504 appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0));	791 appendSubstringToken(HEXADECIMAL_INFO, start, true);

505 return next;	792 return next;

506 }	793 }

507 }	794 }

508 }	795 }

509	796

510 int tokenizeDotsOrNumber(int next) {	797 int tokenizeDotsOrNumber(int next) {

511 int start = byteOffset;	798 int start = scanOffset;

512 next = advance();	799 next = advance();

513 if (($0 <= next && next <= $9)) {	800 if (($0 <= next && next <= $9)) {

514 return tokenizeFractionPart(next, start);	801 return tokenizeFractionPart(next, start);

515 } else if (identical($PERIOD, next)) {	802 } else if (identical($PERIOD, next)) {

516 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);	803 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);

517 } else {	804 } else {

518 appendPrecedenceToken(PERIOD_INFO);	805 appendPrecedenceToken(PERIOD_INFO);

519 return next;	806 return next;

520 }	807 }

521 }	808 }

522	809

523 int tokenizeFractionPart(int next, int start) {	810 int tokenizeFractionPart(int next, int start) {

524 bool done = false;	811 bool done = false;

525 bool hasDigit = false;	812 bool hasDigit = false;

526 LOOP: while (!done) {	813 LOOP: while (!done) {

527 if ($0 <= next && next <= $9) {	814 if ($0 <= next && next <= $9) {

528 hasDigit = true;	815 hasDigit = true;

529 } else if (identical($e, next) \|\| identical($E, next)) {	816 } else if (identical($e, next) \|\| identical($E, next)) {

530 hasDigit = true;	817 hasDigit = true;

531 next = tokenizeExponent(advance());	818 next = tokenizeExponent(advance());

532 done = true;	819 done = true;

533 continue LOOP;	820 continue LOOP;

534 } else {	821 } else {

535 done = true;	822 done = true;

536 continue LOOP;	823 continue LOOP;

537 }	824 }

538 next = advance();	825 next = advance();

539 }	826 }

540 if (!hasDigit) {	827 if (!hasDigit) {

541 appendByteStringToken(INT_INFO, asciiString(start, -1));	828 // Reduce offset, we already advanced to the token past the period.

	829 appendSubstringToken(INT_INFO, start, true, -1);

	830

	831 // TODO(ahe): Wrong offset for the period. Cannot call beginToken because

	832 // the scanner already advanced past the period.

542 if (identical($PERIOD, next)) {	833 if (identical($PERIOD, next)) {

543 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);	834 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);

544 }	835 }

545 // TODO(ahe): Wrong offset for the period.

546 appendPrecedenceToken(PERIOD_INFO);	836 appendPrecedenceToken(PERIOD_INFO);

547 return bigSwitch(next);	837 return next;

548 }	838 }

549 appendByteStringToken(DOUBLE_INFO, asciiString(start, 0));	839 appendSubstringToken(DOUBLE_INFO, start, true);

550 return next;	840 return next;

551 }	841 }

552	842

553 int tokenizeExponent(int next) {	843 int tokenizeExponent(int next) {

554 if (identical(next, $PLUS) \|\| identical(next, $MINUS)) {	844 if (identical(next, $PLUS) \|\| identical(next, $MINUS)) {

555 next = advance();	845 next = advance();

556 }	846 }

557 bool hasDigits = false;	847 bool hasDigits = false;

558 while (true) {	848 while (true) {

559 if ($0 <= next && next <= $9) {	849 if ($0 <= next && next <= $9) {

560 hasDigits = true;	850 hasDigits = true;

561 } else {	851 } else {

562 if (!hasDigits) {	852 if (!hasDigits) {

563 return error(const SourceString("digit expected"));	853 return error("digit expected");

564 }	854 }

565 return next;	855 return next;

566 }	856 }

567 next = advance();	857 next = advance();

568 }	858 }

569 }	859 }

570	860

571 int tokenizeSlashOrComment(int next) {	861 int tokenizeSlashOrComment(int next) {

	862 int start = scanOffset;

572 next = advance();	863 next = advance();

573 if (identical($STAR, next)) {	864 if (identical($STAR, next)) {

574 return tokenizeMultiLineComment(next);	865 return tokenizeMultiLineComment(next, start);

575 } else if (identical($SLASH, next)) {	866 } else if (identical($SLASH, next)) {

576 return tokenizeSingleLineComment(next);	867 return tokenizeSingleLineComment(next, start);

577 } else if (identical($EQ, next)) {	868 } else if (identical($EQ, next)) {

578 appendPrecedenceToken(SLASH_EQ_INFO);	869 appendPrecedenceToken(SLASH_EQ_INFO);

579 return advance();	870 return advance();

580 } else {	871 } else {

581 appendPrecedenceToken(SLASH_INFO);	872 appendPrecedenceToken(SLASH_INFO);

582 return next;	873 return next;

583 }	874 }

584 }	875 }

585	876

586 int tokenizeSingleLineComment(int next) {	877 int tokenizeSingleLineComment(int next, int start) {

	878 bool asciiOnly = true;

587 while (true) {	879 while (true) {

588 next = advance();	880 next = advance();

589 if (identical($LF, next) \|\| identical($CR, next) \|\| identical($EOF, next)) {	881 if (next > 127) asciiOnly = false;

590 appendComment();	882 if (identical($LF, next) \|\|

	883 identical($CR, next) \|\|

	884 identical($EOF, next)) {

	885 if (!asciiOnly) handleUnicode(start);

	886 appendComment(start, asciiOnly);

591 return next;	887 return next;

592 }	888 }

593 }	889 }

594 }	890 }

595	891

596 int tokenizeMultiLineComment(int next) {	892

	893 int tokenizeMultiLineComment(int next, int start) {

	894 bool asciiOnlyComment = true; // Track if the entire comment is ASCII.

	895 bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.

	896 int unicodeStart = start;

597 int nesting = 1;	897 int nesting = 1;

598 next = advance();	898 next = advance();

599 while (true) {	899 while (true) {

600 if (identical($EOF, next)) {	900 if (identical($EOF, next)) {

601 // TODO(ahe): Report error.	901 if (!asciiOnlyLines) handleUnicode(unicodeStart);

	902 appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment");

602 return next;	903 return next;

603 } else if (identical($STAR, next)) {	904 } else if (identical($STAR, next)) {

604 next = advance();	905 next = advance();

605 if (identical($SLASH, next)) {	906 if (identical($SLASH, next)) {

606 --nesting;	907 --nesting;

607 if (0 == nesting) {	908 if (0 == nesting) {

	909 if (!asciiOnlyLines) handleUnicode(unicodeStart);

608 next = advance();	910 next = advance();

609 appendComment();	911 appendComment(start, asciiOnlyComment);

610 return next;	912 return next;

611 } else {	913 } else {

612 next = advance();	914 next = advance();

613 }	915 }

614 }	916 }

615 } else if (identical($SLASH, next)) {	917 } else if (identical($SLASH, next)) {

616 next = advance();	918 next = advance();

617 if (identical($STAR, next)) {	919 if (identical($STAR, next)) {

618 next = advance();	920 next = advance();

619 ++nesting;	921 ++nesting;

620 }	922 }

	923 } else if (identical(next, $LF)) {

	924 if (!asciiOnlyLines) {

	925 // Synchronize the string offset in the utf8 scanner.

	926 handleUnicode(unicodeStart);

	927 asciiOnlyLines = true;

	928 unicodeStart = scanOffset;

	929 }

	930 lineFeedInMultiline();

	931 next = advance();

621 } else {	932 } else {

	933 if (next > 127) {

	934 asciiOnlyLines = false;

	935 asciiOnlyComment = false;

	936 }

622 next = advance();	937 next = advance();

623 }	938 }

624 }	939 }

625 }	940 }

626	941

627 int tokenizeRawStringKeywordOrIdentifier(int next) {	942 int tokenizeRawStringKeywordOrIdentifier(int next) {

	943 // [next] is $r.

628 int nextnext = peek();	944 int nextnext = peek();

629 if (identical(nextnext, $DQ) \|\| identical(nextnext, $SQ)) {	945 if (identical(nextnext, $DQ) \|\| identical(nextnext, $SQ)) {

630 int start = byteOffset;	946 int start = scanOffset;

631 next = advance();	947 next = advance();

632 return tokenizeString(next, start, true);	948 return tokenizeString(next, start, true);

633 }	949 }

634 return tokenizeKeywordOrIdentifier(next, true);	950 return tokenizeKeywordOrIdentifier(next, true);

635 }	951 }

636	952

637 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {	953 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {

638 KeywordState state = KeywordState.KEYWORD_STATE;	954 KeywordState state = KeywordState.KEYWORD_STATE;

639 int start = byteOffset;	955 int start = scanOffset;

640 while (state != null && $a <= next && next <= $z) {	956 while (state != null && $a <= next && next <= $z) {

641 state = state.next(next);	957 state = state.next(next);

642 next = advance();	958 next = advance();

643 }	959 }

644 if (state == null \|\| state.keyword == null) {	960 if (state == null \|\| state.keyword == null) {

645 return tokenizeIdentifier(next, start, allowDollar);	961 return tokenizeIdentifier(next, start, allowDollar);

646 }	962 }

647 if (($A <= next && next <= $Z) \|\|	963 if (($A <= next && next <= $Z) \|\|

648 ($0 <= next && next <= $9) \|\|	964 ($0 <= next && next <= $9) \|\|

649 identical(next, $_) \|\|	965 identical(next, $_) \|\|

650 identical(next, $$)) {	966 identical(next, $$)) {

651 return tokenizeIdentifier(next, start, allowDollar);	967 return tokenizeIdentifier(next, start, allowDollar);

652 } else if (next < 128) {	968 } else {

653 appendKeywordToken(state.keyword);	969 appendKeywordToken(state.keyword);

654 return next;	970 return next;

655 } else {

656 return tokenizeIdentifier(next, start, allowDollar);

657 }	971 }

658 }	972 }

659	973

	974 /**

	975 * [allowDollar] can exclude '$', which is not allowed as part of a string

	976 * interpolation identifier.

	977 */

660 int tokenizeIdentifier(int next, int start, bool allowDollar) {	978 int tokenizeIdentifier(int next, int start, bool allowDollar) {

661 bool isAscii = true;

662

663 while (true) {	979 while (true) {

664 if (($a <= next && next <= $z) \|\|	980 if (($a <= next && next <= $z) \|\|

665 ($A <= next && next <= $Z) \|\|	981 ($A <= next && next <= $Z) \|\|

666 ($0 <= next && next <= $9) \|\|	982 ($0 <= next && next <= $9) \|\|

667 identical(next, $_) \|\|	983 identical(next, $_) \|\|

668 (identical(next, $$) && allowDollar)) {	984 (identical(next, $$) && allowDollar)) {

669 next = advance();	985 next = advance();

670 } else if ((next < 128) \|\| (identical(next, $NBSP))) {	986 } else {

671 // Identifier ends here.	987 // Identifier ends here.

672 if (start == byteOffset) {	988 if (start == scanOffset) {

673 return error(const SourceString("expected identifier"));	989 return error("expected identifier");

674 } else if (isAscii) {

675 appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0));

676 } else {	990 } else {

677 appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1));	991 appendSubstringToken(IDENTIFIER_INFO, start, true);

678 }	992 }

679 return next;	993 return next;

680 } else {

681 int nonAsciiStart = byteOffset;

682 do {

683 next = nextByte();

684 if (identical(next, $NBSP)) break;

685 } while (next > 127);

686 String string = utf8String(nonAsciiStart, -1).slowToString();

687 isAscii = false;

688 int byteLength = nonAsciiStart - byteOffset;

689 addToCharOffset(string.length - byteLength);

690 }	994 }

691 }	995 }

692 }	996 }

693	997

694 int tokenizeAt(int next) {	998 int tokenizeAt(int next) {

695 int start = byteOffset;

696 next = advance();

697 appendPrecedenceToken(AT_INFO);	999 appendPrecedenceToken(AT_INFO);

698 return next;	1000 return advance();

699 }	1001 }

700	1002

701 int tokenizeString(int next, int start, bool raw) {	1003 int tokenizeString(int next, int start, bool raw) {

702 int quoteChar = next;	1004 int quoteChar = next;

703 next = advance();	1005 next = advance();

704 if (identical(quoteChar, next)) {	1006 if (identical(quoteChar, next)) {

705 next = advance();	1007 next = advance();

706 if (identical(quoteChar, next)) {	1008 if (identical(quoteChar, next)) {

707 // Multiline string.	1009 // Multiline string.

708 return tokenizeMultiLineString(quoteChar, start, raw);	1010 return tokenizeMultiLineString(quoteChar, start, raw);

709 } else {	1011 } else {

710 // Empty string.	1012 // Empty string.

711 appendByteStringToken(STRING_INFO, utf8String(start, -1));	1013 appendSubstringToken(STRING_INFO, start, true);

712 return next;	1014 return next;

713 }	1015 }

714 }	1016 }

715 if (raw) {	1017 if (raw) {

716 return tokenizeSingleLineRawString(next, quoteChar, start);	1018 return tokenizeSingleLineRawString(next, quoteChar, start);

717 } else {	1019 } else {

718 return tokenizeSingleLineString(next, quoteChar, start);	1020 return tokenizeSingleLineString(next, quoteChar, start);

719 }	1021 }

720 }	1022 }

721	1023

722 static bool isHexDigit(int character) {	1024 /**

723 if ($0 <= character && character <= $9) return true;	1025 * [next] is the first character after the quote.

724 character \|= 0x20;	1026 * [start] is the scanOffset of the quote.

725 return ($a <= character && character <= $f);	1027 *

726 }	1028 * The token contains a substring of the source file, including the

727	1029 * string quotes, backslashes for escaping. For interpolated strings,

	1030 * the parts before and after are separate tokens.

	1031 *

	1032 * "a $b c"

	1033 *

	1034 * gives StringToken("a $), StringToken(b) and StringToken( c").

	1035 */

728 int tokenizeSingleLineString(int next, int quoteChar, int start) {	1036 int tokenizeSingleLineString(int next, int quoteChar, int start) {

	1037 bool asciiOnly = true;

729 while (!identical(next, quoteChar)) {	1038 while (!identical(next, quoteChar)) {

730 if (identical(next, $BACKSLASH)) {	1039 if (identical(next, $BACKSLASH)) {

731 next = advance();	1040 next = advance();

732 } else if (identical(next, $$)) {	1041 } else if (identical(next, $$)) {

733 next = tokenizeStringInterpolation(start);	1042 if (!asciiOnly) handleUnicode(start);

734 start = byteOffset;	1043 next = tokenizeStringInterpolation(start, asciiOnly);

	1044 start = scanOffset;

	1045 asciiOnly = true;

735 continue;	1046 continue;

736 }	1047 }

737 if (next <= $CR	1048 if (next <= $CR

738 && (identical(next, $LF) \|\| identical(next, $CR) \|\| identical(next, $EOF))) {	1049 && (identical(next, $LF) \|\|

739 return error(const SourceString("unterminated string literal"));	1050 identical(next, $CR) \|\|

	1051 identical(next, $EOF))) {

	1052 if (!asciiOnly) handleUnicode(start);

	1053 return error("unterminated string literal");

740 }	1054 }

	1055 if (next > 127) asciiOnly = false;

741 next = advance();	1056 next = advance();

742 }	1057 }

743 appendByteStringToken(STRING_INFO, utf8String(start, 0));	1058 if (!asciiOnly) handleUnicode(start);

744 return advance();	1059 // Advance past the quote character.

	1060 next = advance();

	1061 appendSubstringToken(STRING_INFO, start, asciiOnly);

	1062 return next;

745 }	1063 }

746	1064

747 int tokenizeStringInterpolation(int start) {	1065 int tokenizeStringInterpolation(int start, bool asciiOnly) {

748 appendByteStringToken(STRING_INFO, utf8String(start, -1));	1066 appendSubstringToken(STRING_INFO, start, asciiOnly);

749 beginToken(); // $ starts here.	1067 beginToken(); // $ starts here.

750 int next = advance();	1068 int next = advance();

751 if (identical(next, $OPEN_CURLY_BRACKET)) {	1069 if (identical(next, $OPEN_CURLY_BRACKET)) {

752 return tokenizeInterpolatedExpression(next, start);	1070 return tokenizeInterpolatedExpression(next);

753 } else {	1071 } else {

754 return tokenizeInterpolatedIdentifier(next, start);	1072 return tokenizeInterpolatedIdentifier(next);

755 }	1073 }

756 }	1074 }

757	1075

758 int tokenizeInterpolatedExpression(int next, int start) {	1076 int tokenizeInterpolatedExpression(int next) {

759 appendBeginGroup(STRING_INTERPOLATION_INFO, "\${");	1077 appendBeginGroup(STRING_INTERPOLATION_INFO);

760 beginToken(); // The expression starts here.	1078 beginToken(); // The expression starts here.

761 next = advance();	1079 next = advance(); // Move past the curly bracket.

762 while (!identical(next, $EOF) && !identical(next, $STX)) {	1080 while (!identical(next, $EOF) && !identical(next, $STX)) {

763 next = bigSwitch(next);	1081 next = bigSwitch(next);

764 }	1082 }

765 if (identical(next, $EOF)) return next;	1083 if (identical(next, $EOF)) return next;

766 next = advance();	1084 next = advance(); // Move past the $STX.

767 beginToken(); // The string interpolation suffix starts here.	1085 beginToken(); // The string interpolation suffix starts here.

768 return next;	1086 return next;

769 }	1087 }

770	1088

771 int tokenizeInterpolatedIdentifier(int next, int start) {	1089 int tokenizeInterpolatedIdentifier(int next) {

772 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);	1090 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);

773 beginToken(); // The identifier starts here.	1091 beginToken(); // The identifier starts here.

774 next = tokenizeKeywordOrIdentifier(next, false);	1092 next = tokenizeKeywordOrIdentifier(next, false);

775 beginToken(); // The string interpolation suffix starts here.	1093 beginToken(); // The string interpolation suffix starts here.

776 return next;	1094 return next;

777 }	1095 }

778	1096

779 int tokenizeSingleLineRawString(int next, int quoteChar, int start) {	1097 int tokenizeSingleLineRawString(int next, int quoteChar, int start) {

780 next = advance();	1098 bool asciiOnly = true;

	1099 next = advance(); // Advance past the quote

781 while (next != $EOF) {	1100 while (next != $EOF) {

782 if (identical(next, quoteChar)) {	1101 if (identical(next, quoteChar)) {

783 appendByteStringToken(STRING_INFO, utf8String(start, 0));	1102 if (!asciiOnly) handleUnicode(start);

784 return advance();	1103 next = advance();

	1104 appendSubstringToken(STRING_INFO, start, asciiOnly);

	1105 return next;

785 } else if (identical(next, $LF) \|\| identical(next, $CR)) {	1106 } else if (identical(next, $LF) \|\| identical(next, $CR)) {

786 return error(const SourceString("unterminated string literal"));	1107 if (!asciiOnly) handleUnicode(start);

	1108 return error("unterminated string literal");

	1109 } else if (next > 127) {

	1110 asciiOnly = false;

787 }	1111 }

788 next = advance();	1112 next = advance();

789 }	1113 }

790 return error(const SourceString("unterminated string literal"));	1114 if (!asciiOnly) handleUnicode(start);

	1115 return error("unterminated string literal");

791 }	1116 }

792	1117

793 int tokenizeMultiLineRawString(int quoteChar, int start) {	1118 int tokenizeMultiLineRawString(int quoteChar, int start) {

794 int next = advance();	1119 bool asciiOnlyString = true;

	1120 bool asciiOnlyLine = true;

	1121 int unicodeStart = start;

	1122 int next = advance(); // Advance past the (last) quote (of three)

795 outer: while (!identical(next, $EOF)) {	1123 outer: while (!identical(next, $EOF)) {

796 while (!identical(next, quoteChar)) {	1124 while (!identical(next, quoteChar)) {

	1125 if (identical(next, $LF)) {

	1126 if (!asciiOnlyLine) {

	1127 // Synchronize the string offset in the utf8 scanner.

	1128 handleUnicode(unicodeStart);

	1129 asciiOnlyLine = true;

	1130 unicodeStart = scanOffset;

	1131 }

	1132 lineFeedInMultiline();

	1133 } else if (next > 127) {

	1134 asciiOnlyLine = false;

	1135 asciiOnlyString = false;

	1136 }

797 next = advance();	1137 next = advance();

798 if (identical(next, $EOF)) break outer;	1138 if (identical(next, $EOF)) break outer;

799 }	1139 }

800 next = advance();	1140 next = advance();

801 if (identical(next, quoteChar)) {	1141 if (identical(next, quoteChar)) {

802 next = advance();	1142 next = advance();

803 if (identical(next, quoteChar)) {	1143 if (identical(next, quoteChar)) {

804 appendByteStringToken(STRING_INFO, utf8String(start, 0));	1144 if (!asciiOnlyLine) handleUnicode(unicodeStart);

805 return advance();	1145 next = advance();

	1146 appendSubstringToken(STRING_INFO, start, asciiOnlyString);

	1147 return next;

806 }	1148 }

807 }	1149 }

808 }	1150 }

809 return error(const SourceString("unterminated string literal"));	1151 if (!asciiOnlyLine) handleUnicode(unicodeStart);

	1152 return error("unterminated string literal");

810 }	1153 }

811	1154

812 int tokenizeMultiLineString(int quoteChar, int start, bool raw) {	1155 int tokenizeMultiLineString(int quoteChar, int start, bool raw) {

813 if (raw) return tokenizeMultiLineRawString(quoteChar, start);	1156 if (raw) return tokenizeMultiLineRawString(quoteChar, start);

814 int next = advance();	1157 bool asciiOnlyString = true;

	1158 bool asciiOnlyLine = true;

	1159 int unicodeStart = start;

	1160 int next = advance(); // Advance past the (last) quote (of three).

815 while (!identical(next, $EOF)) {	1161 while (!identical(next, $EOF)) {

816 if (identical(next, $$)) {	1162 if (identical(next, $$)) {

817 next = tokenizeStringInterpolation(start);	1163 if (!asciiOnlyLine) handleUnicode(unicodeStart);

818 start = byteOffset;	1164 next = tokenizeStringInterpolation(start, asciiOnlyString);

	1165 start = scanOffset;

	1166 unicodeStart = start;

	1167 asciiOnlyString = true; // A new string token is created for the rest.

	1168 asciiOnlyLine = true;

819 continue;	1169 continue;

820 }	1170 }

821 if (identical(next, quoteChar)) {	1171 if (identical(next, quoteChar)) {

822 next = advance();	1172 next = advance();

823 if (identical(next, quoteChar)) {	1173 if (identical(next, quoteChar)) {

824 next = advance();	1174 next = advance();

825 if (identical(next, quoteChar)) {	1175 if (identical(next, quoteChar)) {

826 appendByteStringToken(STRING_INFO, utf8String(start, 0));	1176 if (!asciiOnlyLine) handleUnicode(unicodeStart);

827 return advance();	1177 next = advance();

	1178 appendSubstringToken(STRING_INFO, start, asciiOnlyString);

	1179 return next;

828 }	1180 }

829 }	1181 }

830 continue;	1182 continue;

831 }	1183 }

832 if (identical(next, $BACKSLASH)) {	1184 if (identical(next, $BACKSLASH)) {

833 next = advance();	1185 next = advance();

834 if (identical(next, $EOF)) break;	1186 if (identical(next, $EOF)) break;

835 }	1187 }

	1188 if (identical(next, $LF)) {

	1189 if (!asciiOnlyLine) {

	1190 // Synchronize the string offset in the utf8 scanner.

	1191 handleUnicode(unicodeStart);

	1192 asciiOnlyLine = true;

	1193 unicodeStart = scanOffset;

	1194 }

	1195 lineFeedInMultiline();

	1196 } else if (next > 127) {

	1197 asciiOnlyString = false;

	1198 asciiOnlyLine = false;

	1199 }

836 next = advance();	1200 next = advance();

837 }	1201 }

838 return error(const SourceString("unterminated string literal"));	1202 if (!asciiOnlyLine) handleUnicode(unicodeStart);

	1203 return error("unterminated string literal");

839 }	1204 }

840	1205

841 int error(SourceString message) {	1206 int error(String message) {

842 appendByteStringToken(BAD_INPUT_INFO, message);	1207 appendStringToken(BAD_INPUT_INFO, message);

843 return advance(); // Ensure progress.	1208 return advance(); // Ensure progress.

844 }	1209 }

	1210

	1211 void unmatchedBeginGroup(BeginGroupToken begin) {

	1212 String error = 'unmatched "${begin.stringValue}"';

	1213 Token close =

	1214 new StringToken.fromString(

	1215 BAD_INPUT_INFO, error, begin.charOffset, true);

	1216

	1217 // We want to ensure that unmatched BeginGroupTokens are reported

	1218 // as errors. However, the rest of the parser assumes the groups

	1219 // are well-balanced and will never look at the endGroup

	1220 // token. This is a nice property that allows us to skip quickly

	1221 // over correct code. By inserting an additional error token in

	1222 // the stream, we can keep ignoring endGroup tokens.

	1223 //

	1224 // [begin] --next--> [tail]

	1225 // [begin] --endG--> [close] --next--> [next] --next--> [tail]

	1226 //

	1227 // This allows the parser to skip from [begin] via endGroup to [close] and

	1228 // ignore the [close] token (assuming it's correct), then the error will be

	1229 // reported when parsing the [next] token.

	1230

	1231 Token next = new StringToken.fromString(

	1232 BAD_INPUT_INFO, error, begin.charOffset, true);

	1233 begin.endGroup = close;

	1234 close.next = next;

	1235 next.next = begin.next;

	1236 }

845 }	1237 }

OLD	NEW