pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart - Issue 2664593002: Port parser and scanner fixes from rasta branch.

Side by Side Diff: pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart

Issue 2664593002: Port parser and scanner fixes from rasta branch. (Closed)

Patch Set: Rebased on ef8ec26cf36d1f07b4fdf5d605003210826ae1c2. Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 library fasta.scanner.utf8_bytes_scanner;	5 library fasta.scanner.utf8_bytes_scanner;

6	6

7 import 'dart:convert' show	7 import 'dart:convert' show

8 UNICODE_BOM_CHARACTER_RUNE,	8 UNICODE_BOM_CHARACTER_RUNE,

9 UTF8;	9 UTF8;

10	10

	11 import '../scanner.dart' show

	12 unicodeReplacementCharacter;

	13

11 import 'precedence.dart' show	14 import 'precedence.dart' show

12 PrecedenceInfo;	15 PrecedenceInfo;

13	16

14 import 'token.dart' show	17 import 'token.dart' show

15 StringToken,	18 StringToken,

16 Token;	19 Token;

17	20

18 import 'array_based_scanner.dart' show	21 import 'array_based_scanner.dart' show

19 ArrayBasedScanner;	22 ArrayBasedScanner;

20	23

(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
96 return offset + 3 < bytes.length &&	99 return offset + 3 < bytes.length &&

97 bytes[offset] == BOM_UTF8[0] &&	100 bytes[offset] == BOM_UTF8[0] &&

98 bytes[offset + 1] == BOM_UTF8[1] &&	101 bytes[offset + 1] == BOM_UTF8[1] &&

99 bytes[offset + 2] == BOM_UTF8[2];	102 bytes[offset + 2] == BOM_UTF8[2];

100 }	103 }

101	104

102 int advance() => bytes[++byteOffset];	105 int advance() => bytes[++byteOffset];

103	106

104 int peek() => bytes[byteOffset + 1];	107 int peek() => bytes[byteOffset + 1];

105	108

106 /**	109 /// Returns the unicode code point starting at the byte offset [startOffset]

107 * Returns the unicode code point starting at the byte offset [startOffset]	110 /// with the byte [nextByte].

108 * with the byte [nextByte]. If [advance] is true the current [byteOffset]	111 int nextCodePoint(int startOffset, int nextByte) {

109 * is advanced to the last byte of the code point.	112 int expectedHighBytes;

110 */	113 if (nextByte < 0xC2) {

111 int nextCodePoint(int startOffset, int nextByte, bool advance) {	114 expectedHighBytes = 1; // Bad code unit.

112 // The number of 1s in the first byte indicate the number of bytes, at	115 } else if (nextByte < 0xE0) {

113 // least 2.	116 expectedHighBytes = 2;

114 int numBytes = 2;	117 } else if (nextByte < 0xF0) {

115 int bit = 0x20;	118 expectedHighBytes = 3;

116 while ((nextByte & bit) != 0) {	119 } else if (nextByte < 0xF5) {

	120 expectedHighBytes = 4;

	121 } else {

	122 expectedHighBytes = 1; // Bad code unit.

	123 }

	124 int numBytes = 0;

	125 for (int i = 0; i < expectedHighBytes; i++) {

	126 if (bytes[byteOffset + i] < 0x80) {

	127 break;

	128 }

117 numBytes++;	129 numBytes++;

118 bit >>= 1;

119 }	130 }

120 int end = startOffset + numBytes;	131 int end = startOffset + numBytes;

121 if (advance) {	132 byteOffset = end - 1;

122 byteOffset = end - 1;	133 if (expectedHighBytes == 1 || numBytes != expectedHighBytes) {

	134 return unicodeReplacementCharacter;

123 }	135 }

124 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a	136 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a

125 // _Utf8Decoder instance. Also the sublist is eagerly allocated.	137 // _Utf8Decoder instance. Also the sublist is eagerly allocated.

126 String codePoint = UTF8.decode(bytes.sublist(startOffset, end));	138 String codePoint =

	139 UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true);

127 if (codePoint.length == 0) {	140 if (codePoint.length == 0) {

128 // The UTF-8 decoder discards leading BOM characters.	141 // The UTF-8 decoder discards leading BOM characters.

129 // TODO(floitsch): don't just assume that removed characters were the	142 // TODO(floitsch): don't just assume that removed characters were the

130 // BOM.	143 // BOM.

131 assert(containsBomAt(startOffset));	144 assert(containsBomAt(startOffset));

132 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);	145 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);

133 }	146 }

134 if (codePoint.length == 1) {	147 if (codePoint.length == 1) {

135 if (advance) {	148 utf8Slack += (numBytes - 1);

136 utf8Slack += (numBytes - 1);	149 scanSlack = numBytes - 1;

137 scanSlack = numBytes - 1;	150 scanSlackOffset = byteOffset;

138 scanSlackOffset = byteOffset;

139 }

140 return codePoint.codeUnitAt(0);	151 return codePoint.codeUnitAt(0);

141 } else if (codePoint.length == 2) {	152 } else if (codePoint.length == 2) {

142 if (advance) {	153 utf8Slack += (numBytes - 2);

143 utf8Slack += (numBytes - 2);	154 scanSlack = numBytes - 1;

144 scanSlack = numBytes - 1;	155 scanSlackOffset = byteOffset;

145 scanSlackOffset = byteOffset;	156 stringOffsetSlackOffset = byteOffset;

146 stringOffsetSlackOffset = byteOffset;

147 }

148 // In case of a surrogate pair, return a single code point.	157 // In case of a surrogate pair, return a single code point.

149 return codePoint.runes.single;	158 return codePoint.runes.single;

150 } else {	159 } else {

151 throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";	160 return unicodeReplacementCharacter;

152 }	161 }

153 }	162 }

154	163

155 int lastUnicodeOffset = -1;	164 int lastUnicodeOffset = -1;

156 int currentAsUnicode(int next) {	165 int currentAsUnicode(int next) {

157 if (next < 128) return next;	166 if (next < 128) return next;

158 // Check if currentAsUnicode was already invoked.	167 // Check if currentAsUnicode was already invoked.

159 if (byteOffset == lastUnicodeOffset) return next;	168 if (byteOffset == lastUnicodeOffset) return next;

160 int res = nextCodePoint(byteOffset, next, true);	169 int res = nextCodePoint(byteOffset, next);

161 lastUnicodeOffset = byteOffset;	170 lastUnicodeOffset = byteOffset;

162 return res;	171 return res;

163 }	172 }

164	173

165 void handleUnicode(int startScanOffset) {	174 void handleUnicode(int startScanOffset) {

166 int end = byteOffset;	175 int end = byteOffset;

167 // TODO(lry): this measurably slows down the scanner for files with unicode.	176 // TODO(lry): this measurably slows down the scanner for files with unicode.

168 String s = UTF8.decode(bytes.sublist(startScanOffset, end));	177 String s =

	178 UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true);

169 utf8Slack += (end - startScanOffset) - s.length;	179 utf8Slack += (end - startScanOffset) - s.length;

170 }	180 }

171	181

172 /**	182 /**

173 * This field remembers the byte offset of the last character decoded with	183 * This field remembers the byte offset of the last character decoded with

174 * [nextCodePoint] that used two code units in UTF-16.	184 * [nextCodePoint] that used two code units in UTF-16.

175 *	185 *

176 * [nextCodePoint] returns a single code point for each unicode character,	186 * [nextCodePoint] returns a single code point for each unicode character,

177 * even if it needs two code units in UTF-16.	187 * even if it needs two code units in UTF-16.

178 *	188 *

(...skipping 18 matching lines...) Expand all Loading...
197	207

198 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,	208 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,

199 [int extraOffset = 0]) {	209 [int extraOffset = 0]) {

200 tail.next = new StringToken.fromUtf8Bytes(	210 tail.next = new StringToken.fromUtf8Bytes(

201 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);	211 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);

202 tail = tail.next;	212 tail = tail.next;

203 }	213 }

204	214

205 bool atEndOfFile() => byteOffset >= bytes.length - 1;	215 bool atEndOfFile() => byteOffset >= bytes.length - 1;

206 }	216 }

OLD	NEW

« no previous file with comments | « pkg/front_end/lib/src/fasta/scanner/token.dart ('k') | pkg/front_end/lib/src/fasta/source/source_loader.dart » ('j') | no next file with comments »