pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart - Issue 2664593002: Port parser and scanner fixes from rasta branch.

Side by Side Diff: pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart

Issue 2664593002: Port parser and scanner fixes from rasta branch. (Closed)

Patch Set: Update status files. Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 library dart_scanner.utf8_bytes_scanner;	5 library dart_scanner.utf8_bytes_scanner;

6	6

7 import 'dart:convert' show	7 import 'dart:convert' show

8 UNICODE_BOM_CHARACTER_RUNE,	8 UNICODE_BOM_CHARACTER_RUNE,

9 UTF8;	9 UTF8;

10	10

	11 import '../dart_scanner.dart' show

	12 unicodeReplacementCharacter;

	13

11 import 'precedence.dart' show	14 import 'precedence.dart' show

12 PrecedenceInfo;	15 PrecedenceInfo;

13	16

14 import 'token.dart' show	17 import 'token.dart' show

15 StringToken,	18 StringToken,

16 Token;	19 Token;

17	20

18 import 'array_based_scanner.dart' show	21 import 'array_based_scanner.dart' show

19 ArrayBasedScanner;	22 ArrayBasedScanner;

20	23

(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
98 bytes[offset + 1] == BOM_UTF8[1] &&	101 bytes[offset + 1] == BOM_UTF8[1] &&

99 bytes[offset + 2] == BOM_UTF8[2];	102 bytes[offset + 2] == BOM_UTF8[2];

100 }	103 }

101	104

102 int advance() => bytes[++byteOffset];	105 int advance() => bytes[++byteOffset];

103	106

104 int peek() => bytes[byteOffset + 1];	107 int peek() => bytes[byteOffset + 1];

105	108

106 /**	109 /**

107 * Returns the unicode code point starting at the byte offset [startOffset]	110 * Returns the unicode code point starting at the byte offset [startOffset]

108 * with the byte [nextByte]. If [advance] is true the current [byteOffset]	111 * with the byte [nextByte]. If [advance] is true the current [byteOffset]
	Johnni Winther 2017/01/30 09:04:38: Remove doc about [advance]. ahe 2017/01/30 13:26:22 (in reply to Johnni Winther, 2017/01/30 09:04:38): Done.
109 * is advanced to the last byte of the code point.	112 * is advanced to the last byte of the code point.

110 */	113 */

111 int nextCodePoint(int startOffset, int nextByte, bool advance) {	114 int nextCodePoint(int startOffset, int nextByte) {

112 // The number of 1s in the first byte indicate the number of bytes, at	115 int expectedHighBytes;

113 // least 2.	116 if (nextByte < 0xC2) {

114 int numBytes = 2;	117 expectedHighBytes = 1; // Bad code unit.

115 int bit = 0x20;	118 } else if (nextByte < 0xE0) {

116 while ((nextByte & bit) != 0) {	119 expectedHighBytes = 2;

	120 } else if (nextByte < 0xF0) {

	121 expectedHighBytes = 3;

	122 } else if (nextByte < 0xF5) {

	123 expectedHighBytes = 4;

	124 } else {

	125 expectedHighBytes = 1; // Bad code unit.

	126 }

	127 int numBytes = 0;

	128 for (int i = 0; i < expectedHighBytes; i++) {

	129 if (bytes[byteOffset + i] < 0x80) {

	130 break;

	131 }

117 numBytes++;	132 numBytes++;

118 bit >>= 1;

119 }	133 }

120 int end = startOffset + numBytes;	134 int end = startOffset + numBytes;

121 if (advance) {	135 byteOffset = end - 1;

122 byteOffset = end - 1;	136 if (expectedHighBytes == 1 || numBytes != expectedHighBytes) {

	137 return unicodeReplacementCharacter;

123 }	138 }

124 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a	139 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a

125 // _Utf8Decoder instance. Also the sublist is eagerly allocated.	140 // _Utf8Decoder instance. Also the sublist is eagerly allocated.

126 String codePoint = UTF8.decode(bytes.sublist(startOffset, end));	141 String codePoint =

	142 UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true);

127 if (codePoint.length == 0) {	143 if (codePoint.length == 0) {

128 // The UTF-8 decoder discards leading BOM characters.	144 // The UTF-8 decoder discards leading BOM characters.

129 // TODO(floitsch): don't just assume that removed characters were the	145 // TODO(floitsch): don't just assume that removed characters were the

130 // BOM.	146 // BOM.

131 assert(containsBomAt(startOffset));	147 assert(containsBomAt(startOffset));

132 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);	148 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);

133 }	149 }

134 if (codePoint.length == 1) {	150 if (codePoint.length == 1) {

135 if (advance) {	151 utf8Slack += (numBytes - 1);

136 utf8Slack += (numBytes - 1);	152 scanSlack = numBytes - 1;

137 scanSlack = numBytes - 1;	153 scanSlackOffset = byteOffset;

138 scanSlackOffset = byteOffset;

139 }

140 return codePoint.codeUnitAt(0);	154 return codePoint.codeUnitAt(0);

141 } else if (codePoint.length == 2) {	155 } else if (codePoint.length == 2) {

142 if (advance) {	156 utf8Slack += (numBytes - 2);

143 utf8Slack += (numBytes - 2);	157 scanSlack = numBytes - 1;

144 scanSlack = numBytes - 1;	158 scanSlackOffset = byteOffset;

145 scanSlackOffset = byteOffset;	159 stringOffsetSlackOffset = byteOffset;

146 stringOffsetSlackOffset = byteOffset;

147 }

148 // In case of a surrogate pair, return a single code point.	160 // In case of a surrogate pair, return a single code point.

149 return codePoint.runes.single;	161 return codePoint.runes.single;

150 } else {	162 } else {

151 throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";	163 return unicodeReplacementCharacter;

152 }	164 }

153 }	165 }

154	166

155 int lastUnicodeOffset = -1;	167 int lastUnicodeOffset = -1;

156 int currentAsUnicode(int next) {	168 int currentAsUnicode(int next) {

157 if (next < 128) return next;	169 if (next < 128) return next;

158 // Check if currentAsUnicode was already invoked.	170 // Check if currentAsUnicode was already invoked.

159 if (byteOffset == lastUnicodeOffset) return next;	171 if (byteOffset == lastUnicodeOffset) return next;

160 int res = nextCodePoint(byteOffset, next, true);	172 int res = nextCodePoint(byteOffset, next);

161 lastUnicodeOffset = byteOffset;	173 lastUnicodeOffset = byteOffset;

162 return res;	174 return res;

163 }	175 }

164	176

165 void handleUnicode(int startScanOffset) {	177 void handleUnicode(int startScanOffset) {

166 int end = byteOffset;	178 int end = byteOffset;

167 // TODO(lry): this measurably slows down the scanner for files with unicode.	179 // TODO(lry): this measurably slows down the scanner for files with unicode.

168 String s = UTF8.decode(bytes.sublist(startScanOffset, end));	180 String s =

	181 UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true);

169 utf8Slack += (end - startScanOffset) - s.length;	182 utf8Slack += (end - startScanOffset) - s.length;

170 }	183 }

171	184

172 /**	185 /**

173 * This field remembers the byte offset of the last character decoded with	186 * This field remembers the byte offset of the last character decoded with

174 * [nextCodePoint] that used two code units in UTF-16.	187 * [nextCodePoint] that used two code units in UTF-16.

175 *	188 *

176 * [nextCodePoint] returns a single code point for each unicode character,	189 * [nextCodePoint] returns a single code point for each unicode character,

177 * even if it needs two code units in UTF-16.	190 * even if it needs two code units in UTF-16.

178 *	191 *

(...skipping 18 matching lines...) Expand all Loading...
197	210

198 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,	211 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,

199 [int extraOffset = 0]) {	212 [int extraOffset = 0]) {

200 tail.next = new StringToken.fromUtf8Bytes(	213 tail.next = new StringToken.fromUtf8Bytes(

201 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);	214 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);

202 tail = tail.next;	215 tail = tail.next;

203 }	216 }

204	217

205 bool atEndOfFile() => byteOffset >= bytes.length - 1;	218 bool atEndOfFile() => byteOffset >= bytes.length - 1;

206 }	219 }

OLD	NEW

« pkg/dart_scanner/lib/src/recover.dart ('K') | « pkg/dart_scanner/lib/src/token.dart ('k') | pkg/fasta/lib/src/errors.dart » ('j') | pkg/fasta/lib/src/source/source_loader.dart » ('J')