Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(433)

Unified Diff: pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart

Issue 2664593002: Port parser and scanner fixes from rasta branch. (Closed)
Patch Set: Rebased on ef8ec26cf36d1f07b4fdf5d605003210826ae1c2. Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart
diff --git a/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart b/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart
index b0914d43609ce0fff24aa5006e62f8415ef57695..98f3293c3d5c86d34b26e730a21e36c3ac522a9a 100644
--- a/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart
+++ b/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart
@@ -8,6 +8,9 @@ import 'dart:convert' show
UNICODE_BOM_CHARACTER_RUNE,
UTF8;
+import '../scanner.dart' show
+ unicodeReplacementCharacter;
+
import 'precedence.dart' show
PrecedenceInfo;
@@ -103,27 +106,37 @@ class Utf8BytesScanner extends ArrayBasedScanner {
int peek() => bytes[byteOffset + 1];
- /**
- * Returns the unicode code point starting at the byte offset [startOffset]
- * with the byte [nextByte]. If [advance] is true the current [byteOffset]
- * is advanced to the last byte of the code point.
- */
- int nextCodePoint(int startOffset, int nextByte, bool advance) {
- // The number of 1s in the first byte indicate the number of bytes, at
- // least 2.
- int numBytes = 2;
- int bit = 0x20;
- while ((nextByte & bit) != 0) {
+ /// Returns the unicode code point starting at the byte offset [startOffset]
+ /// with the byte [nextByte].
+ int nextCodePoint(int startOffset, int nextByte) {
+ int expectedHighBytes;
+ if (nextByte < 0xC2) {
+ expectedHighBytes = 1; // Bad code unit.
+ } else if (nextByte < 0xE0) {
+ expectedHighBytes = 2;
+ } else if (nextByte < 0xF0) {
+ expectedHighBytes = 3;
+ } else if (nextByte < 0xF5) {
+ expectedHighBytes = 4;
+ } else {
+ expectedHighBytes = 1; // Bad code unit.
+ }
+ int numBytes = 0;
+ for (int i = 0; i < expectedHighBytes; i++) {
+ if (bytes[byteOffset + i] < 0x80) {
+ break;
+ }
numBytes++;
- bit >>= 1;
}
int end = startOffset + numBytes;
- if (advance) {
- byteOffset = end - 1;
+ byteOffset = end - 1;
+ if (expectedHighBytes == 1 || numBytes != expectedHighBytes) {
+ return unicodeReplacementCharacter;
}
// TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
// _Utf8Decoder instance. Also the sublist is eagerly allocated.
- String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
+ String codePoint =
+ UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true);
if (codePoint.length == 0) {
// The UTF-8 decoder discards leading BOM characters.
// TODO(floitsch): don't just assume that removed characters were the
@@ -132,23 +145,19 @@ class Utf8BytesScanner extends ArrayBasedScanner {
codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
}
if (codePoint.length == 1) {
- if (advance) {
- utf8Slack += (numBytes - 1);
- scanSlack = numBytes - 1;
- scanSlackOffset = byteOffset;
- }
+ utf8Slack += (numBytes - 1);
+ scanSlack = numBytes - 1;
+ scanSlackOffset = byteOffset;
return codePoint.codeUnitAt(0);
} else if (codePoint.length == 2) {
- if (advance) {
- utf8Slack += (numBytes - 2);
- scanSlack = numBytes - 1;
- scanSlackOffset = byteOffset;
- stringOffsetSlackOffset = byteOffset;
- }
+ utf8Slack += (numBytes - 2);
+ scanSlack = numBytes - 1;
+ scanSlackOffset = byteOffset;
+ stringOffsetSlackOffset = byteOffset;
// In case of a surrogate pair, return a single code point.
return codePoint.runes.single;
} else {
- throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
+ return unicodeReplacementCharacter;
}
}
@@ -157,7 +166,7 @@ class Utf8BytesScanner extends ArrayBasedScanner {
if (next < 128) return next;
// Check if currentAsUnicode was already invoked.
if (byteOffset == lastUnicodeOffset) return next;
- int res = nextCodePoint(byteOffset, next, true);
+ int res = nextCodePoint(byteOffset, next);
lastUnicodeOffset = byteOffset;
return res;
}
@@ -165,7 +174,8 @@ class Utf8BytesScanner extends ArrayBasedScanner {
void handleUnicode(int startScanOffset) {
int end = byteOffset;
// TODO(lry): this measurably slows down the scanner for files with unicode.
- String s = UTF8.decode(bytes.sublist(startScanOffset, end));
+ String s =
+ UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true);
utf8Slack += (end - startScanOffset) - s.length;
}
« no previous file with comments | « pkg/front_end/lib/src/fasta/scanner/token.dart ('k') | pkg/front_end/lib/src/fasta/source/source_loader.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698