Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(43)

Unified Diff: pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart

Issue 2664593002: Port parser and scanner fixes from rasta branch. (Closed)
Patch Set: Update status files. Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart
diff --git a/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart b/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart
index 301801ef08a08f517b03285cf05d77d7619fb2a7..b7bfddb7bd4065427a01177818c6b9be1152ce28 100644
--- a/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart
+++ b/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart
@@ -8,6 +8,9 @@ import 'dart:convert' show
UNICODE_BOM_CHARACTER_RUNE,
UTF8;
+import '../dart_scanner.dart' show
+ unicodeReplacementCharacter;
+
import 'precedence.dart' show
PrecedenceInfo;
@@ -108,22 +111,35 @@ class Utf8BytesScanner extends ArrayBasedScanner {
* with the byte [nextByte]. If [advance] is true the current [byteOffset]
Johnni Winther 2017/01/30 09:04:38 Remove doc about [advance].
ahe 2017/01/30 13:26:22 Done.
* is advanced to the last byte of the code point.
*/
- int nextCodePoint(int startOffset, int nextByte, bool advance) {
- // The number of 1s in the first byte indicate the number of bytes, at
- // least 2.
- int numBytes = 2;
- int bit = 0x20;
- while ((nextByte & bit) != 0) {
+ int nextCodePoint(int startOffset, int nextByte) {
+ int expectedHighBytes;
+ if (nextByte < 0xC2) {
+ expectedHighBytes = 1; // Bad code unit.
+ } else if (nextByte < 0xE0) {
+ expectedHighBytes = 2;
+ } else if (nextByte < 0xF0) {
+ expectedHighBytes = 3;
+ } else if (nextByte < 0xF5) {
+ expectedHighBytes = 4;
+ } else {
+ expectedHighBytes = 1; // Bad code unit.
+ }
+ int numBytes = 0;
+ for (int i = 0; i < expectedHighBytes; i++) {
+ if (bytes[byteOffset + i] < 0x80) {
+ break;
+ }
numBytes++;
- bit >>= 1;
}
int end = startOffset + numBytes;
- if (advance) {
- byteOffset = end - 1;
+ byteOffset = end - 1;
+ if (expectedHighBytes == 1 || numBytes != expectedHighBytes) {
+ return unicodeReplacementCharacter;
}
// TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
// _Utf8Decoder instance. Also the sublist is eagerly allocated.
- String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
+ String codePoint =
+ UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true);
if (codePoint.length == 0) {
// The UTF-8 decoder discards leading BOM characters.
// TODO(floitsch): don't just assume that removed characters were the
@@ -132,23 +148,19 @@ class Utf8BytesScanner extends ArrayBasedScanner {
codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
}
if (codePoint.length == 1) {
- if (advance) {
- utf8Slack += (numBytes - 1);
- scanSlack = numBytes - 1;
- scanSlackOffset = byteOffset;
- }
+ utf8Slack += (numBytes - 1);
+ scanSlack = numBytes - 1;
+ scanSlackOffset = byteOffset;
return codePoint.codeUnitAt(0);
} else if (codePoint.length == 2) {
- if (advance) {
- utf8Slack += (numBytes - 2);
- scanSlack = numBytes - 1;
- scanSlackOffset = byteOffset;
- stringOffsetSlackOffset = byteOffset;
- }
+ utf8Slack += (numBytes - 2);
+ scanSlack = numBytes - 1;
+ scanSlackOffset = byteOffset;
+ stringOffsetSlackOffset = byteOffset;
// In case of a surrogate pair, return a single code point.
return codePoint.runes.single;
} else {
- throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
+ return unicodeReplacementCharacter;
}
}
@@ -157,7 +169,7 @@ class Utf8BytesScanner extends ArrayBasedScanner {
if (next < 128) return next;
// Check if currentAsUnicode was already invoked.
if (byteOffset == lastUnicodeOffset) return next;
- int res = nextCodePoint(byteOffset, next, true);
+ int res = nextCodePoint(byteOffset, next);
lastUnicodeOffset = byteOffset;
return res;
}
@@ -165,7 +177,8 @@ class Utf8BytesScanner extends ArrayBasedScanner {
void handleUnicode(int startScanOffset) {
int end = byteOffset;
// TODO(lry): this measurably slows down the scanner for files with unicode.
- String s = UTF8.decode(bytes.sublist(startScanOffset, end));
+ String s =
+ UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true);
utf8Slack += (end - startScanOffset) - s.length;
}

Powered by Google App Engine
This is Rietveld 408576698