| Index: pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart
|
| diff --git a/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart b/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart
|
| index b0914d43609ce0fff24aa5006e62f8415ef57695..98f3293c3d5c86d34b26e730a21e36c3ac522a9a 100644
|
| --- a/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart
|
| +++ b/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart
|
| @@ -8,6 +8,9 @@ import 'dart:convert' show
|
| UNICODE_BOM_CHARACTER_RUNE,
|
| UTF8;
|
|
|
| +import '../scanner.dart' show
|
| + unicodeReplacementCharacter;
|
| +
|
| import 'precedence.dart' show
|
| PrecedenceInfo;
|
|
|
| @@ -103,27 +106,37 @@ class Utf8BytesScanner extends ArrayBasedScanner {
|
|
|
| int peek() => bytes[byteOffset + 1];
|
|
|
| - /**
|
| - * Returns the unicode code point starting at the byte offset [startOffset]
|
| - * with the byte [nextByte]. If [advance] is true the current [byteOffset]
|
| - * is advanced to the last byte of the code point.
|
| - */
|
| - int nextCodePoint(int startOffset, int nextByte, bool advance) {
|
| - // The number of 1s in the first byte indicate the number of bytes, at
|
| - // least 2.
|
| - int numBytes = 2;
|
| - int bit = 0x20;
|
| - while ((nextByte & bit) != 0) {
|
| + /// Returns the unicode code point starting at the byte offset [startOffset]
|
| + /// with the byte [nextByte].
|
| + int nextCodePoint(int startOffset, int nextByte) {
|
| + int expectedHighBytes;
|
| + if (nextByte < 0xC2) {
|
| + expectedHighBytes = 1; // Bad code unit.
|
| + } else if (nextByte < 0xE0) {
|
| + expectedHighBytes = 2;
|
| + } else if (nextByte < 0xF0) {
|
| + expectedHighBytes = 3;
|
| + } else if (nextByte < 0xF5) {
|
| + expectedHighBytes = 4;
|
| + } else {
|
| + expectedHighBytes = 1; // Bad code unit.
|
| + }
|
| + int numBytes = 0;
|
| + for (int i = 0; i < expectedHighBytes; i++) {
|
| + if (bytes[byteOffset + i] < 0x80) {
|
| + break;
|
| + }
|
| numBytes++;
|
| - bit >>= 1;
|
| }
|
| int end = startOffset + numBytes;
|
| - if (advance) {
|
| - byteOffset = end - 1;
|
| + byteOffset = end - 1;
|
| + if (expectedHighBytes == 1 || numBytes != expectedHighBytes) {
|
| + return unicodeReplacementCharacter;
|
| }
|
| // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
|
| // _Utf8Decoder instance. Also the sublist is eagerly allocated.
|
| - String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
|
| + String codePoint =
|
| + UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true);
|
| if (codePoint.length == 0) {
|
| // The UTF-8 decoder discards leading BOM characters.
|
| // TODO(floitsch): don't just assume that removed characters were the
|
| @@ -132,23 +145,19 @@ class Utf8BytesScanner extends ArrayBasedScanner {
|
| codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
|
| }
|
| if (codePoint.length == 1) {
|
| - if (advance) {
|
| - utf8Slack += (numBytes - 1);
|
| - scanSlack = numBytes - 1;
|
| - scanSlackOffset = byteOffset;
|
| - }
|
| + utf8Slack += (numBytes - 1);
|
| + scanSlack = numBytes - 1;
|
| + scanSlackOffset = byteOffset;
|
| return codePoint.codeUnitAt(0);
|
| } else if (codePoint.length == 2) {
|
| - if (advance) {
|
| - utf8Slack += (numBytes - 2);
|
| - scanSlack = numBytes - 1;
|
| - scanSlackOffset = byteOffset;
|
| - stringOffsetSlackOffset = byteOffset;
|
| - }
|
| + utf8Slack += (numBytes - 2);
|
| + scanSlack = numBytes - 1;
|
| + scanSlackOffset = byteOffset;
|
| + stringOffsetSlackOffset = byteOffset;
|
| // In case of a surrogate pair, return a single code point.
|
| return codePoint.runes.single;
|
| } else {
|
| - throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
|
| + return unicodeReplacementCharacter;
|
| }
|
| }
|
|
|
| @@ -157,7 +166,7 @@ class Utf8BytesScanner extends ArrayBasedScanner {
|
| if (next < 128) return next;
|
| // Check if currentAsUnicode was already invoked.
|
| if (byteOffset == lastUnicodeOffset) return next;
|
| - int res = nextCodePoint(byteOffset, next, true);
|
| + int res = nextCodePoint(byteOffset, next);
|
| lastUnicodeOffset = byteOffset;
|
| return res;
|
| }
|
| @@ -165,7 +174,8 @@ class Utf8BytesScanner extends ArrayBasedScanner {
|
| void handleUnicode(int startScanOffset) {
|
| int end = byteOffset;
|
| // TODO(lry): this measurably slows down the scanner for files with unicode.
|
| - String s = UTF8.decode(bytes.sublist(startScanOffset, end));
|
| + String s =
|
| + UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true);
|
| utf8Slack += (end - startScanOffset) - s.length;
|
| }
|
|
|
|
|