pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart - Issue 2664593002: Port parser and scanner fixes from rasta branch.

Unified Diff: pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart

Issue 2664593002: Port parser and scanner fixes from rasta branch. (Closed)

Patch Set: Rebased on ef8ec26cf36d1f07b4fdf5d605003210826ae1c2. (Created 3 years, 11 months ago.)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart

diff --git a/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart b/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart

index b0914d43609ce0fff24aa5006e62f8415ef57695..98f3293c3d5c86d34b26e730a21e36c3ac522a9a 100644

--- a/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart

+++ b/pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart

@@ -8,6 +8,9 @@ import 'dart:convert' show

UNICODE_BOM_CHARACTER_RUNE,

UTF8;

+import '../scanner.dart' show

+ unicodeReplacementCharacter;

import 'precedence.dart' show

PrecedenceInfo;

@@ -103,27 +106,37 @@ class Utf8BytesScanner extends ArrayBasedScanner {

int peek() => bytes[byteOffset + 1];

- /**

- * Returns the unicode code point starting at the byte offset [startOffset]

- * with the byte [nextByte]. If [advance] is true the current [byteOffset]

- * is advanced to the last byte of the code point.

- */

- int nextCodePoint(int startOffset, int nextByte, bool advance) {

- // The number of 1s in the first byte indicate the number of bytes, at

- // least 2.

- int numBytes = 2;

- int bit = 0x20;

- while ((nextByte & bit) != 0) {

+ /// Returns the unicode code point starting at the byte offset [startOffset]

+ /// with the byte [nextByte].

+ int nextCodePoint(int startOffset, int nextByte) {

+ int expectedHighBytes;

+ if (nextByte < 0xC2) {

+ expectedHighBytes = 1; // Bad code unit.

+ } else if (nextByte < 0xE0) {

+ expectedHighBytes = 2;

+ } else if (nextByte < 0xF0) {

+ expectedHighBytes = 3;

+ } else if (nextByte < 0xF5) {

+ expectedHighBytes = 4;

+ } else {

+ expectedHighBytes = 1; // Bad code unit.

+ }

+ int numBytes = 0;

+ for (int i = 0; i < expectedHighBytes; i++) {

+ if (bytes[byteOffset + i] < 0x80) {

+ break;

+ }

numBytes++;

- bit >>= 1;

}

int end = startOffset + numBytes;

- if (advance) {

- byteOffset = end - 1;

+ byteOffset = end - 1;

+ if (expectedHighBytes == 1 || numBytes != expectedHighBytes) {

+ return unicodeReplacementCharacter;

}

// TODO(lry): measurably slow, decode creates first a Utf8Decoder and a

// _Utf8Decoder instance. Also the sublist is eagerly allocated.

- String codePoint = UTF8.decode(bytes.sublist(startOffset, end));

+ String codePoint =

+ UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true);

if (codePoint.length == 0) {

// The UTF-8 decoder discards leading BOM characters.

// TODO(floitsch): don't just assume that removed characters were the

@@ -132,23 +145,19 @@ class Utf8BytesScanner extends ArrayBasedScanner {

codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);

}

if (codePoint.length == 1) {

- if (advance) {

- utf8Slack += (numBytes - 1);

- scanSlack = numBytes - 1;

- scanSlackOffset = byteOffset;

- }

+ utf8Slack += (numBytes - 1);

+ scanSlack = numBytes - 1;

+ scanSlackOffset = byteOffset;

return codePoint.codeUnitAt(0);

} else if (codePoint.length == 2) {

- if (advance) {

- utf8Slack += (numBytes - 2);

- scanSlack = numBytes - 1;

- scanSlackOffset = byteOffset;

- stringOffsetSlackOffset = byteOffset;

- }

+ utf8Slack += (numBytes - 2);

+ scanSlack = numBytes - 1;

+ scanSlackOffset = byteOffset;

+ stringOffsetSlackOffset = byteOffset;

// In case of a surrogate pair, return a single code point.

return codePoint.runes.single;

} else {

- throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";

+ return unicodeReplacementCharacter;

}

@@ -157,7 +166,7 @@ class Utf8BytesScanner extends ArrayBasedScanner {

if (next < 128) return next;

// Check if currentAsUnicode was already invoked.

if (byteOffset == lastUnicodeOffset) return next;

- int res = nextCodePoint(byteOffset, next, true);

+ int res = nextCodePoint(byteOffset, next);

lastUnicodeOffset = byteOffset;

return res;

}

@@ -165,7 +174,8 @@ class Utf8BytesScanner extends ArrayBasedScanner {

void handleUnicode(int startScanOffset) {

int end = byteOffset;

// TODO(lry): this measurably slows down the scanner for files with unicode.

- String s = UTF8.decode(bytes.sublist(startScanOffset, end));

+ String s =

+ UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true);

utf8Slack += (end - startScanOffset) - s.length;

}

« no previous file with comments | « pkg/front_end/lib/src/fasta/scanner/token.dart ('k') | pkg/front_end/lib/src/fasta/source/source_loader.dart » ('j') | no next file with comments »