pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart - Issue 2664593002: Port parser and scanner fixes from rasta branch.

Unified Diff: pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart

Issue 2664593002: Port parser and scanner fixes from rasta branch. (Closed)

Patch Set: Update status files. Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart

diff --git a/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart b/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart

index 301801ef08a08f517b03285cf05d77d7619fb2a7..b7bfddb7bd4065427a01177818c6b9be1152ce28 100644

--- a/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart

+++ b/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart

@@ -8,6 +8,9 @@ import 'dart:convert' show

UNICODE_BOM_CHARACTER_RUNE,

UTF8;

+import '../dart_scanner.dart' show

+ unicodeReplacementCharacter;

import 'precedence.dart' show

PrecedenceInfo;

@@ -108,22 +111,35 @@ class Utf8BytesScanner extends ArrayBasedScanner {

* with the byte [nextByte]. If [advance] is true the current [byteOffset]

Johnni Winther 2017/01/30 09:04:38 Remove doc about [advance].

ahe 2017/01/30 13:26:22 Done.

* is advanced to the last byte of the code point.

- int nextCodePoint(int startOffset, int nextByte, bool advance) {

- // The number of 1s in the first byte indicate the number of bytes, at

- // least 2.

- int numBytes = 2;

- int bit = 0x20;

- while ((nextByte & bit) != 0) {

+ int nextCodePoint(int startOffset, int nextByte) {

+ int expectedHighBytes;

+ if (nextByte < 0xC2) {

+ expectedHighBytes = 1; // Bad code unit.

+ } else if (nextByte < 0xE0) {

+ expectedHighBytes = 2;

+ } else if (nextByte < 0xF0) {

+ expectedHighBytes = 3;

+ } else if (nextByte < 0xF5) {

+ expectedHighBytes = 4;

+ } else {

+ expectedHighBytes = 1; // Bad code unit.

+ }

+ int numBytes = 0;

+ for (int i = 0; i < expectedHighBytes; i++) {

+ if (bytes[byteOffset + i] < 0x80) {

+ break;

+ }

numBytes++;

- bit >>= 1;

}

int end = startOffset + numBytes;

- if (advance) {

- byteOffset = end - 1;

+ byteOffset = end - 1;

+ if (expectedHighBytes == 1 || numBytes != expectedHighBytes) {

+ return unicodeReplacementCharacter;

}

// TODO(lry): measurably slow, decode creates first a Utf8Decoder and a

// _Utf8Decoder instance. Also the sublist is eagerly allocated.

- String codePoint = UTF8.decode(bytes.sublist(startOffset, end));

+ String codePoint =

+ UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true);

if (codePoint.length == 0) {

// The UTF-8 decoder discards leading BOM characters.

// TODO(floitsch): don't just assume that removed characters were the

@@ -132,23 +148,19 @@ class Utf8BytesScanner extends ArrayBasedScanner {

codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);

}

if (codePoint.length == 1) {

- if (advance) {

- utf8Slack += (numBytes - 1);

- scanSlack = numBytes - 1;

- scanSlackOffset = byteOffset;

- }

+ utf8Slack += (numBytes - 1);

+ scanSlack = numBytes - 1;

+ scanSlackOffset = byteOffset;

return codePoint.codeUnitAt(0);

} else if (codePoint.length == 2) {

- if (advance) {

- utf8Slack += (numBytes - 2);

- scanSlack = numBytes - 1;

- scanSlackOffset = byteOffset;

- stringOffsetSlackOffset = byteOffset;

- }

+ utf8Slack += (numBytes - 2);

+ scanSlack = numBytes - 1;

+ scanSlackOffset = byteOffset;

+ stringOffsetSlackOffset = byteOffset;

// In case of a surrogate pair, return a single code point.

return codePoint.runes.single;

} else {

- throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";

+ return unicodeReplacementCharacter;

}

@@ -157,7 +169,7 @@ class Utf8BytesScanner extends ArrayBasedScanner {

if (next < 128) return next;

// Check if currentAsUnicode was already invoked.

if (byteOffset == lastUnicodeOffset) return next;

- int res = nextCodePoint(byteOffset, next, true);

+ int res = nextCodePoint(byteOffset, next);

lastUnicodeOffset = byteOffset;

return res;

}

@@ -165,7 +177,8 @@ class Utf8BytesScanner extends ArrayBasedScanner {

void handleUnicode(int startScanOffset) {

int end = byteOffset;

// TODO(lry): this measurably slows down the scanner for files with unicode.

- String s = UTF8.decode(bytes.sublist(startScanOffset, end));

+ String s =

+ UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true);

utf8Slack += (end - startScanOffset) - s.length;

}

« pkg/dart_scanner/lib/src/recover.dart ('K') | « pkg/dart_scanner/lib/src/token.dart ('k') | pkg/fasta/lib/src/errors.dart » ('j') | pkg/fasta/lib/src/source/source_loader.dart » ('J')