Chromium Code Reviews| Index: pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart |
| diff --git a/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart b/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart |
| index 301801ef08a08f517b03285cf05d77d7619fb2a7..b7bfddb7bd4065427a01177818c6b9be1152ce28 100644 |
| --- a/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart |
| +++ b/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart |
| @@ -8,6 +8,9 @@ import 'dart:convert' show |
| UNICODE_BOM_CHARACTER_RUNE, |
| UTF8; |
| +import '../dart_scanner.dart' show |
| + unicodeReplacementCharacter; |
| + |
| import 'precedence.dart' show |
| PrecedenceInfo; |
| @@ -108,22 +111,35 @@ class Utf8BytesScanner extends ArrayBasedScanner { |
| * with the byte [nextByte]. The current [byteOffset] |
|
Johnni Winther
2017/01/30 09:04:38
Remove doc about [advance].
ahe
2017/01/30 13:26:22
Done.
|
| * is advanced to the last byte of the code point. |
| */ |
| - int nextCodePoint(int startOffset, int nextByte, bool advance) { |
| - // The number of 1s in the first byte indicate the number of bytes, at |
| - // least 2. |
| - int numBytes = 2; |
| - int bit = 0x20; |
| - while ((nextByte & bit) != 0) { |
| + int nextCodePoint(int startOffset, int nextByte) { |
| + int expectedHighBytes; |
| + if (nextByte < 0xC2) { |
| + expectedHighBytes = 1; // Bad code unit. |
| + } else if (nextByte < 0xE0) { |
| + expectedHighBytes = 2; |
| + } else if (nextByte < 0xF0) { |
| + expectedHighBytes = 3; |
| + } else if (nextByte < 0xF5) { |
| + expectedHighBytes = 4; |
| + } else { |
| + expectedHighBytes = 1; // Bad code unit. |
| + } |
| + int numBytes = 0; |
| + for (int i = 0; i < expectedHighBytes; i++) { |
| + if (bytes[byteOffset + i] < 0x80) { |
| + break; |
| + } |
| numBytes++; |
| - bit >>= 1; |
| } |
| int end = startOffset + numBytes; |
| - if (advance) { |
| - byteOffset = end - 1; |
| + byteOffset = end - 1; |
| + if (expectedHighBytes == 1 || numBytes != expectedHighBytes) { |
| + return unicodeReplacementCharacter; |
| } |
| // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a |
| // _Utf8Decoder instance. Also the sublist is eagerly allocated. |
| - String codePoint = UTF8.decode(bytes.sublist(startOffset, end)); |
| + String codePoint = |
| + UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true); |
| if (codePoint.length == 0) { |
| // The UTF-8 decoder discards leading BOM characters. |
| // TODO(floitsch): don't just assume that removed characters were the |
| @@ -132,23 +148,19 @@ class Utf8BytesScanner extends ArrayBasedScanner { |
| codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE); |
| } |
| if (codePoint.length == 1) { |
| - if (advance) { |
| - utf8Slack += (numBytes - 1); |
| - scanSlack = numBytes - 1; |
| - scanSlackOffset = byteOffset; |
| - } |
| + utf8Slack += (numBytes - 1); |
| + scanSlack = numBytes - 1; |
| + scanSlackOffset = byteOffset; |
| return codePoint.codeUnitAt(0); |
| } else if (codePoint.length == 2) { |
| - if (advance) { |
| - utf8Slack += (numBytes - 2); |
| - scanSlack = numBytes - 1; |
| - scanSlackOffset = byteOffset; |
| - stringOffsetSlackOffset = byteOffset; |
| - } |
| + utf8Slack += (numBytes - 2); |
| + scanSlack = numBytes - 1; |
| + scanSlackOffset = byteOffset; |
| + stringOffsetSlackOffset = byteOffset; |
| // In case of a surrogate pair, return a single code point. |
| return codePoint.runes.single; |
| } else { |
| - throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}"; |
| + return unicodeReplacementCharacter; |
| } |
| } |
| @@ -157,7 +169,7 @@ class Utf8BytesScanner extends ArrayBasedScanner { |
| if (next < 128) return next; |
| // Check if currentAsUnicode was already invoked. |
| if (byteOffset == lastUnicodeOffset) return next; |
| - int res = nextCodePoint(byteOffset, next, true); |
| + int res = nextCodePoint(byteOffset, next); |
| lastUnicodeOffset = byteOffset; |
| return res; |
| } |
| @@ -165,7 +177,8 @@ class Utf8BytesScanner extends ArrayBasedScanner { |
| void handleUnicode(int startScanOffset) { |
| int end = byteOffset; |
| // TODO(lry): this measurably slows down the scanner for files with unicode. |
| - String s = UTF8.decode(bytes.sublist(startScanOffset, end)); |
| + String s = |
| + UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true); |
| utf8Slack += (end - startScanOffset) - s.length; |
| } |