pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart - Issue 2644843006: Use packages dart_parser, dart_scanner, and compiler_util.

Unified Diff: pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart

Issue 2644843006: Use packages dart_parser, dart_scanner, and compiler_util. (Closed)

Patch Set: Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« pkg/compiler/lib/src/resolution/enum_creator.dart ('K') | « pkg/compiler/lib/src/scanner/string_scanner.dart ('k') | pkg/compiler/lib/src/serialization/modelz.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart

diff --git a/pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart b/pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart

deleted file mode 100644

index ea46e35457c8ed133b82c176a63d0a9b841a4f5f..0000000000000000000000000000000000000000

--- a/pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart

+++ /dev/null

@@ -1,214 +0,0 @@

-// BSD-style license that can be found in the LICENSE file.

-library dart2js.scanner.utf8;

-import 'dart:convert' show UNICODE_BOM_CHARACTER_RUNE, UTF8;

-import '../io/source_file.dart' show SourceFile;

-import '../tokens/precedence.dart' show PrecedenceInfo;

-import '../tokens/token.dart' show StringToken, Token;

-import 'array_based_scanner.dart' show ArrayBasedScanner;

-/**

- * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens

- * that points to substrings.

- */

-class Utf8BytesScanner extends ArrayBasedScanner {

- /**

- * The file content.

- *

- * The content is zero-terminated.

- */

- List<int> bytes;

- /**

- * Points to the offset of the last byte returned by [advance].

- *

- * After invoking [currentAsUnicode], the [byteOffset] points to the last

- * byte that is part of the (unicode or ASCII) character. That way, [advance]

- * can always increase the byte offset by 1.

- */

- int byteOffset = -1;

- /**

- * The getter [scanOffset] is expected to return the index where the current

- * character *starts*. In case of a non-ascii character, after invoking

- * [currentAsUnicode], the byte offset points to the *last* byte.

- *

- * This field keeps track of the number of bytes for the current unicode

- * character. For example, if bytes 7,8,9 encode one unicode character, the

- * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]

- * will be 2, so that [scanOffset] returns 7.

- */

- int scanSlack = 0;

- /**

- * Holds the [byteOffset] value for which the current [scanSlack] is valid.

- */

- int scanSlackOffset = -1;

- /**

- * Returns the byte offset of the first byte that belongs to the current

- * character.

- */

- int get scanOffset {

- if (byteOffset == scanSlackOffset) {

- return byteOffset - scanSlack;

- } else {

- return byteOffset;

- }

- /**

- * The difference between the number of bytes and the number of corresponding

- * string characters, up to the current [byteOffset].

- */

- int utf8Slack = 0;

- /**

- * Creates a new Utf8BytesScanner. The source file is expected to be a

- * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the

- * string text of the source file is decoded.

- *

- * The list of UTF-8 bytes [file.slowUtf8Bytes()] is expected to return an

- * array whose last element is '0' to signal the end of the file. If this

- * is not the case, the entire array is copied before scanning.

- */

- Utf8BytesScanner(SourceFile file, {bool includeComments: false})

- : bytes = file.slowUtf8ZeroTerminatedBytes(),

- super(file, includeComments) {

- assert(bytes.last == 0);

- // Skip a leading BOM.

- if (_containsBomAt(0)) byteOffset += 3;

- }

- /**

- * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.

- *

- * The last element of the list is expected to be '0' to signal the end of

- * the file. If this is not the case, the entire array is copied before

- * scanning.

- */

- Utf8BytesScanner.fromBytes(List<int> zeroTerminatedBytes,

- {bool includeComments: false})

- : this.bytes = zeroTerminatedBytes,

- super(null, includeComments) {

- assert(bytes.last == 0);

- }

- bool _containsBomAt(int offset) {

- const BOM_UTF8 = const [0xEF, 0xBB, 0xBF];

- return offset + 3 < bytes.length &&

- bytes[offset] == BOM_UTF8[0] &&

- bytes[offset + 1] == BOM_UTF8[1] &&

- bytes[offset + 2] == BOM_UTF8[2];

- }

- int advance() => bytes[++byteOffset];

- int peek() => bytes[byteOffset + 1];

- /**

- * Returns the unicode code point starting at the byte offset [startOffset]

- * with the byte [nextByte]. If [advance] is true the current [byteOffset]

- * is advanced to the last byte of the code point.

- */

- int nextCodePoint(int startOffset, int nextByte, bool advance) {

- // The number of 1s in the first byte indicate the number of bytes, at

- // least 2.

- int numBytes = 2;

- int bit = 0x20;

- while ((nextByte & bit) != 0) {

- numBytes++;

- bit >>= 1;

- }

- int end = startOffset + numBytes;

- if (advance) {

- byteOffset = end - 1;

- }

- // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a

- // _Utf8Decoder instance. Also the sublist is eagerly allocated.

- String codePoint = UTF8.decode(bytes.sublist(startOffset, end));

- if (codePoint.length == 0) {

- // The UTF-8 decoder discards leading BOM characters.

- // TODO(floitsch): don't just assume that removed characters were the

- // BOM.

- assert(_containsBomAt(startOffset));

- codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);

- }

- if (codePoint.length == 1) {

- if (advance) {

- utf8Slack += (numBytes - 1);

- scanSlack = numBytes - 1;

- scanSlackOffset = byteOffset;

- }

- return codePoint.codeUnitAt(0);

- } else if (codePoint.length == 2) {

- if (advance) {

- utf8Slack += (numBytes - 2);

- scanSlack = numBytes - 1;

- scanSlackOffset = byteOffset;

- stringOffsetSlackOffset = byteOffset;

- }

- // In case of a surrogate pair, return a single code point.

- return codePoint.runes.single;

- } else {

- throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";

- }

- int lastUnicodeOffset = -1;

- int currentAsUnicode(int next) {

- if (next < 128) return next;

- // Check if currentAsUnicode was already invoked.

- if (byteOffset == lastUnicodeOffset) return next;

- int res = nextCodePoint(byteOffset, next, true);

- lastUnicodeOffset = byteOffset;

- return res;

- }

- void handleUnicode(int startScanOffset) {

- int end = byteOffset;

- // TODO(lry): this measurably slows down the scanner for files with unicode.

- String s = UTF8.decode(bytes.sublist(startScanOffset, end));

- utf8Slack += (end - startScanOffset) - s.length;

- }

- /**

- * This field remembers the byte offset of the last character decoded with

- * [nextCodePoint] that used two code units in UTF-16.

- *

- * [nextCodePoint] returns a single code point for each unicode character,

- * even if it needs two code units in UTF-16.

- *

- * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in

- * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the

- * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should

- * return the offset of the first one, which is one position more left than

- * the [utf8Slack].

- */

- int stringOffsetSlackOffset = -1;

- int get stringOffset {

- if (stringOffsetSlackOffset == byteOffset) {

- return byteOffset - utf8Slack - 1;

- } else {

- return byteOffset - utf8Slack;

- }

- Token firstToken() => tokens.next;

- Token previousToken() => tail;

- void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,

- [int extraOffset = 0]) {

- tail.next = new StringToken.fromUtf8Bytes(

- info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);

- tail = tail.next;

- }

- bool atEndOfFile() => byteOffset >= bytes.length - 1;