Index: pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart
diff --git a/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart b/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart
new file mode 100644
index 0000000000000000000000000000000000000000..ea46e35457c8ed133b82c176a63d0a9b841a4f5f
--- /dev/null
+++ b/pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart
@@ -0,0 +1,214 @@
+// Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+library dart2js.scanner.utf8;
+
+import 'dart:convert' show UNICODE_BOM_CHARACTER_RUNE, UTF8;
+
+import '../io/source_file.dart' show SourceFile;
+import '../tokens/precedence.dart' show PrecedenceInfo;
+import '../tokens/token.dart' show StringToken, Token;
+import 'array_based_scanner.dart' show ArrayBasedScanner;
+
+/**
+ * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
+ * that point to substrings.
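+ *
+ * A minimal usage sketch, assuming the `tokenize()` entry point inherited
+ * from the abstract scanner (not defined in this file), and a hypothetical
+ * zero-terminated UTF-8 byte list `utf8Bytes`:
+ *
+ *     var scanner = new Utf8BytesScanner.fromBytes(utf8Bytes);
+ *     Token tokens = scanner.tokenize();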
+ */
+class Utf8BytesScanner extends ArrayBasedScanner {
+  /**
+   * The file content.
+   *
+   * The content is zero-terminated.
+   */
+  List<int> bytes;
+
+  /**
+   * Points to the offset of the last byte returned by [advance].
+   *
+   * After invoking [currentAsUnicode], the [byteOffset] points to the last
+   * byte that is part of the (unicode or ASCII) character. That way, [advance]
+   * can always increase the byte offset by 1.
+   */
+  int byteOffset = -1;
+
+  /**
+   * The getter [scanOffset] is expected to return the index where the current
+   * character *starts*. In case of a non-ASCII character, after invoking
+   * [currentAsUnicode], the byte offset points to the *last* byte.
+   *
+   * This field keeps track of the number of bytes for the current unicode
+   * character. For example, if bytes 7,8,9 encode one unicode character, the
+   * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
+   * will be 2, so that [scanOffset] returns 7.
+   */
+  int scanSlack = 0;
+
+  /**
+   * Holds the [byteOffset] value for which the current [scanSlack] is valid.
+   */
+  int scanSlackOffset = -1;
+
+  /**
+   * Returns the byte offset of the first byte that belongs to the current
+   * character.
+   */
+  int get scanOffset {
+    if (byteOffset == scanSlackOffset) {
+      return byteOffset - scanSlack;
+    } else {
+      return byteOffset;
+    }
+  }
+
+  /**
+   * The difference between the number of bytes and the number of corresponding
+   * string characters, up to the current [byteOffset].
+   */
+  int utf8Slack = 0;
+
+  /**
+   * Creates a new Utf8BytesScanner. The source file is expected to be a
+   * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the
+   * string text of the source file is decoded.
+   *
+   * The list of UTF-8 bytes returned by [file.slowUtf8ZeroTerminatedBytes()]
+   * is expected to have '0' as its last element to signal the end of the
+   * file. If this is not the case, the entire array is copied before
+   * scanning.
+   */
+  Utf8BytesScanner(SourceFile file, {bool includeComments: false})
+      : bytes = file.slowUtf8ZeroTerminatedBytes(),
+        super(file, includeComments) {
+    assert(bytes.last == 0);
+    // Skip a leading BOM.
+    if (_containsBomAt(0)) byteOffset += 3;
+  }
+
+  /**
+   * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
+   *
+   * The last element of the list must be '0' to signal the end of the file;
+   * this constructor asserts that this is the case and does not copy the
+   * list, so callers must pass a zero-terminated list.
+   */
+  Utf8BytesScanner.fromBytes(List<int> zeroTerminatedBytes,
+      {bool includeComments: false})
+      : this.bytes = zeroTerminatedBytes,
+        super(null, includeComments) {
+    assert(bytes.last == 0);
+  }
+
+  bool _containsBomAt(int offset) {
+    const BOM_UTF8 = const [0xEF, 0xBB, 0xBF];
+
+    return offset + 3 < bytes.length &&
+        bytes[offset] == BOM_UTF8[0] &&
+        bytes[offset + 1] == BOM_UTF8[1] &&
+        bytes[offset + 2] == BOM_UTF8[2];
+  }
+
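+  /**
+   * Advances [byteOffset] by one and returns the byte at the new offset. At
+   * the end of the file this returns the terminating '0' byte.
+   */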
+  int advance() => bytes[++byteOffset];
+
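+  /**
+   * Returns the byte after the current [byteOffset] without advancing.
+   */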
+  int peek() => bytes[byteOffset + 1];
+
+  /**
+   * Returns the unicode code point starting at the byte offset [startOffset]
+   * with the byte [nextByte]. If [advance] is true, the current [byteOffset]
+   * is advanced to the last byte of the code point.
+   */
+  int nextCodePoint(int startOffset, int nextByte, bool advance) {
+    // The number of 1s in the first byte indicates the number of bytes in
+    // the sequence, at least 2.
+    int numBytes = 2;
+    int bit = 0x20;
+    while ((nextByte & bit) != 0) {
+      numBytes++;
+      bit >>= 1;
+    }
+    int end = startOffset + numBytes;
+    if (advance) {
+      byteOffset = end - 1;
+    }
+    // TODO(lry): measurably slow, decode first creates a Utf8Decoder and a
+    // _Utf8Decoder instance. Also the sublist is eagerly allocated.
+    String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
+    if (codePoint.length == 0) {
+      // The UTF-8 decoder discards leading BOM characters.
+      // TODO(floitsch): don't just assume that removed characters were the
+      // BOM.
+      assert(_containsBomAt(startOffset));
+      codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
+    }
+    if (codePoint.length == 1) {
+      if (advance) {
+        utf8Slack += (numBytes - 1);
+        scanSlack = numBytes - 1;
+        scanSlackOffset = byteOffset;
+      }
+      return codePoint.codeUnitAt(0);
+    } else if (codePoint.length == 2) {
+      if (advance) {
+        utf8Slack += (numBytes - 2);
+        scanSlack = numBytes - 1;
+        scanSlackOffset = byteOffset;
+        stringOffsetSlackOffset = byteOffset;
+      }
+      // In case of a surrogate pair, return a single code point.
+      return codePoint.runes.single;
+    } else {
+      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
+    }
+  }
+
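+  /**
+   * The [byteOffset] value after [currentAsUnicode] last decoded a non-ASCII
+   * character, used to avoid decoding the same character twice.
+   */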
+  int lastUnicodeOffset = -1;
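+
+  /**
+   * Returns the unicode code point for the character whose first byte is
+   * [next]. For a non-ASCII character this decodes the multi-byte sequence
+   * starting at [byteOffset] and advances [byteOffset] to its last byte.
+   */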
+  int currentAsUnicode(int next) {
+    if (next < 128) return next;
+    // Check if currentAsUnicode was already invoked.
+    if (byteOffset == lastUnicodeOffset) return next;
+    int res = nextCodePoint(byteOffset, next, true);
+    lastUnicodeOffset = byteOffset;
+    return res;
+  }
+
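+  /**
+   * Updates [utf8Slack] for the multi-byte characters between
+   * [startScanOffset] and the current [byteOffset], so that [stringOffset]
+   * stays in sync with UTF-16 string offsets.
+   */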
+  void handleUnicode(int startScanOffset) {
+    int end = byteOffset;
+    // TODO(lry): this measurably slows down the scanner for files with unicode.
+    String s = UTF8.decode(bytes.sublist(startScanOffset, end));
+    utf8Slack += (end - startScanOffset) - s.length;
+  }
+
+  /**
+   * This field remembers the byte offset of the last character decoded with
+   * [nextCodePoint] that used two code units in UTF-16.
+   *
+   * [nextCodePoint] returns a single code point for each unicode character,
+   * even if it needs two code units in UTF-16.
+   *
+   * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
+   * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint],
+   * the [byteOffset] points to the last (of 4) bytes. The [stringOffset]
+   * should return the offset of the first of the two code units, which is one
+   * position further left than [byteOffset] - [utf8Slack] (that expression
+   * points to the second code unit).
+   */
+  int stringOffsetSlackOffset = -1;
+
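+  /**
+   * The offset, in UTF-16 code units, that corresponds to the current
+   * [byteOffset]. If the current [byteOffset] points to the last byte of a
+   * character that needs two code units, this returns the offset of the
+   * first of the two.
+   */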
+  int get stringOffset {
+    if (stringOffsetSlackOffset == byteOffset) {
+      return byteOffset - utf8Slack - 1;
+    } else {
+      return byteOffset - utf8Slack;
+    }
+  }
+
+  Token firstToken() => tokens.next;
+  Token previousToken() => tail;
+
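+  /**
+   * Appends a [StringToken] created with [StringToken.fromUtf8Bytes] from the
+   * bytes between [start] and [byteOffset] + [extraOffset].
+   */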
+  void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
+      [int extraOffset = 0]) {
+    tail.next = new StringToken.fromUtf8Bytes(
+        info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
+    tail = tail.next;
+  }
+
+  bool atEndOfFile() => byteOffset >= bytes.length - 1;
+}