| Index: pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart
 | 
| diff --git a/pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart b/pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart
 | 
| deleted file mode 100644
 | 
| index ea46e35457c8ed133b82c176a63d0a9b841a4f5f..0000000000000000000000000000000000000000
 | 
| --- a/pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart
 | 
| +++ /dev/null
 | 
| @@ -1,214 +0,0 @@
 | 
| -// Copyright (c) 2013, the Dart project authors.  Please see the AUTHORS file
 | 
| -// for details. All rights reserved. Use of this source code is governed by a
 | 
| -// BSD-style license that can be found in the LICENSE file.
 | 
| -
 | 
| -library dart2js.scanner.utf8;
 | 
| -
 | 
| -import 'dart:convert' show UNICODE_BOM_CHARACTER_RUNE, UTF8;
 | 
| -
 | 
| -import '../io/source_file.dart' show SourceFile;
 | 
| -import '../tokens/precedence.dart' show PrecedenceInfo;
 | 
| -import '../tokens/token.dart' show StringToken, Token;
 | 
| -import 'array_based_scanner.dart' show ArrayBasedScanner;
 | 
| -
 | 
| -/**
 | 
| - * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
 | 
| - * that points to substrings.
 | 
| - */
 | 
| -class Utf8BytesScanner extends ArrayBasedScanner {
 | 
| -  /**
 | 
| -   * The file content.
 | 
| -   *
 | 
| -   * The content is zero-terminated.
 | 
| -   */
 | 
| -  List<int> bytes;
 | 
| -
 | 
| -  /**
 | 
| -   * Points to the offset of the last byte returned by [advance].
 | 
| -   *
 | 
| -   * After invoking [currentAsUnicode], the [byteOffset] points to the last
 | 
| -   * byte that is part of the (unicode or ASCII) character. That way, [advance]
 | 
| -   * can always increase the byte offset by 1.
 | 
| -   */
 | 
| -  int byteOffset = -1;
 | 
| -
 | 
| -  /**
 | 
| -   * The getter [scanOffset] is expected to return the index where the current
 | 
| -   * character *starts*. In case of a non-ascii character, after invoking
 | 
| -   * [currentAsUnicode], the byte offset points to the *last* byte.
 | 
| -   *
 | 
| -   * This field keeps track of the number of bytes for the current unicode
 | 
| -   * character. For example, if bytes 7,8,9 encode one unicode character, the
 | 
| -   * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
 | 
| -   * will be 2, so that [scanOffset] returns 7.
 | 
| -   */
 | 
| -  int scanSlack = 0;
 | 
| -
 | 
| -  /**
 | 
| -   * Holds the [byteOffset] value for which the current [scanSlack] is valid.
 | 
| -   */
 | 
| -  int scanSlackOffset = -1;
 | 
| -
 | 
| -  /**
 | 
| -   * Returns the byte offset of the first byte that belongs to the current
 | 
| -   * character.
 | 
| -   */
 | 
| -  int get scanOffset {
 | 
| -    if (byteOffset == scanSlackOffset) {
 | 
| -      return byteOffset - scanSlack;
 | 
| -    } else {
 | 
| -      return byteOffset;
 | 
| -    }
 | 
| -  }
 | 
| -
 | 
| -  /**
 | 
| -   * The difference between the number of bytes and the number of corresponding
 | 
| -   * string characters, up to the current [byteOffset].
 | 
| -   */
 | 
| -  int utf8Slack = 0;
 | 
| -
 | 
| -  /**
 | 
| -   * Creates a new Utf8BytesScanner. The source file is expected to be a
 | 
| -   * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the
 | 
| -   * string text of the source file is decoded.
 | 
| -   *
 | 
| -   * The list of UTF-8 bytes [file.slowUtf8Bytes()] is expected to return an
 | 
| -   * array whose last element is '0' to signal the end of the file. If this
 | 
| -   * is not the case, the entire array is copied before scanning.
 | 
| -   */
 | 
| -  Utf8BytesScanner(SourceFile file, {bool includeComments: false})
 | 
| -      : bytes = file.slowUtf8ZeroTerminatedBytes(),
 | 
| -        super(file, includeComments) {
 | 
| -    assert(bytes.last == 0);
 | 
| -    // Skip a leading BOM.
 | 
| -    if (_containsBomAt(0)) byteOffset += 3;
 | 
| -  }
 | 
| -
 | 
| -  /**
 | 
| -   * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
 | 
| -   *
 | 
| -   * The last element of the list is expected to be '0' to signal the end of
 | 
| -   * the file. If this is not the case, the entire array is copied before
 | 
| -   * scanning.
 | 
| -   */
 | 
| -  Utf8BytesScanner.fromBytes(List<int> zeroTerminatedBytes,
 | 
| -      {bool includeComments: false})
 | 
| -      : this.bytes = zeroTerminatedBytes,
 | 
| -        super(null, includeComments) {
 | 
| -    assert(bytes.last == 0);
 | 
| -  }
 | 
| -
 | 
| -  bool _containsBomAt(int offset) {
 | 
| -    const BOM_UTF8 = const [0xEF, 0xBB, 0xBF];
 | 
| -
 | 
| -    return offset + 3 < bytes.length &&
 | 
| -        bytes[offset] == BOM_UTF8[0] &&
 | 
| -        bytes[offset + 1] == BOM_UTF8[1] &&
 | 
| -        bytes[offset + 2] == BOM_UTF8[2];
 | 
| -  }
 | 
| -
 | 
| -  int advance() => bytes[++byteOffset];
 | 
| -
 | 
| -  int peek() => bytes[byteOffset + 1];
 | 
| -
 | 
| -  /**
 | 
| -   * Returns the unicode code point starting at the byte offset [startOffset]
 | 
| -   * with the byte [nextByte]. If [advance] is true the current [byteOffset]
 | 
| -   * is advanced to the last byte of the code point.
 | 
| -   */
 | 
| -  int nextCodePoint(int startOffset, int nextByte, bool advance) {
 | 
| -    // The number of 1s in the first byte indicate the number of bytes, at
 | 
| -    // least 2.
 | 
| -    int numBytes = 2;
 | 
| -    int bit = 0x20;
 | 
| -    while ((nextByte & bit) != 0) {
 | 
| -      numBytes++;
 | 
| -      bit >>= 1;
 | 
| -    }
 | 
| -    int end = startOffset + numBytes;
 | 
| -    if (advance) {
 | 
| -      byteOffset = end - 1;
 | 
| -    }
 | 
| -    // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
 | 
| -    // _Utf8Decoder instance. Also the sublist is eagerly allocated.
 | 
| -    String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
 | 
| -    if (codePoint.length == 0) {
 | 
| -      // The UTF-8 decoder discards leading BOM characters.
 | 
| -      // TODO(floitsch): don't just assume that removed characters were the
 | 
| -      // BOM.
 | 
| -      assert(_containsBomAt(startOffset));
 | 
| -      codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
 | 
| -    }
 | 
| -    if (codePoint.length == 1) {
 | 
| -      if (advance) {
 | 
| -        utf8Slack += (numBytes - 1);
 | 
| -        scanSlack = numBytes - 1;
 | 
| -        scanSlackOffset = byteOffset;
 | 
| -      }
 | 
| -      return codePoint.codeUnitAt(0);
 | 
| -    } else if (codePoint.length == 2) {
 | 
| -      if (advance) {
 | 
| -        utf8Slack += (numBytes - 2);
 | 
| -        scanSlack = numBytes - 1;
 | 
| -        scanSlackOffset = byteOffset;
 | 
| -        stringOffsetSlackOffset = byteOffset;
 | 
| -      }
 | 
| -      // In case of a surrogate pair, return a single code point.
 | 
| -      return codePoint.runes.single;
 | 
| -    } else {
 | 
| -      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
 | 
| -    }
 | 
| -  }
 | 
| -
 | 
| -  int lastUnicodeOffset = -1;
 | 
| -  int currentAsUnicode(int next) {
 | 
| -    if (next < 128) return next;
 | 
| -    // Check if currentAsUnicode was already invoked.
 | 
| -    if (byteOffset == lastUnicodeOffset) return next;
 | 
| -    int res = nextCodePoint(byteOffset, next, true);
 | 
| -    lastUnicodeOffset = byteOffset;
 | 
| -    return res;
 | 
| -  }
 | 
| -
 | 
| -  void handleUnicode(int startScanOffset) {
 | 
| -    int end = byteOffset;
 | 
| -    // TODO(lry): this measurably slows down the scanner for files with unicode.
 | 
| -    String s = UTF8.decode(bytes.sublist(startScanOffset, end));
 | 
| -    utf8Slack += (end - startScanOffset) - s.length;
 | 
| -  }
 | 
| -
 | 
| -  /**
 | 
| -   * This field remembers the byte offset of the last character decoded with
 | 
| -   * [nextCodePoint] that used two code units in UTF-16.
 | 
| -   *
 | 
| -   * [nextCodePoint] returns a single code point for each unicode character,
 | 
| -   * even if it needs two code units in UTF-16.
 | 
| -   *
 | 
| -   * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
 | 
| -   * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the
 | 
| -   * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should
 | 
| -   * return the offset of the first one, which is one position more left than
 | 
| -   * the [utf8Slack].
 | 
| -   */
 | 
| -  int stringOffsetSlackOffset = -1;
 | 
| -
 | 
| -  int get stringOffset {
 | 
| -    if (stringOffsetSlackOffset == byteOffset) {
 | 
| -      return byteOffset - utf8Slack - 1;
 | 
| -    } else {
 | 
| -      return byteOffset - utf8Slack;
 | 
| -    }
 | 
| -  }
 | 
| -
 | 
| -  Token firstToken() => tokens.next;
 | 
| -  Token previousToken() => tail;
 | 
| -
 | 
| -  void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
 | 
| -      [int extraOffset = 0]) {
 | 
| -    tail.next = new StringToken.fromUtf8Bytes(
 | 
| -        info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
 | 
| -    tail = tail.next;
 | 
| -  }
 | 
| -
 | 
| -  bool atEndOfFile() => byteOffset >= bytes.length - 1;
 | 
| -}
 | 
| 
 |