| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | |
| 2 // for details. All rights reserved. Use of this source code is governed by a | |
| 3 // BSD-style license that can be found in the LICENSE file. | |
| 4 | |
| 5 library dart2js.scanner.utf8; | |
| 6 | |
| 7 import 'dart:convert' show UNICODE_BOM_CHARACTER_RUNE, UTF8; | |
| 8 | |
| 9 import '../io/source_file.dart' show SourceFile; | |
| 10 import '../tokens/precedence.dart' show PrecedenceInfo; | |
| 11 import '../tokens/token.dart' show StringToken, Token; | |
| 12 import 'array_based_scanner.dart' show ArrayBasedScanner; | |
| 13 | |
/**
 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
 * that point to substrings.
 */
class Utf8BytesScanner extends ArrayBasedScanner {
  /**
   * The file content.
   *
   * The content is zero-terminated.
   */
  List<int> bytes;

  /**
   * Points to the offset of the last byte returned by [advance].
   *
   * After invoking [currentAsUnicode], the [byteOffset] points to the last
   * byte that is part of the (unicode or ASCII) character. That way, [advance]
   * can always increase the byte offset by 1.
   */
  int byteOffset = -1;

  /**
   * The getter [scanOffset] is expected to return the index where the current
   * character *starts*. In case of a non-ascii character, after invoking
   * [currentAsUnicode], the byte offset points to the *last* byte.
   *
   * This field keeps track of the number of bytes for the current unicode
   * character. For example, if bytes 7,8,9 encode one unicode character, the
   * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
   * will be 2, so that [scanOffset] returns 7.
   */
  int scanSlack = 0;

  /**
   * Holds the [byteOffset] value for which the current [scanSlack] is valid.
   */
  int scanSlackOffset = -1;

  /**
   * Returns the byte offset of the first byte that belongs to the current
   * character.
   */
  int get scanOffset {
    // [scanSlack] only applies while [byteOffset] still sits on the last byte
    // of the multi-byte character it was computed for.
    if (byteOffset == scanSlackOffset) {
      return byteOffset - scanSlack;
    } else {
      return byteOffset;
    }
  }

  /**
   * The difference between the number of bytes and the number of corresponding
   * string characters, up to the current [byteOffset].
   */
  int utf8Slack = 0;

  /**
   * Creates a new Utf8BytesScanner that scans the bytes returned by
   * [file.slowUtf8ZeroTerminatedBytes()].
   *
   * The last element of that list must be '0' to signal the end of the file;
   * this is checked with an assert and the list is used as is (it is not
   * copied by this constructor).
   *
   * A leading UTF-8 byte order mark is skipped.
   */
  Utf8BytesScanner(SourceFile file, {bool includeComments: false})
      : bytes = file.slowUtf8ZeroTerminatedBytes(),
        super(file, includeComments) {
    assert(bytes.last == 0);
    // Skip a leading BOM.
    if (_containsBomAt(0)) byteOffset += 3;
  }

  /**
   * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
   *
   * The last element of [zeroTerminatedBytes] must be '0' to signal the end
   * of the file; this is checked with an assert and the list is used as is
   * (it is not copied by this constructor).
   *
   * A leading UTF-8 byte order mark is skipped, exactly as in the default
   * constructor, so that the same bytes scan identically no matter which
   * constructor is used.
   */
  Utf8BytesScanner.fromBytes(List<int> zeroTerminatedBytes,
      {bool includeComments: false})
      : this.bytes = zeroTerminatedBytes,
        super(null, includeComments) {
    assert(bytes.last == 0);
    // Skip a leading BOM.
    if (_containsBomAt(0)) byteOffset += 3;
  }

  /**
   * Returns true if the three bytes starting at [offset] are the UTF-8
   * encoding of the byte order mark (0xEF 0xBB 0xBF).
   */
  bool _containsBomAt(int offset) {
    const BOM_UTF8 = const [0xEF, 0xBB, 0xBF];

    // The strict `<` also demands one byte after the BOM. Since [bytes] ends
    // with a 0 terminator, a BOM can never occupy the last three positions,
    // so this does not reject any real BOM.
    return offset + 3 < bytes.length &&
        bytes[offset] == BOM_UTF8[0] &&
        bytes[offset + 1] == BOM_UTF8[1] &&
        bytes[offset + 2] == BOM_UTF8[2];
  }

  /** Moves [byteOffset] one byte forward and returns the byte found there. */
  int advance() => bytes[++byteOffset];

  /** Returns the byte after [byteOffset] without moving [byteOffset]. */
  int peek() => bytes[byteOffset + 1];

  /**
   * Returns the unicode code point starting at the byte offset [startOffset]
   * with the byte [nextByte]. If [advance] is true the current [byteOffset]
   * is advanced to the last byte of the code point, and [utf8Slack],
   * [scanSlack] and [scanSlackOffset] (plus [stringOffsetSlackOffset] for
   * code points that need two UTF-16 code units) are updated accordingly.
   *
   * Throws a string describing the bytes if the decoded text is longer than
   * two UTF-16 code units.
   */
  int nextCodePoint(int startOffset, int nextByte, bool advance) {
    // The number of 1s in the first byte indicate the number of bytes, at
    // least 2.
    int numBytes = 2;
    int bit = 0x20;
    while ((nextByte & bit) != 0) {
      numBytes++;
      bit >>= 1;
    }
    // NOTE(review): [nextByte] is not validated as a UTF-8 lead byte. A
    // malformed value can make [end] exceed `bytes.length`, in which case
    // `sublist` below fails with a range error rather than the
    // "Invalid UTF-8 byte sequence" message.
    int end = startOffset + numBytes;
    if (advance) {
      byteOffset = end - 1;
    }
    // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
    // _Utf8Decoder instance. Also the sublist is eagerly allocated.
    String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
    if (codePoint.length == 0) {
      // The UTF-8 decoder discards leading BOM characters.
      // TODO(floitsch): don't just assume that removed characters were the
      // BOM.
      assert(_containsBomAt(startOffset));
      codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
    }
    if (codePoint.length == 1) {
      if (advance) {
        // [numBytes] bytes became a single string character.
        utf8Slack += (numBytes - 1);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
      }
      return codePoint.codeUnitAt(0);
    } else if (codePoint.length == 2) {
      if (advance) {
        // [numBytes] bytes became two string characters (a surrogate pair).
        utf8Slack += (numBytes - 2);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
        // Lets [stringOffset] compensate for the extra code unit.
        stringOffsetSlackOffset = byteOffset;
      }
      // In case of a surrogate pair, return a single code point.
      return codePoint.runes.single;
    } else {
      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
    }
  }

  /**
   * The [byteOffset] at which [currentAsUnicode] last finished decoding a
   * non-ASCII character, or -1 if it has not decoded one yet.
   */
  int lastUnicodeOffset = -1;

  /**
   * Returns the unicode code point of the current character.
   *
   * [next] is returned unchanged if it is an ASCII value (below 128), or if
   * [byteOffset] still equals [lastUnicodeOffset], i.e. the current character
   * was already decoded by an earlier call. Otherwise the code point starting
   * at [byteOffset] with the byte [next] is decoded with [nextCodePoint],
   * which moves [byteOffset] to the last byte of that code point.
   */
  int currentAsUnicode(int next) {
    if (next < 128) return next;
    // Check if currentAsUnicode was already invoked.
    if (byteOffset == lastUnicodeOffset) return next;
    int res = nextCodePoint(byteOffset, next, true);
    lastUnicodeOffset = byteOffset;
    return res;
  }

  /**
   * Accounts for the non-ASCII content between [startScanOffset] (inclusive)
   * and the current [byteOffset] (exclusive): the bytes are decoded and the
   * difference between their count and the length of the decoded string is
   * added to [utf8Slack].
   */
  void handleUnicode(int startScanOffset) {
    int end = byteOffset;
    // TODO(lry): this measurably slows down the scanner for files with unicode.
    String s = UTF8.decode(bytes.sublist(startScanOffset, end));
    utf8Slack += (end - startScanOffset) - s.length;
  }

  /**
   * This field remembers the byte offset of the last character decoded with
   * [nextCodePoint] that used two code units in UTF-16.
   *
   * [nextCodePoint] returns a single code point for each unicode character,
   * even if it needs two code units in UTF-16.
   *
   * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
   * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the
   * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should
   * return the offset of the first of the two code units, which is one
   * position further left than `byteOffset - utf8Slack`.
   */
  int stringOffsetSlackOffset = -1;

  /**
   * Returns [byteOffset] minus [utf8Slack], reduced by one more while
   * [byteOffset] equals [stringOffsetSlackOffset] (see there for why).
   */
  int get stringOffset {
    if (stringOffsetSlackOffset == byteOffset) {
      return byteOffset - utf8Slack - 1;
    } else {
      return byteOffset - utf8Slack;
    }
  }

  /** Returns the token that follows [tokens]. */
  Token firstToken() => tokens.next;

  /** Returns [tail], the token most recently appended. */
  Token previousToken() => tail;

  /**
   * Appends a [StringToken] created from the bytes between [start] and
   * `byteOffset + extraOffset` to the token stream and makes it the new
   * [tail].
   *
   * [info] and [asciiOnly] are passed through to
   * [StringToken.fromUtf8Bytes] unchanged.
   */
  void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
      [int extraOffset = 0]) {
    tail.next = new StringToken.fromUtf8Bytes(
        info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
    tail = tail.next;
  }

  /**
   * Returns true once [byteOffset] has reached (or passed) the last element
   * of [bytes], which is the zero terminator.
   */
  bool atEndOfFile() => byteOffset >= bytes.length - 1;
}
| OLD | NEW |