OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. |
| 4 |
| 5 part of scanner; |
| 6 |
/**
 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
 * that point to substrings.
 */
class Utf8BytesScanner extends AbstractScanner {
  /** The file content. */
  List<int> bytes;

  /**
   * Points to the offset of the byte last returned by [advance].
   *
   * After invoking [currentAsUnicode], the [byteOffset] points to the last
   * byte that is part of the (unicode or ASCII) character. That way, [advance]
   * can always increase the byte offset by 1.
   */
  int byteOffset = -1;

  /**
   * The getter [scanOffset] is expected to return the index where the current
   * character *starts*. In case of a non-ascii character, after invoking
   * [currentAsUnicode], the byte offset points to the *last* byte.
   *
   * This field keeps track of the number of bytes for the current unicode
   * character. For example, if bytes 7,8,9 encode one unicode character, the
   * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
   * will be 2, so that [scanOffset] returns 7.
   */
  int scanSlack = 0;

  /**
   * Holds the [byteOffset] value for which the current [scanSlack] is valid.
   */
  int scanSlackOffset = -1;

  /**
   * Returns the byte offset of the first byte that belongs to the current
   * character.
   */
  int get scanOffset {
    if (byteOffset == scanSlackOffset) {
      return byteOffset - scanSlack;
    } else {
      return byteOffset;
    }
  }

  /**
   * The difference between the number of bytes and the number of corresponding
   * string characters, up to the current [byteOffset].
   */
  int utf8Slack = 0;

  /**
   * Creates a new Utf8BytesScanner. The source file is expected to be a
   * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the
   * string text of the source file is decoded.
   *
   * The list of UTF-8 bytes [file.slowUtf8Bytes()] is expected to return an
   * array whose last element is '0' to signal the end of the file. If this
   * is not the case, the entire array is copied before scanning.
   */
  Utf8BytesScanner(SourceFile file, {bool includeComments: false})
      : bytes = file.slowUtf8Bytes(),
        super(file, includeComments) {
    ensureZeroTermination();
  }

  /**
   * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
   *
   * The last element of the list is expected to be '0' to signal the end of
   * the file. If this is not the case, the entire array is copied before
   * scanning.
   */
  Utf8BytesScanner.fromBytes(this.bytes, {bool includeComments: false})
      : super(null, includeComments) {
    ensureZeroTermination();
  }

  /**
   * Makes sure that [bytes] ends with a 0 byte.
   *
   * If [bytes] is empty or its last element is not 0, [bytes] is replaced by
   * a new [Uint8List] that holds the same content followed by a single 0.
   * Otherwise [bytes] is left untouched.
   */
  void ensureZeroTermination() {
    if (bytes.isEmpty || bytes[bytes.length - 1] != 0) {
      // TODO(lry), abort instead of copying the array, or warn?
      var newBytes = new Uint8List(bytes.length + 1);
      // A Uint8List has a fixed length, so length-changing operations such as
      // addAll are not supported on it. Copy the content into place instead.
      newBytes.setRange(0, bytes.length, bytes);
      newBytes[bytes.length] = 0;
      bytes = newBytes;
    }
  }

  /**
   * Increments [byteOffset] by one and returns the byte at the new offset.
   */
  int advance() => bytes[++byteOffset];

  /**
   * Returns the byte that follows the current [byteOffset] without changing
   * [byteOffset].
   */
  int peek() => bytes[byteOffset + 1];

  /**
   * Returns the unicode code point starting at the byte offset [startOffset]
   * with the byte [nextByte]. If [advance] is true the current [byteOffset]
   * is advanced to the last byte of the code point.
   *
   * When [advance] is true, [utf8Slack], [scanSlack] and [scanSlackOffset]
   * (and, for characters that need two UTF-16 code units,
   * [stringOffsetSlackOffset]) are updated as well.
   *
   * Throws a [String] describing the bytes if they decode to something other
   * than one or two UTF-16 code units.
   */
  int nextCodePoint(int startOffset, int nextByte, bool advance) {
    // The number of 1s in the first byte indicate the number of bytes, at
    // least 2.
    int numBytes = 2;
    // Start at the third-highest bit; every further leading 1 bit adds a byte.
    int bit = 0x20;
    while ((nextByte & bit) != 0) {
      numBytes++;
      bit >>= 1;
    }
    int end = startOffset + numBytes;
    if (advance) {
      byteOffset = end - 1;
    }
    // TODO(lry), measurably slow, decode creates first a Utf8Decoder and a
    // _Utf8Decoder instance. Also the sublist is eagerly allocated.
    String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
    if (codePoint.length == 1) {
      if (advance) {
        // One string character was produced from numBytes bytes.
        utf8Slack += (numBytes - 1);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
      }
      return codePoint.codeUnitAt(0);
    } else if (codePoint.length == 2) {
      if (advance) {
        // Two string characters were produced from numBytes bytes.
        utf8Slack += (numBytes - 2);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
        stringOffsetSlackOffset = byteOffset;
      }
      // In case of a surrogate pair, return a single code point.
      return codePoint.runes.single;
    } else {
      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
    }
  }

  /**
   * The [byteOffset] value right after the most recent decoding performed by
   * [currentAsUnicode]. Used to detect repeated invocations for the same
   * character.
   */
  int lastUnicodeOffset = -1;

  /**
   * Returns the character for the byte [next] found at the current
   * [byteOffset].
   *
   * Values below 128 are returned unchanged. If [currentAsUnicode] already
   * decoded the character at the current [byteOffset], [next] is returned
   * unchanged as well. Otherwise the code point is decoded with
   * [nextCodePoint], which advances [byteOffset] to the last byte of the
   * character.
   */
  int currentAsUnicode(int next) {
    if (next < 128) return next;
    // Check if currentAsUnicode was already invoked.
    if (byteOffset == lastUnicodeOffset) return next;
    int res = nextCodePoint(byteOffset, next, true);
    lastUnicodeOffset = byteOffset;
    return res;
  }

  /**
   * Decodes the bytes from [startScanOffset] (inclusive) up to the current
   * [byteOffset] (exclusive) and adds the difference between the number of
   * bytes and the length of the decoded string to [utf8Slack].
   */
  void handleUnicode(int startScanOffset) {
    int end = byteOffset;
    // TODO(lry), this measurably slows down the scanner for files with unicode.
    String s = UTF8.decode(bytes.sublist(startScanOffset, end));
    utf8Slack += (end - startScanOffset) - s.length;
  }

  /**
   * This field remembers the byte offset of the last character decoded with
   * [nextCodePoint] that used two code units in UTF-16.
   *
   * [nextCodePoint] returns a single code point for each unicode character,
   * even if it needs two code units in UTF-16.
   *
   * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
   * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the
   * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should
   * return the offset of the first of the two code units, which is one
   * position further left than `byteOffset - utf8Slack`.
   */
  int stringOffsetSlackOffset = -1;

  /**
   * Returns [byteOffset] reduced by [utf8Slack], and by one more when
   * [byteOffset] equals [stringOffsetSlackOffset].
   */
  int get stringOffset {
    if (stringOffsetSlackOffset == byteOffset) {
      return byteOffset - utf8Slack - 1;
    } else {
      return byteOffset - utf8Slack;
    }
  }

  /** Returns the token that follows [tokens]. */
  Token firstToken() => tokens.next;

  /** Returns [tail], the token at the end of the token chain built so far. */
  Token previousToken() => tail;

  /**
   * Creates a [StringToken] with [StringToken.fromUtf8Bytes] from [bytes],
   * passing [start] and `byteOffset + extraOffset` as the range together with
   * [info], [asciiOnly] and [tokenStart], and links it after [tail]. The new
   * token becomes the new [tail].
   */
  void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
                            [int extraOffset = 0]) {
    tail.next = new StringToken.fromUtf8Bytes(
        info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
    tail = tail.next;
  }
}
OLD | NEW |