OLD | NEW |
| (Empty) |
// Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
4 | |
5 part of scanner; | |
6 | |
/**
 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
 * that point to substrings.
 */
class Utf8BytesScanner extends ArrayBasedScanner {
  /**
   * The file content as UTF-8 bytes.
   *
   * Both constructors call [ensureZeroTermination], so after construction the
   * last element is always 0, which signals the end of the file.
   */
  List<int> bytes;

  /**
   * Points to the offset of the last byte returned by [advance].
   *
   * After invoking [currentAsUnicode], the [byteOffset] points to the last
   * byte that is part of the (unicode or ASCII) character. That way, [advance]
   * can always increase the byte offset by 1.
   */
  int byteOffset = -1;

  /**
   * The getter [scanOffset] is expected to return the index where the current
   * character *starts*. In case of a non-ascii character, after invoking
   * [currentAsUnicode], the byte offset points to the *last* byte.
   *
   * This field keeps track of the number of bytes for the current unicode
   * character. For example, if bytes 7,8,9 encode one unicode character, the
   * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
   * will be 2, so that [scanOffset] returns 7.
   */
  int scanSlack = 0;

  /**
   * Holds the [byteOffset] value for which the current [scanSlack] is valid.
   */
  int scanSlackOffset = -1;

  /**
   * Returns the byte offset of the first byte that belongs to the current
   * character.
   */
  int get scanOffset {
    // [scanSlack] only applies while [byteOffset] still sits on the last byte
    // of the character it was computed for.
    if (byteOffset == scanSlackOffset) {
      return byteOffset - scanSlack;
    } else {
      return byteOffset;
    }
  }

  /**
   * The difference between the number of bytes and the number of corresponding
   * string characters, up to the current [byteOffset].
   */
  int utf8Slack = 0;

  /**
   * Creates a new Utf8BytesScanner. The source file is expected to be a
   * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the
   * string text of the source file is decoded.
   *
   * The list returned by [file.slowUtf8Bytes()] is expected to have '0' as its
   * last element to signal the end of the file. If this is not the case, the
   * entire array is copied before scanning.
   *
   * A leading UTF-8 byte order mark is skipped.
   */
  Utf8BytesScanner(SourceFile file, {bool includeComments: false})
      : bytes = file.slowUtf8Bytes(),
        super(file, includeComments) {
    ensureZeroTermination();
    // Skip a leading BOM.
    if (_containsBomAt(0)) byteOffset += 3;
  }

  /**
   * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
   *
   * The last element of the list is expected to be '0' to signal the end of
   * the file. If this is not the case, the entire array is copied before
   * scanning.
   *
   * Unlike the default constructor, this one does not skip a leading BOM.
   */
  Utf8BytesScanner.fromBytes(this.bytes, {bool includeComments: false})
      : super(null, includeComments) {
    ensureZeroTermination();
  }

  /**
   * Guarantees that [bytes] ends with a 0 byte.
   *
   * If [bytes] is empty or its last element is not 0, [bytes] is replaced by
   * a new [Uint8List] holding the same content followed by a single 0.
   * Otherwise [bytes] is left untouched.
   */
  void ensureZeroTermination() {
    if (bytes.isEmpty || bytes[bytes.length - 1] != 0) {
      // TODO(lry): abort instead of copying the array, or warn?
      var newBytes = new Uint8List(bytes.length + 1);
      newBytes.setRange(0, bytes.length, bytes);
      newBytes[bytes.length] = 0;
      bytes = newBytes;
    }
  }

  /**
   * Returns true if the three bytes starting at [offset] are the UTF-8 byte
   * order mark (0xEF 0xBB 0xBF) and at least one more byte follows them.
   */
  bool _containsBomAt(int offset) {
    const BOM_UTF8 = const [0xEF, 0xBB, 0xBF];

    return offset + 3 < bytes.length &&
        bytes[offset] == BOM_UTF8[0] &&
        bytes[offset + 1] == BOM_UTF8[1] &&
        bytes[offset + 2] == BOM_UTF8[2];
  }

  /** Moves [byteOffset] forward by one and returns the byte at that offset. */
  int advance() => bytes[++byteOffset];

  /** Returns the byte following [byteOffset] without moving [byteOffset]. */
  int peek() => bytes[byteOffset + 1];

  /**
   * Returns the unicode code point starting at the byte offset [startOffset]
   * with the byte [nextByte]. If [advance] is true the current [byteOffset]
   * is advanced to the last byte of the code point, and [utf8Slack],
   * [scanSlack] and [scanSlackOffset] are updated accordingly.
   *
   * The bytes are decoded with [UTF8.decode]; malformed input, including a
   * sequence that is cut off by the end of [bytes], is reported by the
   * decoder.
   */
  int nextCodePoint(int startOffset, int nextByte, bool advance) {
    // The number of 1s in the first byte indicates the number of bytes, at
    // least 2.
    int numBytes = 2;
    int bit = 0x20;
    while ((nextByte & bit) != 0) {
      numBytes++;
      bit >>= 1;
    }
    int end = startOffset + numBytes;
    // The lead byte may announce more bytes than the buffer holds (truncated
    // or malformed input). Clamp so that [sublist] does not throw a
    // RangeError and [byteOffset] stays inside the buffer; the decoder then
    // rejects the incomplete sequence like any other malformed input.
    if (end > bytes.length) end = bytes.length;
    if (advance) {
      byteOffset = end - 1;
    }
    // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
    // _Utf8Decoder instance. Also the sublist is eagerly allocated.
    List<int> encoded = bytes.sublist(startOffset, end);
    String codePoint = UTF8.decode(encoded);
    if (codePoint.length == 0) {
      // The UTF-8 decoder discards leading BOM characters.
      // TODO(floitsch): don't just assume that removed characters were the
      // BOM.
      assert(_containsBomAt(startOffset));
      codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
    }
    if (codePoint.length == 1) {
      if (advance) {
        // numBytes bytes became one UTF-16 code unit.
        utf8Slack += (numBytes - 1);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
      }
      return codePoint.codeUnitAt(0);
    } else if (codePoint.length == 2) {
      if (advance) {
        // numBytes bytes became two UTF-16 code units (a surrogate pair).
        utf8Slack += (numBytes - 2);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
        stringOffsetSlackOffset = byteOffset;
      }
      // In case of a surrogate pair, return a single code point.
      return codePoint.runes.single;
    } else {
      throw "Invalid UTF-8 byte sequence: ${encoded}";
    }
  }

  /**
   * The [byteOffset] at which [currentAsUnicode] last decoded a non-ASCII
   * character, used to avoid decoding the same character twice.
   */
  int lastUnicodeOffset = -1;

  /**
   * Returns the code point of the current character, whose first byte is
   * [next].
   *
   * Values below 128 are returned unchanged. Otherwise the character is
   * decoded with [nextCodePoint], which moves [byteOffset] to the last byte of
   * the character. If [byteOffset] still equals the offset recorded by the
   * previous decoding, [next] is returned as is.
   */
  int currentAsUnicode(int next) {
    if (next < 128) return next;
    // Check if currentAsUnicode was already invoked.
    if (byteOffset == lastUnicodeOffset) return next;
    int res = nextCodePoint(byteOffset, next, true);
    lastUnicodeOffset = byteOffset;
    return res;
  }

  /**
   * Updates [utf8Slack] for the bytes from [startScanOffset] up to (but not
   * including) the current [byteOffset]: the slack grows by the number of
   * bytes in that range minus the length of the string they decode to.
   */
  void handleUnicode(int startScanOffset) {
    int end = byteOffset;
    // TODO(lry): this measurably slows down the scanner for files with unicode.
    String s = UTF8.decode(bytes.sublist(startScanOffset, end));
    utf8Slack += (end - startScanOffset) - s.length;
  }

  /**
   * This field remembers the byte offset of the last character decoded with
   * [nextCodePoint] that used two code units in UTF-16.
   *
   * [nextCodePoint] returns a single code point for each unicode character,
   * even if it needs two code units in UTF-16.
   *
   * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
   * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the
   * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should
   * return the offset of the first one, which is one position more left than
   * the [utf8Slack].
   */
  int stringOffsetSlackOffset = -1;

  /**
   * Returns the offset in the decoded string that corresponds to the current
   * [byteOffset], i.e. [byteOffset] corrected by [utf8Slack], and by one more
   * position when the current character is a surrogate pair (see
   * [stringOffsetSlackOffset]).
   */
  int get stringOffset {
    if (stringOffsetSlackOffset == byteOffset) {
      return byteOffset - utf8Slack - 1;
    } else {
      return byteOffset - utf8Slack;
    }
  }

  /** Returns the token that follows [tokens]. */
  Token firstToken() => tokens.next;

  /** Returns the current [tail] of the token list. */
  Token previousToken() => tail;

  /**
   * Appends a [StringToken] built directly from [bytes] to the token list and
   * makes it the new [tail].
   *
   * The token is created from the byte offsets [start] and
   * [byteOffset] + [extraOffset]; [asciiOnly] is passed through to
   * [StringToken.fromUtf8Bytes] unchanged.
   */
  void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
      [int extraOffset = 0]) {
    tail.next = new StringToken.fromUtf8Bytes(
        info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
    tail = tail.next;
  }

  /**
   * Returns true once [byteOffset] has reached the last element of [bytes]
   * (the terminating 0) or moved beyond it.
   */
  bool atEndOfFile() => byteOffset >= bytes.length - 1;
}
OLD | NEW |