Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1297)

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart

Issue 694353007: Move dart2js from sdk/lib/_internal/compiler to pkg/compiler (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 part of scanner;
6
/**
 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
 * that point to substrings.
 */
class Utf8BytesScanner extends ArrayBasedScanner {
  /**
   * The file content as UTF-8 bytes.
   *
   * Both constructors call [ensureZeroTermination], so after construction the
   * last element of this list is 0.
   */
  List<int> bytes;

  /**
   * Points to the offset of the last byte returned by [advance].
   *
   * After invoking [currentAsUnicode], the [byteOffset] points to the last
   * byte that is part of the (unicode or ASCII) character. That way, [advance]
   * can always increase the byte offset by 1.
   */
  int byteOffset = -1;

  /**
   * The getter [scanOffset] is expected to return the index where the current
   * character *starts*. In case of a non-ascii character, after invoking
   * [currentAsUnicode], the byte offset points to the *last* byte.
   *
   * This field keeps track of the number of bytes for the current unicode
   * character. For example, if bytes 7,8,9 encode one unicode character, the
   * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
   * will be 2, so that [scanOffset] returns 7.
   */
  int scanSlack = 0;

  /**
   * Holds the [byteOffset] value for which the current [scanSlack] is valid.
   */
  int scanSlackOffset = -1;

  /**
   * Returns the byte offset of the first byte that belongs to the current
   * character.
   */
  int get scanOffset {
    // [scanSlack] only applies while [byteOffset] still sits on the byte for
    // which it was recorded.
    if (byteOffset == scanSlackOffset) {
      return byteOffset - scanSlack;
    } else {
      return byteOffset;
    }
  }

  /**
   * The difference between the number of bytes and the number of corresponding
   * string characters, up to the current [byteOffset].
   */
  int utf8Slack = 0;

  /**
   * Creates a new Utf8BytesScanner. The source file is expected to be a
   * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the
   * string text of the source file is decoded.
   *
   * The list of UTF-8 bytes [file.slowUtf8Bytes()] is expected to return an
   * array whose last element is '0' to signal the end of the file. If this
   * is not the case, the entire array is copied before scanning.
   *
   * A UTF-8 byte order mark at the very start of the file is skipped.
   */
  Utf8BytesScanner(SourceFile file, {bool includeComments: false})
      : bytes = file.slowUtf8Bytes(),
        super(file, includeComments) {
    ensureZeroTermination();
    // Skip a leading BOM.
    if (_containsBomAt(0)) byteOffset += 3;
  }

  /**
   * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
   *
   * The last element of the list is expected to be '0' to signal the end of
   * the file. If this is not the case, the entire array is copied before
   * scanning.
   */
  Utf8BytesScanner.fromBytes(this.bytes, {bool includeComments: false})
      : super(null, includeComments) {
    ensureZeroTermination();
  }

  /**
   * Makes sure that [bytes] ends with a 0 element.
   *
   * If [bytes] is empty or its last element is not 0, [bytes] is replaced by
   * a new [Uint8List] holding the same content followed by a single 0.
   * Otherwise [bytes] is left untouched.
   */
  void ensureZeroTermination() {
    if (bytes.isEmpty || bytes[bytes.length - 1] != 0) {
      // TODO(lry): abort instead of copying the array, or warn?
      int length = bytes.length;
      // A freshly created Uint8List is zero-filled, so the additional last
      // element already is the terminator once the content is copied in.
      bytes = new Uint8List(length + 1)..setRange(0, length, bytes);
    }
  }

  /**
   * Returns true if the three bytes starting at [offset] are the UTF-8
   * encoding of the byte order mark and at least one more byte follows them.
   */
  bool _containsBomAt(int offset) {
    const BOM_UTF8 = const [0xEF, 0xBB, 0xBF];

    return offset + 3 < bytes.length &&
        bytes[offset] == BOM_UTF8[0] &&
        bytes[offset + 1] == BOM_UTF8[1] &&
        bytes[offset + 2] == BOM_UTF8[2];
  }

  /** Moves [byteOffset] forward by one and returns the byte at that offset. */
  int advance() => bytes[++byteOffset];

  /** Returns the byte following [byteOffset] without moving [byteOffset]. */
  int peek() => bytes[byteOffset + 1];

  /**
   * Returns the unicode code point starting at the byte offset [startOffset]
   * with the byte [nextByte]. If [advance] is true the current [byteOffset]
   * is advanced to the last byte of the code point.
   *
   * Throws a [String] describing the offending bytes if the sequence
   * announced by [nextByte] extends past the end of [bytes], or if the bytes
   * decode to more than two UTF-16 code units. Scanner state is left
   * unchanged when the sequence extends past the end of [bytes].
   */
  int nextCodePoint(int startOffset, int nextByte, bool advance) {
    // The number of 1s in the first byte indicate the number of bytes, at
    // least 2. For a malformed lead byte this loop only stops once [bit] has
    // been shifted down to 0, so [numBytes] can become as large as 8.
    int numBytes = 2;
    int bit = 0x20;
    while ((nextByte & bit) != 0) {
      numBytes++;
      bit >>= 1;
    }
    int end = startOffset + numBytes;
    if (end > bytes.length) {
      // The lead byte announces more bytes than are left in the input. Report
      // this like any other invalid sequence instead of letting [sublist]
      // fail with a RangeError after [byteOffset] was moved out of bounds.
      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset)}";
    }
    if (advance) {
      byteOffset = end - 1;
    }
    // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
    // _Utf8Decoder instance. Also the sublist is eagerly allocated.
    List<int> encoded = bytes.sublist(startOffset, end);
    String codePoint = UTF8.decode(encoded);
    if (codePoint.length == 0) {
      // The UTF-8 decoder discards leading BOM characters.
      // TODO(floitsch): don't just assume that removed characters were the
      // BOM.
      assert(_containsBomAt(startOffset));
      codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
    }
    if (codePoint.length == 1) {
      if (advance) {
        // [numBytes] bytes became one code unit.
        utf8Slack += (numBytes - 1);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
      }
      return codePoint.codeUnitAt(0);
    } else if (codePoint.length == 2) {
      if (advance) {
        // [numBytes] bytes became two code units; [stringOffset] compensates
        // for the second code unit while [byteOffset] stays on this byte.
        utf8Slack += (numBytes - 2);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
        stringOffsetSlackOffset = byteOffset;
      }
      // In case of a surrogate pair, return a single code point.
      return codePoint.runes.single;
    } else {
      throw "Invalid UTF-8 byte sequence: $encoded";
    }
  }

  /**
   * The [byteOffset] reached by the most recent decoding done in
   * [currentAsUnicode], or -1 if nothing was decoded yet.
   */
  int lastUnicodeOffset = -1;

  /**
   * Returns [next] unchanged if it is below 128 or if [currentAsUnicode] was
   * already invoked at the current [byteOffset]. Otherwise decodes the code
   * point that starts at [byteOffset] with the byte [next], advances
   * [byteOffset] to its last byte and returns the code point.
   */
  int currentAsUnicode(int next) {
    if (next < 128) return next;
    // Check if currentAsUnicode was already invoked.
    if (byteOffset == lastUnicodeOffset) return next;
    int res = nextCodePoint(byteOffset, next, true);
    lastUnicodeOffset = byteOffset;
    return res;
  }

  /**
   * Decodes the bytes from [startScanOffset] (inclusive) to [byteOffset]
   * (exclusive) and adds the difference between the number of bytes and the
   * length of the decoded string to [utf8Slack].
   */
  void handleUnicode(int startScanOffset) {
    int end = byteOffset;
    // TODO(lry): this measurably slows down the scanner for files with unicode.
    String s = UTF8.decode(bytes.sublist(startScanOffset, end));
    utf8Slack += (end - startScanOffset) - s.length;
  }

  /**
   * This field remembers the byte offset of the last character decoded with
   * [nextCodePoint] that used two code units in UTF-16.
   *
   * [nextCodePoint] returns a single code point for each unicode character,
   * even if it needs two code units in UTF-16.
   *
   * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
   * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the
   * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should
   * return the offset of the first one, which is one position more left than
   * the [utf8Slack].
   */
  int stringOffsetSlackOffset = -1;

  /**
   * Returns [byteOffset] reduced by [utf8Slack], and by one more while
   * [byteOffset] is still on the last byte of a character that needs two
   * UTF-16 code units (see [stringOffsetSlackOffset]).
   */
  int get stringOffset {
    if (stringOffsetSlackOffset == byteOffset) {
      return byteOffset - utf8Slack - 1;
    } else {
      return byteOffset - utf8Slack;
    }
  }

  /** Returns the token that follows [tokens]. */
  Token firstToken() => tokens.next;

  /** Returns the current [tail] token. */
  Token previousToken() => tail;

  /**
   * Creates a [StringToken] with [StringToken.fromUtf8Bytes] from [bytes],
   * using [start] and `byteOffset + extraOffset` as its bounds, links it
   * after [tail] and makes it the new [tail].
   */
  void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
                            [int extraOffset = 0]) {
    tail.next = new StringToken.fromUtf8Bytes(
        info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
    tail = tail.next;
  }

  /**
   * Returns true once [byteOffset] is at or beyond the last element of
   * [bytes].
   */
  bool atEndOfFile() => byteOffset >= bytes.length - 1;
}
OLDNEW
« no previous file with comments | « sdk/lib/_internal/compiler/implementation/scanner/token.dart ('k') | sdk/lib/_internal/compiler/implementation/script.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698