Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(29)

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: fixes compiler tests Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
ngeoffray 2013/10/18 10:19:37 2011 -> 2013
lukas 2013/10/24 16:48:36 Done.
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 part of scanner;
6
7 /**
8 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
9 * that point to substrings.
10 */
11 class Utf8BytesScanner extends ArrayBasedScanner {
12 /** The file content. */
13 List<int> bytes;
14
15 /**
16 * Points to the offset of the last byte returned by [advance].
ngeoffray 2013/10/18 10:19:37 byte last -> last byte?
lukas 2013/10/24 16:48:36 Done.
17 *
18 * After invoking [currentAsUnicode], the [byteOffset] points to the last
19 * byte that is part of the (unicode or ASCII) character. That way, [advance]
20 * can always increase the byte offset by 1.
21 */
22 int byteOffset = -1;
23
24 /**
25 * The getter [scanOffset] is expected to return the index where the current
26 * character *starts*. In case of a non-ascii character, after invoking
27 * [currentAsUnicode], the byte offset points to the *last* byte.
28 *
29 * This field keeps track of the number of bytes for the current unicode
30 * character. For example, if bytes 7,8,9 encode one unicode character, the
31 * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
32 * will be 2, so that [scanOffset] returns 7.
33 */
34 int scanSlack = 0;
35
36 /**
37 * Holds the [byteOffset] value for which the current [scanSlack] is valid.
38 */
39 int scanSlackOffset = -1;
40
/**
 * The byte offset at which the current character starts.
 *
 * While [byteOffset] still equals [scanSlackOffset], the recorded
 * [scanSlack] is subtracted so that the result points at the first byte of
 * the character; otherwise [byteOffset] is returned unchanged.
 */
int get scanOffset =>
    (byteOffset == scanSlackOffset) ? byteOffset - scanSlack : byteOffset;
52
53 /**
54 * The difference between the number of bytes and the number of corresponding
55 * string characters, up to the current [byteOffset].
56 */
57 int utf8Slack = 0;
58
/**
 * Creates a scanner over the UTF-8 bytes of [file].
 *
 * The source file is expected to be a [Utf8BytesSourceFile] holding a list
 * of UTF-8 bytes; otherwise the string text of the source file is decoded.
 *
 * The list returned by [file.slowUtf8Bytes()] should end with a '0' element
 * marking the end of the file. When it does not, the whole array is copied
 * (see [ensureZeroTermination]) before scanning starts.
 */
Utf8BytesScanner(SourceFile file, {bool includeComments: false})
    : bytes = file.slowUtf8Bytes(),
      super(file, includeComments) {
  ensureZeroTermination();
}
73
/**
 * Creates a scanner directly from [bytes], a list of UTF-8 bytes.
 *
 * The list should end with a '0' element marking the end of the file. When
 * it does not, the whole array is copied (see [ensureZeroTermination])
 * before scanning starts.
 */
Utf8BytesScanner.fromBytes(this.bytes, {bool includeComments: false})
    : super(null, includeComments) {
  ensureZeroTermination();
}
85
/**
 * Guarantees that [bytes] ends with a 0 element, which marks the end of the
 * file for the scanner.
 *
 * If [bytes] is empty or its last element is not 0, [bytes] is replaced by a
 * copy that is one element longer and ends in 0. Otherwise [bytes] is left
 * untouched.
 */
void ensureZeroTermination() {
  if (bytes.isNotEmpty && bytes[bytes.length - 1] == 0) return;
  // TODO(lry): abort instead of copying the array, or warn?
  // A fresh Uint8List is zero-filled, so the extra trailing element already
  // holds the 0 terminator once the original content has been copied in.
  var newBytes = new Uint8List(bytes.length + 1);
  newBytes.setRange(0, bytes.length, bytes);
  bytes = newBytes;
}
97
/** Moves [byteOffset] forward by one and returns the byte at that offset. */
int advance() {
  byteOffset += 1;
  return bytes[byteOffset];
}
99
/** Returns the byte after [byteOffset] without changing [byteOffset]. */
int peek() {
  final int nextOffset = byteOffset + 1;
  return bytes[nextOffset];
}
101
/**
 * Decodes the multi-byte UTF-8 sequence that starts at byte offset
 * [startOffset], whose lead byte is [nextByte], and returns it as a single
 * unicode code point.
 *
 * If [advance] is true, [byteOffset] is moved to the last byte of the
 * sequence, and [utf8Slack], [scanSlack] and [scanSlackOffset] are updated
 * to match (as is [stringOffsetSlackOffset] when the character needs two
 * UTF-16 code units).
 *
 * Throws if the sequence runs past the end of [bytes] or does not decode to
 * exactly one code point.
 */
int nextCodePoint(int startOffset, int nextByte, bool advance) {
  // The number of leading 1 bits in the first byte indicates the number of
  // bytes in the sequence, at least 2. A UTF-8 sequence is never longer than
  // 4 bytes, so counting stops there: a malformed lead byte such as 0xFF
  // must not make the scanner look 8 bytes ahead.
  int numBytes = 2;
  int bit = 0x20;
  while (numBytes < 4 && (nextByte & bit) != 0) {
    numBytes++;
    bit >>= 1;
  }
  int end = startOffset + numBytes;
  if (end > bytes.length) {
    // The lead byte announces more bytes than the file still contains.
    // Report it like any other malformed sequence instead of letting
    // sublist fail with a range error.
    throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset)}";
  }
  if (advance) {
    byteOffset = end - 1;
  }
  // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
  // _Utf8Decoder instance. Also the sublist is eagerly allocated.
  String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
  if (codePoint.length == 1) {
    if (advance) {
      utf8Slack += (numBytes - 1);
      scanSlack = numBytes - 1;
      scanSlackOffset = byteOffset;
    }
    return codePoint.codeUnitAt(0);
  } else if (codePoint.length == 2) {
    if (advance) {
      utf8Slack += (numBytes - 2);
      scanSlack = numBytes - 1;
      scanSlackOffset = byteOffset;
      stringOffsetSlackOffset = byteOffset;
    }
    // In case of a surrogate pair, return a single code point.
    return codePoint.runes.single;
  } else {
    // Only expected for a source file that contains malformed UTF-8.
    throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
  }
}
143
/**
 * The [byteOffset] recorded right after the most recent decode performed by
 * [currentAsUnicode], or -1 if none has happened yet.
 */
int lastUnicodeOffset = -1;

/**
 * Returns [next] unchanged when it is below 128 or when [byteOffset] still
 * equals [lastUnicodeOffset] (meaning [currentAsUnicode] was already invoked
 * at this position). Otherwise decodes the character starting at
 * [byteOffset] with [nextCodePoint], advancing [byteOffset], and returns the
 * resulting code point.
 */
int currentAsUnicode(int next) {
  if (next < 128 || byteOffset == lastUnicodeOffset) return next;
  final int decoded = nextCodePoint(byteOffset, next, true);
  lastUnicodeOffset = byteOffset;
  return decoded;
}
153
/**
 * Adds to [utf8Slack] the difference between the number of bytes in the
 * range from [startScanOffset] up to (not including) [byteOffset] and the
 * length of the string those bytes decode to.
 */
void handleUnicode(int startScanOffset) {
  final int endOffset = byteOffset;
  final int byteCount = endOffset - startScanOffset;
  // TODO(lry): this measurably slows down the scanner for files with unicode.
  final String decoded =
      UTF8.decode(bytes.sublist(startScanOffset, endOffset));
  utf8Slack += byteCount - decoded.length;
}
160
161 /**
162 * This field remembers the byte offset of the last character decoded with
163 * [nextCodePoint] that used two code units in UTF-16.
164 *
165 * [nextCodePoint] returns a single code point for each unicode character,
166 * even if it needs two code units in UTF-16.
167 *
168 * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
169 * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the
170 * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should
171 * return the offset of the first one, which is one position more left than
172 * the [utf8Slack].
173 */
174 int stringOffsetSlackOffset = -1;
175
/**
 * The [byteOffset] reduced by [utf8Slack], and by one more while
 * [byteOffset] still equals [stringOffsetSlackOffset].
 */
int get stringOffset {
  final int offset = byteOffset - utf8Slack;
  return (stringOffsetSlackOffset == byteOffset) ? offset - 1 : offset;
}
183
/** Returns [tokens.next]. */
Token firstToken() {
  return tokens.next;
}
/** Returns [tail], the token most recently appended. */
Token previousToken() {
  return tail;
}
186
187
ngeoffray 2013/10/18 10:19:37 Extra line,
lukas 2013/10/24 16:48:36 Done.
/**
 * Appends to the token list a [StringToken] built from [bytes], covering
 * the range from [start] to [byteOffset] plus [extraOffset], and makes it
 * the new [tail].
 *
 * [info], [asciiOnly] and the current [tokenStart] are passed through to
 * [StringToken.fromUtf8Bytes] unchanged.
 */
void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
                          [int extraOffset = 0]) {
  final int end = byteOffset + extraOffset;
  tail.next = new StringToken.fromUtf8Bytes(
      info, bytes, start, end, asciiOnly, tokenStart);
  tail = tail.next;
}
194 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698