Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(383)

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Re-add ArrayBasedScanner, minor fixes. Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 part of scanner;
6
7 /**
8 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
9 * that point to substrings.
10 */
11 class Utf8BytesScanner extends ArrayBasedScanner {
/**
 * The file content as UTF-8 encoded bytes. Both constructors call
 * [ensureZeroTermination] so that the last element is 0.
 */
List<int> bytes;

/**
 * Points to the offset of the byte last returned by [advance].
 *
 * Starts at -1, so the first call to [advance] returns the byte at index 0.
 *
 * After invoking [currentAsUnicode], the [byteOffset] points to the last
 * byte that is part of the (unicode or ASCII) character. That way, [advance]
 * can always increase the byte offset by 1.
 */
int byteOffset = -1;

/**
 * The getter [scanOffset] is expected to return the index where the current
 * character *starts*. In case of a non-ascii character, after invoking
 * [currentAsUnicode], the byte offset points to the *last* byte.
 *
 * This field keeps track of the number of bytes for the current unicode
 * character. For example, if bytes 7,8,9 encode one unicode character, the
 * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
 * will be 2, so that [scanOffset] returns 7.
 */
int scanSlack = 0;

/**
 * Holds the [byteOffset] value for which the current [scanSlack] is valid.
 *
 * Written by [nextCodePoint] when it advances. [scanOffset] only subtracts
 * [scanSlack] while [byteOffset] still equals this value.
 */
int scanSlackOffset = -1;
40
/**
 * The byte offset at which the current character begins.
 *
 * While [byteOffset] still equals [scanSlackOffset], the current character
 * was decoded as a multi-byte sequence and [byteOffset] sits on its last
 * byte, so [scanSlack] is subtracted. Otherwise [byteOffset] is returned
 * unchanged.
 */
int get scanOffset {
  final bool slackApplies = byteOffset == scanSlackOffset;
  return slackApplies ? byteOffset - scanSlack : byteOffset;
}
52
/**
 * The difference between the number of bytes and the number of corresponding
 * string characters, up to the current [byteOffset].
 *
 * Increased by [nextCodePoint] (when it advances) and by [handleUnicode];
 * subtracted from [byteOffset] by [stringOffset].
 */
int utf8Slack = 0;
58
/**
 * Creates a new Utf8BytesScanner. The source file is expected to be a
 * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the
 * string text of the source file is decoded.
 *
 * The list of UTF-8 bytes [file.slowUtf8Bytes()] is expected to return an
 * array whose last element is '0' to signal the end of the file. If this
 * is not the case, the entire array is copied before scanning.
 *
 * Both [file] and [includeComments] are forwarded to the superclass
 * constructor.
 */
Utf8BytesScanner(SourceFile file, {bool includeComments: false})
    : bytes = file.slowUtf8Bytes(),
      super(file, includeComments) {
  // Make sure [bytes] ends with the 0 byte that marks the end of the file.
  ensureZeroTermination();
}
73
/**
 * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
 *
 * The last element of the list is expected to be '0' to signal the end of
 * the file. If this is not the case, the entire array is copied before
 * scanning.
 *
 * No source file is associated with the scanner: `null` is passed to the
 * superclass constructor in its place, together with [includeComments].
 */
Utf8BytesScanner.fromBytes(this.bytes, {bool includeComments: false})
    : super(null, includeComments) {
  // Make sure [bytes] ends with the 0 byte that marks the end of the file.
  ensureZeroTermination();
}
85
/**
 * Makes sure [bytes] ends with a 0 byte, which signals the end of the file.
 *
 * If [bytes] is empty or its last element is not 0, [bytes] is replaced by
 * a new [Uint8List] that is one element longer, holds a copy of the
 * original content and ends with 0. Otherwise [bytes] is left untouched.
 */
void ensureZeroTermination() {
  if (bytes.isEmpty || bytes[bytes.length - 1] != 0) {
    // TODO(lry), abort instead of copying the array, or warn?
    var newBytes = new Uint8List(bytes.length + 1);
    // A Uint8List has a fixed length, so addAll() would throw an
    // UnsupportedError. Copy into the already allocated slots instead.
    newBytes.setRange(0, bytes.length, bytes);
    newBytes[bytes.length] = 0;
    bytes = newBytes;
  }
}
95
/** Moves [byteOffset] one byte forward and returns the byte found there. */
int advance() {
  byteOffset += 1;
  return bytes[byteOffset];
}
97
/** Returns the byte following [byteOffset] without moving [byteOffset]. */
int peek() {
  final int nextOffset = byteOffset + 1;
  return bytes[nextOffset];
}
99
/**
 * Decodes the multi-byte UTF-8 sequence that begins at [startOffset] and
 * returns it as a single unicode code point. [nextByte] is the first byte
 * of that sequence.
 *
 * When [advance] is true, [byteOffset] is moved to the last byte of the
 * sequence and [utf8Slack], [scanSlack] and [scanSlackOffset] are updated.
 * If the character needs two UTF-16 code units, [stringOffsetSlackOffset]
 * is updated as well.
 *
 * Throws a string describing the offending bytes if the decoded text is
 * neither one nor two UTF-16 code units long.
 */
int nextCodePoint(int startOffset, int nextByte, bool advance) {
  // The run of 1 bits at the top of the first byte gives the length of the
  // sequence. Start at the minimum of 2 and add one per further 1 bit.
  int numBytes = 2;
  for (int mask = 0x20; (nextByte & mask) != 0; mask >>= 1) {
    numBytes++;
  }
  final int end = startOffset + numBytes;
  if (advance) {
    byteOffset = end - 1;
  }
  // TODO(lry), measurably slow, decode creates first a Utf8Decoder and a
  // _Utf8Decoder instance. Also the sublist is eagerly allocated.
  final String decoded = UTF8.decode(bytes.sublist(startOffset, end));
  switch (decoded.length) {
    case 1:
      if (advance) {
        utf8Slack += (numBytes - 1);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
      }
      return decoded.codeUnitAt(0);
    case 2:
      if (advance) {
        utf8Slack += (numBytes - 2);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
        stringOffsetSlackOffset = byteOffset;
      }
      // A surrogate pair is reported as one code point.
      return decoded.runes.single;
    default:
      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
  }
}
141
/**
 * The [byteOffset] reached by the most recent decode performed in
 * [currentAsUnicode], or -1 if nothing has been decoded yet.
 */
int lastUnicodeOffset = -1;

/**
 * Returns the unicode code point of the current character, whose first
 * byte is [next].
 *
 * [next] is returned unchanged if it is below 128, or if [byteOffset]
 * equals [lastUnicodeOffset], i.e. the character at this position was
 * already decoded. Otherwise the character is decoded with
 * [nextCodePoint], which advances [byteOffset] to its last byte.
 */
int currentAsUnicode(int next) {
  final bool isAscii = next < 128;
  if (isAscii || byteOffset == lastUnicodeOffset) {
    return next;
  }
  final int codePoint = nextCodePoint(byteOffset, next, true);
  // Remember the advanced offset so that a repeated call does not decode again.
  lastUnicodeOffset = byteOffset;
  return codePoint;
}
151
/**
 * Adds to [utf8Slack] the difference between the number of bytes in the
 * range from [startScanOffset] up to (not including) [byteOffset] and the
 * length of the string those bytes decode to.
 */
void handleUnicode(int startScanOffset) {
  final int end = byteOffset;
  // TODO(lry), this measurably slows down the scanner for files with unicode.
  final String decoded = UTF8.decode(bytes.sublist(startScanOffset, end));
  final int byteCount = end - startScanOffset;
  utf8Slack += byteCount - decoded.length;
}
158
kasperl 2013/10/17 08:50:39 That's a lot of whitespace.
lukas 2013/10/17 17:49:34 Done.
159
160
/**
 * This field remembers the byte offset of the last character decoded with
 * [nextCodePoint] that used two code units in UTF-16.
 *
 * [nextCodePoint] returns a single code point for each unicode character,
 * even if it needs two code units in UTF-16.
 *
 * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
 * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the
 * [byteOffset] points to the last of the 4 bytes. The [stringOffset] should
 * return the offset of the first code unit, which is one position further
 * left than [byteOffset] minus [utf8Slack].
 */
int stringOffsetSlackOffset = -1;
175
/**
 * The offset of the current character, measured in string characters
 * rather than bytes: [byteOffset] minus [utf8Slack].
 *
 * While [byteOffset] still equals [stringOffsetSlackOffset], the current
 * character occupies two UTF-16 code units, so the result is moved one
 * further position to the left.
 */
int get stringOffset {
  int offset = byteOffset - utf8Slack;
  if (stringOffsetSlackOffset == byteOffset) {
    offset -= 1;
  }
  return offset;
}
183
/** Returns the token that follows [tokens]. */
Token firstToken() {
  return tokens.next;
}
/** Returns [tail], the token most recently appended. */
Token previousToken() {
  return tail;
}
186
187
/**
 * Creates a [StringToken] over [bytes] and appends it after [tail], making
 * it the new [tail].
 *
 * [info], [start] and [asciiOnly] are handed to
 * [StringToken.fromUtf8Bytes] unchanged. The end position passed along is
 * [byteOffset] plus [extraOffset].
 */
void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
                          [int extraOffset = 0]) {
  var token = new StringToken.fromUtf8Bytes(
      info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
  tail.next = token;
  tail = token;
}
194 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698