Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(47)

Side by Side Diff: pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart

Issue 2644843006: Use packages dart_parser, dart_scanner, and compiler_util. (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 library dart2js.scanner.utf8;
6
7 import 'dart:convert' show UNICODE_BOM_CHARACTER_RUNE, UTF8;
8
9 import '../io/source_file.dart' show SourceFile;
10 import '../tokens/precedence.dart' show PrecedenceInfo;
11 import '../tokens/token.dart' show StringToken, Token;
12 import 'array_based_scanner.dart' show ArrayBasedScanner;
13
/// Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
/// that point to substrings.
class Utf8BytesScanner extends ArrayBasedScanner {
  /// The file content as UTF-8 encoded bytes.
  ///
  /// The content is zero-terminated; both constructors assert that the last
  /// element is `0`.
  List<int> bytes;

  /// Points to the offset of the last byte returned by [advance].
  ///
  /// After invoking [currentAsUnicode], the [byteOffset] points to the last
  /// byte that is part of the (unicode or ASCII) character. That way,
  /// [advance] can always increase the byte offset by 1.
  int byteOffset = -1;

  /// The number of bytes that precede [byteOffset] within the current
  /// character.
  ///
  /// The getter [scanOffset] is expected to return the index where the current
  /// character *starts*. In case of a non-ASCII character, after invoking
  /// [currentAsUnicode], the byte offset points to the *last* byte.
  ///
  /// For example, if bytes 7, 8, 9 encode one unicode character, the
  /// [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
  /// will be 2, so that [scanOffset] returns 7.
  int scanSlack = 0;

  /// Holds the [byteOffset] value for which the current [scanSlack] is valid.
  int scanSlackOffset = -1;

  /// Returns the byte offset of the first byte that belongs to the current
  /// character.
  int get scanOffset {
    // The slack only applies while we are still positioned on the character
    // it was computed for.
    if (byteOffset == scanSlackOffset) {
      return byteOffset - scanSlack;
    } else {
      return byteOffset;
    }
  }

  /// The difference between the number of bytes and the number of
  /// corresponding string characters (UTF-16 code units), up to the current
  /// [byteOffset].
  int utf8Slack = 0;

  /// Creates a new Utf8BytesScanner over the UTF-8 bytes of [file].
  ///
  /// The bytes are obtained from [SourceFile.slowUtf8ZeroTerminatedBytes].
  /// The returned list must end with a `0` element that signals the end of the
  /// file; this constructor only checks that with an assertion.
  ///
  /// A leading UTF-8 byte order mark is skipped.
  ///
  /// If [includeComments] is true it is forwarded to the superclass
  /// constructor unchanged.
  Utf8BytesScanner(SourceFile file, {bool includeComments: false})
      : bytes = file.slowUtf8ZeroTerminatedBytes(),
        super(file, includeComments) {
    assert(bytes.last == 0);
    // Skip a leading BOM.
    if (_containsBomAt(0)) byteOffset += 3;
  }

  /// Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
  ///
  /// [zeroTerminatedBytes] is used directly (it is not copied), so its last
  /// element must be `0` to signal the end of the file. This is only checked
  /// with an assertion.
  ///
  /// NOTE(review): unlike the unnamed constructor, this one does not skip a
  /// leading byte order mark -- confirm whether that is intentional.
  Utf8BytesScanner.fromBytes(List<int> zeroTerminatedBytes,
      {bool includeComments: false})
      : this.bytes = zeroTerminatedBytes,
        super(null, includeComments) {
    assert(bytes.last == 0);
  }

  /// Whether the three bytes starting at [offset] are the UTF-8 encoding of
  /// the byte order mark (`EF BB BF`) and are followed by at least one more
  /// byte.
  bool _containsBomAt(int offset) {
    const bomUtf8 = const [0xEF, 0xBB, 0xBF];

    return offset + 3 < bytes.length &&
        bytes[offset] == bomUtf8[0] &&
        bytes[offset + 1] == bomUtf8[1] &&
        bytes[offset + 2] == bomUtf8[2];
  }

  /// Moves [byteOffset] forward by one and returns the byte at the new offset.
  int advance() => bytes[++byteOffset];

  /// Returns the byte following [byteOffset] without moving [byteOffset].
  int peek() => bytes[byteOffset + 1];

  /// Returns the unicode code point starting at the byte offset [startOffset]
  /// with the byte [nextByte].
  ///
  /// If [advance] is true the current [byteOffset] is advanced to the last
  /// byte of the code point, and [utf8Slack], [scanSlack] and
  /// [scanSlackOffset] are updated for the decoded character. When the
  /// character needs two UTF-16 code units, [stringOffsetSlackOffset] is
  /// updated as well.
  ///
  /// Throws a [String] describing the bytes if they decode to more than two
  /// UTF-16 code units. Errors raised by `UTF8.decode` or by `sublist` are not
  /// caught here.
  int nextCodePoint(int startOffset, int nextByte, bool advance) {
    // The number of 1s in the first byte indicate the number of bytes, at
    // least 2.
    int numBytes = 2;
    int bit = 0x20;
    while ((nextByte & bit) != 0) {
      numBytes++;
      bit >>= 1;
    }
    int end = startOffset + numBytes;
    if (advance) {
      byteOffset = end - 1;
    }
    // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
    // _Utf8Decoder instance. Also the sublist is eagerly allocated.
    String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
    if (codePoint.isEmpty) {
      // The UTF-8 decoder discards leading BOM characters.
      // TODO(floitsch): don't just assume that removed characters were the
      // BOM.
      assert(_containsBomAt(startOffset));
      codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
    }
    if (codePoint.length == 1) {
      if (advance) {
        // One UTF-16 code unit stands for [numBytes] bytes.
        utf8Slack += (numBytes - 1);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
      }
      return codePoint.codeUnitAt(0);
    } else if (codePoint.length == 2) {
      if (advance) {
        // Two UTF-16 code units stand for [numBytes] bytes.
        utf8Slack += (numBytes - 2);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
        stringOffsetSlackOffset = byteOffset;
      }
      // In case of a surrogate pair, return a single code point.
      return codePoint.runes.single;
    } else {
      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
    }
  }

  /// The [byteOffset] recorded after [currentAsUnicode] last decoded a
  /// non-ASCII character, or `-1` if it has not done so yet.
  int lastUnicodeOffset = -1;

  /// Returns [next] as a unicode code point.
  ///
  /// Values below 128 are returned unchanged. Otherwise the character that
  /// starts at [byteOffset] is decoded with [nextCodePoint], advancing
  /// [byteOffset] to its last byte.
  ///
  /// If this method was already invoked at the current [byteOffset], [next]
  /// is returned unchanged; presumably the caller then passes the previously
  /// decoded value.
  int currentAsUnicode(int next) {
    if (next < 128) return next;
    // Check if currentAsUnicode was already invoked.
    if (byteOffset == lastUnicodeOffset) return next;
    int res = nextCodePoint(byteOffset, next, true);
    lastUnicodeOffset = byteOffset;
    return res;
  }

  /// Accounts in [utf8Slack] for the bytes from [startScanOffset] up to, but
  /// not including, [byteOffset].
  ///
  /// The bytes are decoded, and the difference between the number of bytes
  /// and the length of the decoded string is added to [utf8Slack].
  void handleUnicode(int startScanOffset) {
    int end = byteOffset;
    // TODO(lry): this measurably slows down the scanner for files with unicode.
    String s = UTF8.decode(bytes.sublist(startScanOffset, end));
    utf8Slack += (end - startScanOffset) - s.length;
  }

  /// This field remembers the byte offset of the last character decoded with
  /// [nextCodePoint] that used two code units in UTF-16.
  ///
  /// [nextCodePoint] returns a single code point for each unicode character,
  /// even if it needs two code units in UTF-16.
  ///
  /// For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
  /// UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint],
  /// the [byteOffset] points to the last (of 4) bytes. The [stringOffset]
  /// should return the offset of the first one, which is one position more
  /// left than the [utf8Slack].
  int stringOffsetSlackOffset = -1;

  /// Returns [byteOffset] converted to an offset in UTF-16 code units by
  /// subtracting [utf8Slack].
  ///
  /// One more is subtracted while positioned on a character that needed two
  /// code units (see [stringOffsetSlackOffset]).
  int get stringOffset {
    if (stringOffsetSlackOffset == byteOffset) {
      return byteOffset - utf8Slack - 1;
    } else {
      return byteOffset - utf8Slack;
    }
  }

  /// Returns the token that follows the inherited `tokens` token.
  Token firstToken() => tokens.next;

  /// Returns the inherited `tail` token, i.e. the most recently appended one.
  Token previousToken() => tail;

  /// Appends a [StringToken] that covers the bytes from [start] up to
  /// `byteOffset + extraOffset` and makes it the new `tail`.
  ///
  /// [info], [asciiOnly] and the inherited `tokenStart` are passed through to
  /// [StringToken.fromUtf8Bytes] unchanged.
  void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
      [int extraOffset = 0]) {
    tail.next = new StringToken.fromUtf8Bytes(
        info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
    tail = tail.next;
  }

  /// Whether [byteOffset] has reached the last element of [bytes] (the
  /// terminating zero) or moved past it.
  bool atEndOfFile() => byteOffset >= bytes.length - 1;
}
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698