Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(389)

Side by Side Diff: packages/utf/lib/src/utf/utf16.dart

Issue 2989763002: Update charted to 0.4.8 and roll (Closed)
Patch Set: Removed Cutch from list of reviewers Created 3 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « packages/utf/lib/src/shared.dart ('k') | packages/utf/lib/src/utf/utf32.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 part of utf;
6
// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
/**
 * Returns the Unicode codepoints that make up [str].
 *
 * The code units of [str] are handed to [utf16CodeUnitsToCodepoints], using
 * that function's default arguments.
 */
List<int> stringToCodepoints(String str) {
  // A Dart string exposes 16-bit code units on every implementation, so they
  // have to be converted before they can be treated as codepoints.
  final List<int> codeUnits = str.codeUnits;
  return utf16CodeUnitsToCodepoints(codeUnits);
}
16
/**
 * Builds a string out of the given Unicode [codepoints].
 *
 * *Deprecated* Use [String.fromCharCodes] instead.
 */
@deprecated
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);
/**
 * Lazily decodes UTF-16 [bytes]: the returned iterable converts only as much
 * of the input as its consumer actually iterates over.
 *
 * The byte order is taken from a leading BOM; without one, big-endian is
 * assumed. A leading BOM is always stripped.
 *
 * [offset] and [length] select the range of [bytes] to decode. Pass null as
 * [replacementCodepoint] to get an ArgumentError on bad input instead of a
 * substituted value; it defaults to U+FFFD.
 */
IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes,
    [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked by the iterable whenever it needs a code unit source.
  Utf16BytesToCodeUnitsDecoder makeDecoder() {
    return new Utf16BytesToCodeUnitsDecoder(
        bytes, offset, length, replacementCodepoint);
  }

  return new IterableUtf16Decoder._(makeDecoder, replacementCodepoint);
}
41
/**
 * Lazily decodes UTF-16BE [bytes]: the returned iterable converts only as
 * much of the input as its consumer actually iterates over.
 *
 * A leading BOM is stripped unless [stripBom] is set to false.
 *
 * [offset] and [length] select the range of [bytes] to decode. Pass null as
 * [replacementCodepoint] to get an ArgumentError on bad input instead of a
 * substituted value; it defaults to U+FFFD.
 */
IterableUtf16Decoder decodeUtf16beAsIterable(List<int> bytes,
    [int offset = 0,
    int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked by the iterable whenever it needs a code unit source.
  Utf16beBytesToCodeUnitsDecoder makeDecoder() {
    return new Utf16beBytesToCodeUnitsDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }

  return new IterableUtf16Decoder._(makeDecoder, replacementCodepoint);
}
57
/**
 * Lazily decodes UTF-16LE [bytes]: the returned iterable converts only as
 * much of the input as its consumer actually iterates over.
 *
 * A leading BOM is stripped unless [stripBom] is set to false.
 *
 * [offset] and [length] select the range of [bytes] to decode. Pass null as
 * [replacementCodepoint] to get an ArgumentError on bad input instead of a
 * substituted value; it defaults to U+FFFD.
 */
IterableUtf16Decoder decodeUtf16leAsIterable(List<int> bytes,
    [int offset = 0,
    int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked by the iterable whenever it needs a code unit source.
  Utf16leBytesToCodeUnitsDecoder makeDecoder() {
    return new Utf16leBytesToCodeUnitsDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }

  return new IterableUtf16Decoder._(makeDecoder, replacementCodepoint);
}
73
/**
 * Decodes a sequence of UTF-16 encoded [bytes] into a String.
 *
 * A leading BOM is always stripped. [offset] and [length] select the range of
 * [bytes] to decode. Pass null as [replacementCodepoint] to get an
 * ArgumentError on bad input instead of a substituted value; it defaults to
 * U+FFFD.
 */
String decodeUtf16(List<int> bytes,
    [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> UTF-16 code units.
  final List<int> codeUnits = new Utf16BytesToCodeUnitsDecoder(
          bytes, offset, length, replacementCodepoint)
      .decodeRest();
  // Step 2: code units -> codepoints, over the whole code unit list.
  final List<int> codepoints =
      utf16CodeUnitsToCodepoints(codeUnits, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}
88
/**
 * Decodes a sequence of UTF-16BE encoded [bytes] into a String.
 *
 * A leading BOM is stripped unless [stripBom] is set to false. [offset] and
 * [length] select the range of [bytes] to decode. Pass null as
 * [replacementCodepoint] to get an ArgumentError on bad input instead of a
 * substituted value; it defaults to U+FFFD.
 */
String decodeUtf16be(List<int> bytes,
    [int offset = 0,
    int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> UTF-16 code units.
  final Utf16beBytesToCodeUnitsDecoder byteDecoder =
      new Utf16beBytesToCodeUnitsDecoder(
          bytes, offset, length, stripBom, replacementCodepoint);
  final List<int> codeUnits = byteDecoder.decodeRest();
  // Step 2: code units -> codepoints, over the whole code unit list.
  final List<int> codepoints =
      utf16CodeUnitsToCodepoints(codeUnits, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}
104
/**
 * Decodes a sequence of UTF-16LE encoded [bytes] into a String.
 *
 * A leading BOM is stripped unless [stripBom] is set to false. [offset] and
 * [length] select the range of [bytes] to decode. Pass null as
 * [replacementCodepoint] to get an ArgumentError on bad input instead of a
 * substituted value; it defaults to U+FFFD.
 */
String decodeUtf16le(List<int> bytes,
    [int offset = 0,
    int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> UTF-16 code units.
  final Utf16leBytesToCodeUnitsDecoder byteDecoder =
      new Utf16leBytesToCodeUnitsDecoder(
          bytes, offset, length, stripBom, replacementCodepoint);
  final List<int> codeUnits = byteDecoder.decodeRest();
  // Step 2: code units -> codepoints, over the whole code unit list.
  final List<int> codepoints =
      utf16CodeUnitsToCodepoints(codeUnits, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}
120
/**
 * Encodes [str] as a list of UTF-16 bytes.
 *
 * The encoding is delegated to [encodeUtf16be] with BOM writing switched on,
 * so the result starts with a big-endian byte-order marker.
 */
List<int> encodeUtf16(String str) {
  return encodeUtf16be(str, true);
}
127
/**
 * Encodes [str] as a list of UTF-16BE bytes.
 *
 * No BOM is emitted unless [writeBOM] is true, in which case the result
 * starts with UNICODE_UTF_BOM_HI followed by UNICODE_UTF_BOM_LO.
 */
List<int> encodeUtf16be(String str, [bool writeBOM = false]) {
  final List<int> codeUnits = _stringToUtf16CodeUnits(str);
  final int bomLength = writeBOM ? 2 : 0;
  // Two output bytes per code unit, plus room for the optional BOM.
  final List<int> bytes = new List<int>(bomLength + 2 * codeUnits.length);
  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_HI;
    bytes[1] = UNICODE_UTF_BOM_LO;
  }
  for (int k = 0; k < codeUnits.length; k++) {
    final int unit = codeUnits[k];
    final int at = bomLength + 2 * k;
    // Big-endian: the upper byte of the code unit is written first.
    bytes[at] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
    bytes[at + 1] = unit & UNICODE_BYTE_ZERO_MASK;
  }
  return bytes;
}
147
/**
 * Encodes [str] as a list of UTF-16LE bytes.
 *
 * No BOM is emitted unless [writeBOM] is true, in which case the result
 * starts with UNICODE_UTF_BOM_LO followed by UNICODE_UTF_BOM_HI.
 */
List<int> encodeUtf16le(String str, [bool writeBOM = false]) {
  final List<int> codeUnits = _stringToUtf16CodeUnits(str);
  final int bomLength = writeBOM ? 2 : 0;
  // Two output bytes per code unit, plus room for the optional BOM.
  final List<int> bytes = new List<int>(bomLength + 2 * codeUnits.length);
  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_LO;
    bytes[1] = UNICODE_UTF_BOM_HI;
  }
  for (int k = 0; k < codeUnits.length; k++) {
    final int unit = codeUnits[k];
    final int at = bomLength + 2 * k;
    // Little-endian: the lower byte of the code unit is written first.
    bytes[at] = unit & UNICODE_BYTE_ZERO_MASK;
    bytes[at + 1] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
  }
  return bytes;
}
167
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * UTF-16 byte-order marker (BOM) of either endianness.
 *
 * [length], when given, limits how many bytes from [offset] are considered.
 * Returns true if either [hasUtf16beBom] or [hasUtf16leBom] reports a BOM.
 */
// The first parameter was previously named `utf32EncodedBytes`; it is
// positional, so renaming it to match the sibling helpers does not affect
// callers.
bool hasUtf16Bom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  return hasUtf16beBom(utf16EncodedBytes, offset, length) ||
      hasUtf16leBom(utf16EncodedBytes, offset, length);
}
176
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * big-endian byte-order marker (BOM).
 *
 * [length], when given, limits how many bytes from [offset] are considered;
 * otherwise everything up to the end of the list is.
 */
bool hasUtf16beBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  final int end = length == null ? utf16EncodedBytes.length : offset + length;
  // Fewer than two bytes in range: there is no room for a BOM.
  if (offset + 2 > end) {
    return false;
  }
  if (utf16EncodedBytes[offset] != UNICODE_UTF_BOM_HI) {
    return false;
  }
  return utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_LO;
}
187
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * little-endian byte-order marker (BOM).
 *
 * [length], when given, limits how many bytes from [offset] are considered;
 * otherwise everything up to the end of the list is.
 */
bool hasUtf16leBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  final int end = length == null ? utf16EncodedBytes.length : offset + length;
  // Fewer than two bytes in range: there is no room for a BOM.
  if (offset + 2 > end) {
    return false;
  }
  if (utf16EncodedBytes[offset] != UNICODE_UTF_BOM_LO) {
    return false;
  }
  return utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI;
}
198
/**
 * Returns the UTF-16 code units used by the encoders in this file to
 * serialize [str].
 */
List<int> _stringToUtf16CodeUnits(String str) {
  // `str.codeUnits` holds UTF-16 code units, not codepoints, so a character
  // outside the BMP appears there as two surrogate values. Decode to real
  // codepoints first so that [codepointsToUtf16CodeUnits] receives the input
  // its name promises and can emit proper surrogate pairs.
  // NOTE(review): presumably the helper treats bare surrogate values as
  // invalid codepoints - confirm against its definition.
  final List<int> codepoints = stringToCodepoints(str);
  return codepointsToUtf16CodeUnits(codepoints);
}
202
/**
 * Signature of the zero-argument callback [IterableUtf16Decoder] invokes to
 * obtain an iterator over UTF-16 code units.
 */
typedef ListRangeIterator _CodeUnitsProvider();

/**
 * Return type of [decodeUtf16AsIterable] and variants. Each request for an
 * iterator asks [codeunitsProvider] for a code unit source and wraps it in a
 * [Utf16CodeUnitDecoder], so bytes are only translated as the user of the
 * iterator asks for them. (Note: results are not cached.)
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf16Decoder extends IterableBase<int> {
  /** Callback that supplies the code units to decode. */
  final _CodeUnitsProvider codeunitsProvider;

  /** Replacement value forwarded to every iterator this iterable creates. */
  final int replacementCodepoint;

  IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint);

  Utf16CodeUnitDecoder get iterator {
    final ListRangeIterator codeUnits = codeunitsProvider();
    return new Utf16CodeUnitDecoder.fromListRangeIterator(
        codeUnits, replacementCodepoint);
  }
}
222
/**
 * Converts UTF-16 encoded bytes to UTF-16 code units. Bytes are consumed in
 * pairs through [decode]; a single left-over byte at the end of the input
 * yields [replacementCodepoint], or an ArgumentError when that is null.
 *
 * The unnamed factory relies on a BOM to determine endian-ness, and defaults
 * to BE.
 */
abstract class Utf16BytesToCodeUnitsDecoder implements ListRangeIterator {
  // TODO(kevmoo): should this field be private?
  /** Iterator over the raw encoded bytes. */
  final ListRangeIterator utf16EncodedBytesIterator;

  /** Value reported for a dangling trailing byte; null means "throw". */
  final int replacementCodepoint;

  /** Code unit produced by the latest [moveNext], or null if there is none. */
  int _current;

  Utf16BytesToCodeUnitsDecoder._fromListRangeIterator(
      this.utf16EncodedBytesIterator, this.replacementCodepoint);

  /**
   * Creates the decoder matching the BOM found at [offset]: little-endian for
   * a little-endian BOM, big-endian otherwise. A detected BOM is excluded
   * from the range given to the concrete decoder.
   */
  factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes,
      [int offset = 0,
      int length,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
    final int byteCount =
        length != null ? length : utf16EncodedBytes.length - offset;

    if (hasUtf16beBom(utf16EncodedBytes, offset, byteCount)) {
      return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          byteCount - 2, false, replacementCodepoint);
    }
    if (hasUtf16leBom(utf16EncodedBytes, offset, byteCount)) {
      return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          byteCount - 2, false, replacementCodepoint);
    }
    // No BOM: fall back to big-endian over the unmodified range.
    return new Utf16beBytesToCodeUnitsDecoder(
        utf16EncodedBytes, offset, byteCount, false, replacementCodepoint);
  }

  /**
   * Provides a fast way to decode the rest of the source bytes in a single
   * call. This method trades memory for improved speed in that it potentially
   * over-allocates the List containing results.
   */
  List<int> decodeRest() {
    final List<int> buffer = new List<int>(remaining);
    int filled = 0;
    while (moveNext()) {
      buffer[filled] = current;
      filled++;
    }
    if (filled != buffer.length) {
      // Fewer code units than reserved slots: hand back an exact-size copy.
      final List<int> exact = new List<int>(filled);
      exact.setRange(0, filled, buffer);
      return exact;
    }
    return buffer;
  }

  int get current {
    return _current;
  }

  /**
   * Advances to the next code unit. Returns false once no bytes are left.
   * Throws an ArgumentError for a lone trailing byte when
   * [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    final int bytesLeft = utf16EncodedBytesIterator.remaining;
    if (bytesLeft == 0) {
      return false;
    }
    if (bytesLeft != 1) {
      _current = decode();
      return true;
    }
    // A single byte is left over: consume it, then substitute or fail.
    utf16EncodedBytesIterator.moveNext();
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF16 at ${utf16EncodedBytesIterator.position}");
    }
    _current = replacementCodepoint;
    return true;
  }

  /** Position in code units: half of the underlying byte position. */
  int get position {
    return utf16EncodedBytesIterator.position ~/ 2;
  }

  /** Moves back [by] code units, i.e. twice as many bytes. */
  void backup([int by = 1]) {
    utf16EncodedBytesIterator.backup(2 * by);
  }

  /** Code units left, counting a lone trailing byte as one more unit. */
  int get remaining {
    return (utf16EncodedBytesIterator.remaining + 1) ~/ 2;
  }

  /** Skips [count] code units, i.e. twice as many bytes. */
  void skip([int count = 1]) {
    utf16EncodedBytesIterator.skip(2 * count);
  }

  /** Reads the next two bytes and combines them into one code unit. */
  int decode();
}
312
/**
 * Converts UTF-16BE encoded bytes to UTF-16 code units by combining two
 * consecutive bytes, the first one being the upper eight bits of the code
 * unit.
 */
class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Decodes [length] bytes of [utf16EncodedBytes] starting at [offset]. A
   * leading big-endian BOM is skipped when [stripBom] is true.
   */
  Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes,
      [int offset = 0,
      int length,
      bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : super._fromListRangeIterator(
            new ListRange(utf16EncodedBytes, offset, length).iterator,
            replacementCodepoint) {
    final bool bomToStrip =
        stripBom && hasUtf16beBom(utf16EncodedBytes, offset, length);
    if (bomToStrip) {
      // One code unit is two bytes, exactly the size of the BOM.
      skip();
    }
  }

  int decode() {
    final ListRangeIterator source = utf16EncodedBytesIterator;
    source.moveNext();
    final int highByte = source.current;
    source.moveNext();
    final int lowByte = source.current;
    return (highByte << 8) + lowByte;
  }
}
337
/**
 * Converts UTF-16LE encoded bytes to UTF-16 code units by combining two
 * consecutive bytes, the first one being the lower eight bits of the code
 * unit.
 */
class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Decodes [length] bytes of [utf16EncodedBytes] starting at [offset]. A
   * leading little-endian BOM is skipped when [stripBom] is true.
   */
  Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes,
      [int offset = 0,
      int length,
      bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : super._fromListRangeIterator(
            new ListRange(utf16EncodedBytes, offset, length).iterator,
            replacementCodepoint) {
    final bool bomToStrip =
        stripBom && hasUtf16leBom(utf16EncodedBytes, offset, length);
    if (bomToStrip) {
      // One code unit is two bytes, exactly the size of the BOM.
      skip();
    }
  }

  int decode() {
    final ListRangeIterator source = utf16EncodedBytesIterator;
    source.moveNext();
    final int lowByte = source.current;
    source.moveNext();
    final int highByte = source.current;
    return (highByte << 8) + lowByte;
  }
}
OLDNEW
« no previous file with comments | « packages/utf/lib/src/shared.dart ('k') | packages/utf/lib/src/utf/utf32.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698