Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(168)

Side by Side Diff: pkg/utf/lib/utf16.dart

Issue 418433003: pkg/utf: fixed layout, added todos, updated docs and homepage pubspec links (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « pkg/utf/lib/utf.dart ('k') | pkg/utf/lib/utf32.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 part of utf;
6
7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
/**
 * Returns the Unicode codepoints of [str] as a list.
 *
 * The string's UTF-16 code units are handed to
 * [_utf16CodeUnitsToCodepoints] with its default arguments.
 */
List<int> stringToCodepoints(String str) {
  // A Dart string exposes 16-bit code units, not codepoints, so the units
  // have to be decoded before they can be returned.
  List<int> utf16Units = str.codeUnits;
  return _utf16CodeUnitsToCodepoints(utf16Units);
}
16
/**
 * Builds a string out of the given Unicode [codepoints].
 *
 * *Deprecated* Use [String.fromCharCodes] instead; this function simply
 * forwards to it.
 */
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);
25
/**
 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.
 *
 * Each call to [moveNext] consumes one code unit, or two when they form a
 * surrogate pair, and exposes the resulting codepoint through [current].
 * Code units that cannot be decoded (negative values, values above
 * UNICODE_PLANE_ONE_MAX, unpaired surrogates) yield [replacementCodepoint].
 * Set [replacementCodepoint] to null to throw an ArgumentError rather than
 * replace the bad value.
 *
 * An unpaired leading surrogate only consumes itself: the code unit that
 * follows it is pushed back and decoded by the next [moveNext] call, so a
 * well-formed unit is never lost because of the malformed unit before it.
 */
class Utf16CodeUnitDecoder implements Iterator<int> {
  final _ListRangeIterator utf16CodeUnitIterator;
  final int replacementCodepoint;
  int _current = null;

  /**
   * Decodes [length] code units of [utf16CodeUnits] starting at [offset].
   * The range handling itself is delegated to _ListRange.
   */
  Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
      int this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf16CodeUnitIterator =
          (new _ListRange(utf16CodeUnits, offset, length)).iterator;

  /**
   * Decodes the code units produced by an existing range iterator. Note that
   * [replacementCodepoint] has no default here and must be passed explicitly.
   */
  Utf16CodeUnitDecoder.fromListRangeIterator(
      _ListRangeIterator this.utf16CodeUnitIterator,
      int this.replacementCodepoint);

  Iterator<int> get iterator => this;

  /** The most recently decoded codepoint, or null before/after iteration. */
  int get current => _current;

  /**
   * Advances to the next codepoint. Returns false once the underlying code
   * units are exhausted. Throws an ArgumentError on malformed input when
   * [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    if (!utf16CodeUnitIterator.moveNext()) return false;

    int value = utf16CodeUnitIterator.current;
    if (value < 0) {
      _current = _replacementOrThrow();
    } else if (value < UNICODE_UTF16_RESERVED_LO ||
        (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {
      // Outside the reserved (surrogate) range: transfer directly.
      _current = value;
    } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
        utf16CodeUnitIterator.moveNext()) {
      // Leading surrogate with at least one more unit available.
      int nextValue = utf16CodeUnitIterator.current;
      if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
          nextValue <= UNICODE_UTF16_RESERVED_HI) {
        // Merge the surrogate pair into a single codepoint.
        _current = ((value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10) +
            UNICODE_UTF16_OFFSET +
            (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);
      } else {
        // The leading surrogate is unpaired. Whatever followed it is not part
        // of this (broken) sequence, so hand it back to the iterator instead
        // of silently dropping it; the next moveNext() decodes it on its own.
        utf16CodeUnitIterator.backup();
        _current = _replacementOrThrow();
      }
    } else {
      // Trailing surrogate without a leader, a leading surrogate at the very
      // end of the input, or a value above UNICODE_PLANE_ONE_MAX.
      _current = _replacementOrThrow();
    }
    return true;
  }

  /**
   * Returns [replacementCodepoint], or throws an ArgumentError naming the
   * current iterator position when no replacement is configured.
   */
  int _replacementOrThrow() {
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
    }
    return replacementCodepoint;
  }
}
98
/**
 * Encodes [length] codepoints of [codepoints], starting at [offset], as
 * UTF-16 code units.
 *
 * Codepoints above UNICODE_PLANE_ONE_MAX (up to UNICODE_VALID_RANGE_MAX) are
 * written as two code units; everything else takes one. A codepoint that is
 * negative, inside the reserved range, or above UNICODE_VALID_RANGE_MAX is
 * written as [replacementCodepoint], or triggers an ArgumentError when
 * [replacementCodepoint] is null.
 *
 * The result is a fixed-length list sized by a first counting pass.
 */
List<int> _codepointsToUtf16CodeUnits(
    List<int> codepoints,
    [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // True when [cp] can be stored in a single code unit as-is.
  bool fitsInOneUnit(int cp) =>
      (cp >= 0 && cp < UNICODE_UTF16_RESERVED_LO) ||
      (cp > UNICODE_UTF16_RESERVED_HI && cp <= UNICODE_PLANE_ONE_MAX);

  // True when [cp] has to be split into a surrogate pair.
  bool needsSurrogatePair(int cp) =>
      cp > UNICODE_PLANE_ONE_MAX && cp <= UNICODE_VALID_RANGE_MAX;

  _ListRange range = new _ListRange(codepoints, offset, length);

  // First pass: work out how many code units the output needs. Invalid
  // codepoints are counted as one unit (the slot for the replacement).
  int unitCount = 0;
  for (int cp in range) {
    if (fitsInOneUnit(cp)) {
      unitCount += 1;
    } else {
      unitCount += needsSurrogatePair(cp) ? 2 : 1;
    }
  }

  // Second pass: fill the fixed-length output.
  List<int> units = new List<int>(unitCount);
  int out = 0;
  for (int cp in range) {
    if (fitsInOneUnit(cp)) {
      units[out++] = cp;
      continue;
    }
    if (needsSurrogatePair(cp)) {
      int base = cp - UNICODE_UTF16_OFFSET;
      units[out++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE +
          ((base & UNICODE_UTF16_HI_MASK) >> 10);
      units[out++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE +
          (base & UNICODE_UTF16_LO_MASK);
      continue;
    }
    if (replacementCodepoint == null) {
      throw new ArgumentError("Invalid encoding");
    }
    units[out++] = replacementCodepoint;
  }
  return units;
}
143
/**
 * Decodes [length] UTF-16 code units of [utf16CodeUnits], starting at
 * [offset], into a list of codepoints.
 *
 * Decoding is done by a [Utf16CodeUnitDecoder]; [replacementCodepoint] is
 * forwarded to it unchanged.
 */
List<int> _utf16CodeUnitsToCodepoints(
    List<int> utf16CodeUnits, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRangeIterator units =
      (new _ListRange(utf16CodeUnits, offset, length)).iterator;
  Utf16CodeUnitDecoder decoder =
      new Utf16CodeUnitDecoder.fromListRangeIterator(
          units, replacementCodepoint);

  // Allocate for the worst case of one codepoint per code unit; surrogate
  // pairs make the real result shorter.
  List<int> buffer = new List<int>(units.remaining);
  int count = 0;
  for (; decoder.moveNext(); count++) {
    buffer[count] = decoder.current;
  }

  if (count != buffer.length) {
    // Copy into an exactly sized list so no unused tail slots are returned.
    List<int> exact = new List<int>(count);
    exact.setRange(0, count, buffer);
    return exact;
  }
  return buffer;
}
167
/**
 * Decodes the UTF-16 bytes as an iterable, so the consumer converts only as
 * much of the input as it actually reads. The byte order is taken from the
 * BOM, with big-endian as the default. This method always strips a leading
 * BOM. Set the [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace the bad value. The default value for
 * [replacementCodepoint] is U+FFFD.
 */
IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0,
    int length, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked by the iterable each time a new iterator is requested.
  _ListRangeIterator provideCodeUnits() => new Utf16BytesToCodeUnitsDecoder(
      bytes, offset, length, replacementCodepoint);

  return new IterableUtf16Decoder._(provideCodeUnits, replacementCodepoint);
}
183
/**
 * Decodes the UTF-16BE bytes as an iterable, so the consumer converts only
 * as much of the input as it actually reads. A leading BOM is stripped by
 * default; pass false for the optional parameter [stripBom] to keep it.
 * Set the [replacementCodepoint] to null to throw an ArgumentError rather
 * than replace the bad value. The default value for the
 * [replacementCodepoint] is U+FFFD.
 */
IterableUtf16Decoder decodeUtf16beAsIterable(List<int> bytes, [int offset = 0,
    int length, bool stripBom = true, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked by the iterable each time a new iterator is requested.
  _ListRangeIterator provideCodeUnits() => new Utf16beBytesToCodeUnitsDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);

  return new IterableUtf16Decoder._(provideCodeUnits, replacementCodepoint);
}
199
/**
 * Decodes the UTF-16LE bytes as an iterable, so the consumer converts only
 * as much of the input as it actually reads. A leading BOM is stripped by
 * default; pass false for the optional parameter [stripBom] to keep it.
 * Set the [replacementCodepoint] to null to throw an ArgumentError rather
 * than replace the bad value. The default value for the
 * [replacementCodepoint] is U+FFFD.
 */
IterableUtf16Decoder decodeUtf16leAsIterable(List<int> bytes, [int offset = 0,
    int length, bool stripBom = true, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked by the iterable each time a new iterator is requested.
  _ListRangeIterator provideCodeUnits() => new Utf16leBytesToCodeUnitsDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);

  return new IterableUtf16Decoder._(provideCodeUnits, replacementCodepoint);
}
215
/**
 * Produces a String from a sequence of UTF-16 encoded bytes. The byte order
 * is chosen by [Utf16BytesToCodeUnitsDecoder], and a leading BOM is always
 * stripped. Set the [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace the bad value. The default value for the
 * [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> UTF-16 code units.
  List<int> codeUnits = new Utf16BytesToCodeUnitsDecoder(
      bytes, offset, length, replacementCodepoint).decodeRest();
  // Step 2: code units -> codepoints (merging surrogate pairs).
  List<int> codepoints =
      _utf16CodeUnitsToCodepoints(codeUnits, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}
230
/**
 * Produces a String from a sequence of UTF-16BE encoded bytes. A leading BOM
 * is stripped by default; pass false for the optional parameter [stripBom]
 * to keep it. Set the [replacementCodepoint] to null to throw an
 * ArgumentError rather than replace the bad value. The default value for the
 * [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16be(List<int> bytes, [int offset = 0, int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> UTF-16 code units.
  Utf16beBytesToCodeUnitsDecoder byteDecoder =
      new Utf16beBytesToCodeUnitsDecoder(
          bytes, offset, length, stripBom, replacementCodepoint);
  List<int> codeUnits = byteDecoder.decodeRest();
  // Step 2: code units -> codepoints (merging surrogate pairs).
  List<int> codepoints =
      _utf16CodeUnitsToCodepoints(codeUnits, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}
246
/**
 * Produces a String from a sequence of UTF-16LE encoded bytes. A leading BOM
 * is stripped by default; pass false for the optional parameter [stripBom]
 * to keep it. Set the [replacementCodepoint] to null to throw an
 * ArgumentError rather than replace the bad value. The default value for the
 * [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16le(List<int> bytes, [int offset = 0, int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> UTF-16 code units.
  Utf16leBytesToCodeUnitsDecoder byteDecoder =
      new Utf16leBytesToCodeUnitsDecoder(
          bytes, offset, length, stripBom, replacementCodepoint);
  List<int> codeUnits = byteDecoder.decodeRest();
  // Step 2: code units -> codepoints (merging surrogate pairs).
  List<int> codepoints =
      _utf16CodeUnitsToCodepoints(codeUnits, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}
262
/**
 * Produces a list of UTF-16 encoded bytes for [str]. The output is
 * big-endian and starts with a big-endian byte-order marker; this is
 * [encodeUtf16be] with BOM writing switched on.
 */
List<int> encodeUtf16(String str) {
  return encodeUtf16be(str, true);
}
269
/**
 * Produces a list of UTF-16BE encoded bytes for [str]. No BOM is written
 * unless [writeBOM] is true, in which case the two BOM bytes come first.
 * Every code unit contributes two bytes, high byte first.
 */
List<int> encodeUtf16be(String str, [bool writeBOM = false]) {
  List<int> units = _stringToUtf16CodeUnits(str);
  int bomLength = writeBOM ? 2 : 0;
  List<int> bytes = new List<int>(2 * units.length + bomLength);
  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_HI;
    bytes[1] = UNICODE_UTF_BOM_LO;
  }
  for (int u = 0; u < units.length; u++) {
    int unit = units[u];
    int at = bomLength + 2 * u;
    bytes[at] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
    bytes[at + 1] = unit & UNICODE_BYTE_ZERO_MASK;
  }
  return bytes;
}
289
/**
 * Produces a list of UTF-16LE encoded bytes for [str]. No BOM is written
 * unless [writeBOM] is true, in which case the two BOM bytes (in
 * little-endian order) come first. Every code unit contributes two bytes,
 * low byte first.
 */
List<int> encodeUtf16le(String str, [bool writeBOM = false]) {
  List<int> units = _stringToUtf16CodeUnits(str);
  int bomLength = writeBOM ? 2 : 0;
  List<int> bytes = new List<int>(2 * units.length + bomLength);
  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_LO;
    bytes[1] = UNICODE_UTF_BOM_HI;
  }
  for (int u = 0; u < units.length; u++) {
    int unit = units[u];
    int at = bomLength + 2 * u;
    bytes[at] = unit & UNICODE_BYTE_ZERO_MASK;
    bytes[at + 1] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
  }
  return bytes;
}
309
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * UTF-16 byte-order marker (BOM) of either endianness.
 *
 * Despite the parameter's name, the bytes are checked for the UTF-16
 * markers only: the work is done by [hasUtf16beBom] and [hasUtf16leBom].
 */
bool hasUtf16Bom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  if (hasUtf16beBom(utf32EncodedBytes, offset, length)) {
    return true;
  }
  return hasUtf16leBom(utf32EncodedBytes, offset, length);
}
318
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * big-endian byte-order marker (BOM).
 *
 * When [length] is null the range runs to the end of the list. A range
 * shorter than two bytes never contains a BOM.
 */
bool hasUtf16beBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  int end = (length == null) ? utf16EncodedBytes.length : offset + length;
  if (offset + 2 > end) {
    // Not enough room for the two marker bytes.
    return false;
  }
  return utf16EncodedBytes[offset] == UNICODE_UTF_BOM_HI &&
      utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_LO;
}
329
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * little-endian byte-order marker (BOM).
 *
 * When [length] is null the range runs to the end of the list. A range
 * shorter than two bytes never contains a BOM.
 */
bool hasUtf16leBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  int end = (length == null) ? utf16EncodedBytes.length : offset + length;
  if (offset + 2 > end) {
    // Not enough room for the two marker bytes.
    return false;
  }
  return utf16EncodedBytes[offset] == UNICODE_UTF_BOM_LO &&
      utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI;
}
340
/**
 * Returns the UTF-16 code units that encode [str].
 *
 * The string is first expanded to Unicode codepoints via [String.runes] and
 * those are then encoded by [_codepointsToUtf16CodeUnits] with its default
 * replacement codepoint. Characters outside the Basic Multilingual Plane
 * therefore come out as a proper surrogate pair, while an unpaired surrogate
 * in [str] is still substituted by the encoder.
 */
List<int> _stringToUtf16CodeUnits(String str) {
  // Do not pass str.codeUnits here: those are already UTF-16 code units, not
  // codepoints. The encoder treats anything inside the reserved range
  // (presumably the surrogate block; confirm against the UNICODE_UTF16_*
  // constants) as invalid, so each half of a valid surrogate pair would be
  // replaced and every non-BMP character would be lost.
  return _codepointsToUtf16CodeUnits(str.runes.toList());
}
344
/**
 * Signature of a zero-argument callback that returns a fresh
 * [_ListRangeIterator]. [IterableUtf16Decoder] stores one and calls it every
 * time an iterator is requested.
 */
typedef _ListRangeIterator _CodeUnitsProvider();

/**
 * Return type of [decodeUtf16AsIterable] and variants. The Iterable hands
 * out a new iterator on demand, and that iterator translates bytes only as
 * its user asks for them. (Note: results are not cached.)
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf16Decoder extends IterableBase<int> {
  final _CodeUnitsProvider codeunitsProvider;
  final int replacementCodepoint;

  IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint);

  /**
   * Creates a new decoder over a freshly provided code unit source, so every
   * iteration starts again from the beginning of the input.
   */
  Utf16CodeUnitDecoder get iterator {
    _ListRangeIterator codeUnits = codeunitsProvider();
    return new Utf16CodeUnitDecoder.fromListRangeIterator(
        codeUnits, replacementCodepoint);
  }
}
364
/**
 * Converts UTF-16 encoded bytes to UTF-16 code units by grouping 1-2 bytes
 * to produce the code unit (0-(2^16)-1). The default factory relies on the
 * BOM to determine endian-ness, and falls back to BE when there is none.
 *
 * Subclasses supply [decode], which reads the two bytes of one code unit in
 * their byte order.
 */
abstract class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator {
  final _ListRangeIterator utf16EncodedBytesIterator;
  final int replacementCodepoint;
  int _current = null;

  Utf16BytesToCodeUnitsDecoder._fromListRangeIterator(
      this.utf16EncodedBytesIterator, this.replacementCodepoint);

  /**
   * Picks the big- or little-endian decoder from the BOM found at [offset].
   * A BOM that is present is excluded from the decoded range; without one
   * the whole range is decoded as big-endian.
   */
  factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
    if (length == null) {
      length = utf16EncodedBytes.length - offset;
    }

    if (hasUtf16beBom(utf16EncodedBytes, offset, length)) {
      // Step over the two BOM bytes ourselves, hence stripBom == false.
      return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          length - 2, false, replacementCodepoint);
    }
    if (hasUtf16leBom(utf16EncodedBytes, offset, length)) {
      return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          length - 2, false, replacementCodepoint);
    }
    return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset,
        length, false, replacementCodepoint);
  }

  /**
   * Decodes the rest of the source bytes in a single call and returns the
   * code units as a list. The working buffer is sized from [remaining] up
   * front and copied into an exactly sized list only if fewer code units
   * were produced than allocated.
   */
  List<int> decodeRest() {
    List<int> buffer = new List<int>(remaining);
    int filled = 0;
    for (; moveNext(); filled++) {
      buffer[filled] = current;
    }
    if (filled != buffer.length) {
      List<int> exact = new List<int>(filled);
      exact.setRange(0, filled, buffer);
      return exact;
    }
    return buffer;
  }

  /** The most recently decoded code unit, or null before/after iteration. */
  int get current => _current;

  /**
   * Advances to the next code unit. A single dangling byte at the end of the
   * input is consumed and reported as [replacementCodepoint], or as an
   * ArgumentError when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    int bytesLeft = utf16EncodedBytesIterator.remaining;
    if (bytesLeft == 0) {
      return false;
    }
    if (bytesLeft != 1) {
      _current = decode();
      return true;
    }

    // Exactly one byte left: it cannot form a code unit on its own.
    utf16EncodedBytesIterator.moveNext();
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF16 at ${utf16EncodedBytesIterator.position}");
    }
    _current = replacementCodepoint;
    return true;
  }

  /** Position in code units: half of the underlying byte position. */
  int get position => utf16EncodedBytesIterator.position ~/ 2;

  /** Moves back [by] code units, i.e. twice as many bytes. */
  void backup([int by = 1]) {
    utf16EncodedBytesIterator.backup(2 * by);
  }

  /** Code units left, counting a trailing odd byte as one more unit. */
  int get remaining => (utf16EncodedBytesIterator.remaining + 1) ~/ 2;

  /** Skips [count] code units, i.e. twice as many bytes. */
  void skip([int count = 1]) {
    utf16EncodedBytesIterator.skip(2 * count);
  }

  /** Reads the next two bytes and combines them into one code unit. */
  int decode();
}
453
/**
 * Converts UTF-16BE encoded bytes to UTF-16 code units by grouping 1-2 bytes
 * to produce the code unit (0-(2^16)-1). The first byte of each pair is the
 * high byte.
 */
class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Decodes [length] bytes of [utf16EncodedBytes] starting at [offset]. When
   * [stripBom] is true and the range starts with a big-endian BOM, the BOM
   * is skipped before decoding begins.
   */
  Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new _ListRange(utf16EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    bool bomToStrip =
        stripBom && hasUtf16beBom(utf16EncodedBytes, offset, length);
    if (bomToStrip) {
      // One code unit == the two BOM bytes.
      skip();
    }
  }

  int decode() {
    _ListRangeIterator bytes = utf16EncodedBytesIterator;
    bytes.moveNext();
    int hi = bytes.current;
    bytes.moveNext();
    int lo = bytes.current;
    return (hi << 8) + lo;
  }
}
478
/**
 * Converts UTF-16LE encoded bytes to UTF-16 code units by grouping 1-2 bytes
 * to produce the code unit (0-(2^16)-1). The first byte of each pair is the
 * low byte.
 */
class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Decodes [length] bytes of [utf16EncodedBytes] starting at [offset]. When
   * [stripBom] is true and the range starts with a little-endian BOM, the
   * BOM is skipped before decoding begins.
   */
  Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new _ListRange(utf16EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    bool bomToStrip =
        stripBom && hasUtf16leBom(utf16EncodedBytes, offset, length);
    if (bomToStrip) {
      // One code unit == the two BOM bytes.
      skip();
    }
  }

  int decode() {
    _ListRangeIterator bytes = utf16EncodedBytesIterator;
    bytes.moveNext();
    int lo = bytes.current;
    bytes.moveNext();
    int hi = bytes.current;
    return (hi << 8) + lo;
  }
}
OLDNEW
« no previous file with comments | « pkg/utf/lib/utf.dart ('k') | pkg/utf/lib/utf32.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698