OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of utf; | 5 library utf.utf16; |
6 | 6 |
7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). | 7 import "dart:collection"; |
8 /** | 8 |
9 * Provide a list of Unicode codepoints for a given string. | 9 import 'constants.dart'; |
10 */ | 10 import 'list_range.dart'; |
11 List<int> stringToCodepoints(String str) { | 11 import 'utf_16_code_unit_decoder.dart'; |
12 // Note: str.codeUnits gives us 16-bit code units on all Dart implementations. | 12 import 'util.dart'; |
13 // So we need to convert. | |
14 return utf16CodeUnitsToCodepoints(str.codeUnits); | |
15 } | |
16 | 13 |
17 /** | 14 /** |
18 * Generate a string from the provided Unicode codepoints. | 15 * Generate a string from the provided Unicode codepoints. |
19 * | 16 * |
20 * *Deprecated* Use [String.fromCharCodes] instead. | 17 * *Deprecated* Use [String.fromCharCodes] instead. |
21 */ | 18 */ |
22 @deprecated | 19 @deprecated |
23 String codepointsToString(List<int> codepoints) { | 20 String codepointsToString(List<int> codepoints) { |
24 return new String.fromCharCodes(codepoints); | 21 return new String.fromCharCodes(codepoints); |
25 } | 22 } |
| 23 |
26 /** | 24 /** |
27 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert | 25 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert |
28 * as much of the input as needed. Determines the byte order from the BOM, | 26 * as much of the input as needed. Determines the byte order from the BOM, |
29 * or uses big-endian as a default. This method always strips a leading BOM. | 27 * or uses big-endian as a default. This method always strips a leading BOM. |
30 * Set the [replacementCodepoint] to null to throw an ArgumentError | 28 * Set the [replacementCodepoint] to null to throw an ArgumentError |
31 * rather than replace the bad value. The default value for | 29 * rather than replace the bad value. The default value for |
32 * [replacementCodepoint] is U+FFFD. | 30 * [replacementCodepoint] is U+FFFD. |
33 */ | 31 */ |
34 IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0, | 32 IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, |
35 int length, int replacementCodepoint = | 33 [int offset = 0, |
36 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 34 int length, |
| 35 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
37 return new IterableUtf16Decoder._( | 36 return new IterableUtf16Decoder._( |
38 () => new Utf16BytesToCodeUnitsDecoder(bytes, offset, length, | 37 () => new Utf16BytesToCodeUnitsDecoder( |
39 replacementCodepoint), replacementCodepoint); | 38 bytes, offset, length, replacementCodepoint), |
| 39 replacementCodepoint); |
40 } | 40 } |
41 | 41 |
42 /** | 42 /** |
43 * Decodes the UTF-16BE bytes as an iterable. Thus, the consumer can only | 43 * Decodes the UTF-16BE bytes as an iterable. Thus, the consumer can only |
44 * convert as much of the input as needed. This method strips a leading BOM by | 44 * convert as much of the input as needed. This method strips a leading BOM by |
45 * default, but can be overridden by setting the optional parameter [stripBom] | 45 * default, but can be overridden by setting the optional parameter [stripBom] |
46 * to false. Set the [replacementCodepoint] to null to throw an | 46 * to false. Set the [replacementCodepoint] to null to throw an |
47 * ArgumentError rather than replace the bad value. The default | 47 * ArgumentError rather than replace the bad value. The default |
48 * value for the [replacementCodepoint] is U+FFFD. | 48 * value for the [replacementCodepoint] is U+FFFD. |
49 */ | 49 */ |
50 IterableUtf16Decoder decodeUtf16beAsIterable(List<int> bytes, [int offset = 0, | 50 IterableUtf16Decoder decodeUtf16beAsIterable(List<int> bytes, |
51 int length, bool stripBom = true, int replacementCodepoint = | 51 [int offset = 0, |
52 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 52 int length, |
| 53 bool stripBom = true, |
| 54 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
53 return new IterableUtf16Decoder._( | 55 return new IterableUtf16Decoder._( |
54 () => new Utf16beBytesToCodeUnitsDecoder(bytes, offset, length, stripBom, | 56 () => new Utf16beBytesToCodeUnitsDecoder( |
55 replacementCodepoint), replacementCodepoint); | 57 bytes, offset, length, stripBom, replacementCodepoint), |
| 58 replacementCodepoint); |
56 } | 59 } |
57 | 60 |
58 /** | 61 /** |
59 * Decodes the UTF-16LE bytes as an iterable. Thus, the consumer can only | 62 * Decodes the UTF-16LE bytes as an iterable. Thus, the consumer can only |
60 * convert as much of the input as needed. This method strips a leading BOM by | 63 * convert as much of the input as needed. This method strips a leading BOM by |
61 * default, but can be overridden by setting the optional parameter [stripBom] | 64 * default, but can be overridden by setting the optional parameter [stripBom] |
62 * to false. Set the [replacementCodepoint] to null to throw an | 65 * to false. Set the [replacementCodepoint] to null to throw an |
63 * ArgumentError rather than replace the bad value. The default | 66 * ArgumentError rather than replace the bad value. The default |
64 * value for the [replacementCodepoint] is U+FFFD. | 67 * value for the [replacementCodepoint] is U+FFFD. |
65 */ | 68 */ |
66 IterableUtf16Decoder decodeUtf16leAsIterable(List<int> bytes, [int offset = 0, | 69 IterableUtf16Decoder decodeUtf16leAsIterable(List<int> bytes, |
67 int length, bool stripBom = true, int replacementCodepoint = | 70 [int offset = 0, |
68 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 71 int length, |
| 72 bool stripBom = true, |
| 73 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
69 return new IterableUtf16Decoder._( | 74 return new IterableUtf16Decoder._( |
70 () => new Utf16leBytesToCodeUnitsDecoder(bytes, offset, length, stripBom, | 75 () => new Utf16leBytesToCodeUnitsDecoder( |
71 replacementCodepoint), replacementCodepoint); | 76 bytes, offset, length, stripBom, replacementCodepoint), |
| 77 replacementCodepoint); |
72 } | 78 } |
73 | 79 |
74 /** | 80 /** |
75 * Produce a String from a sequence of UTF-16 encoded bytes. This method always | 81 * Produce a String from a sequence of UTF-16 encoded bytes. This method always |
76 * strips a leading BOM. Set the [replacementCodepoint] to null to throw an | 82 * strips a leading BOM. Set the [replacementCodepoint] to null to throw an |
77 * ArgumentError rather than replace the bad value. The default | 83 * ArgumentError rather than replace the bad value. The default |
78 * value for the [replacementCodepoint] is U+FFFD. | 84 * value for the [replacementCodepoint] is U+FFFD. |
79 */ | 85 */ |
80 String decodeUtf16(List<int> bytes, [int offset = 0, int length, | 86 String decodeUtf16(List<int> bytes, |
| 87 [int offset = 0, |
| 88 int length, |
81 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 89 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
82 Utf16BytesToCodeUnitsDecoder decoder = new Utf16BytesToCodeUnitsDecoder(bytes, | 90 Utf16BytesToCodeUnitsDecoder decoder = new Utf16BytesToCodeUnitsDecoder( |
83 offset, length, replacementCodepoint); | 91 bytes, offset, length, replacementCodepoint); |
84 List<int> codeunits = decoder.decodeRest(); | 92 List<int> codeunits = decoder.decodeRest(); |
85 return new String.fromCharCodes( | 93 return new String.fromCharCodes( |
86 utf16CodeUnitsToCodepoints(codeunits, 0, null, replacementCodepoint)); | 94 utf16CodeUnitsToCodepoints(codeunits, 0, null, replacementCodepoint)); |
87 } | 95 } |
88 | 96 |
89 /** | 97 /** |
90 * Produce a String from a sequence of UTF-16BE encoded bytes. This method | 98 * Produce a String from a sequence of UTF-16BE encoded bytes. This method |
91 * strips a leading BOM by default, but can be overridden by setting the | 99 * strips a leading BOM by default, but can be overridden by setting the |
92 * optional parameter [stripBom] to false. Set the [replacementCodepoint] to | 100 * optional parameter [stripBom] to false. Set the [replacementCodepoint] to |
93 * null to throw an ArgumentError rather than replace the bad value. | 101 * null to throw an ArgumentError rather than replace the bad value. |
94 * The default value for the [replacementCodepoint] is U+FFFD. | 102 * The default value for the [replacementCodepoint] is U+FFFD. |
95 */ | 103 */ |
96 String decodeUtf16be(List<int> bytes, [int offset = 0, int length, | 104 String decodeUtf16be(List<int> bytes, |
| 105 [int offset = 0, |
| 106 int length, |
97 bool stripBom = true, | 107 bool stripBom = true, |
98 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 108 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
99 List<int> codeunits = (new Utf16beBytesToCodeUnitsDecoder(bytes, offset, | 109 List<int> codeunits = (new Utf16beBytesToCodeUnitsDecoder( |
100 length, stripBom, replacementCodepoint)).decodeRest(); | 110 bytes, offset, length, stripBom, replacementCodepoint)) |
| 111 .decodeRest(); |
101 return new String.fromCharCodes( | 112 return new String.fromCharCodes( |
102 utf16CodeUnitsToCodepoints(codeunits, 0, null, replacementCodepoint)); | 113 utf16CodeUnitsToCodepoints(codeunits, 0, null, replacementCodepoint)); |
103 } | 114 } |
104 | 115 |
105 /** | 116 /** |
106 * Produce a String from a sequence of UTF-16LE encoded bytes. This method | 117 * Produce a String from a sequence of UTF-16LE encoded bytes. This method |
107 * strips a leading BOM by default, but can be overridden by setting the | 118 * strips a leading BOM by default, but can be overridden by setting the |
108 * optional parameter [stripBom] to false. Set the [replacementCodepoint] to | 119 * optional parameter [stripBom] to false. Set the [replacementCodepoint] to |
109 * null to throw an ArgumentError rather than replace the bad value. | 120 * null to throw an ArgumentError rather than replace the bad value. |
110 * The default value for the [replacementCodepoint] is U+FFFD. | 121 * The default value for the [replacementCodepoint] is U+FFFD. |
111 */ | 122 */ |
112 String decodeUtf16le(List<int> bytes, [int offset = 0, int length, | 123 String decodeUtf16le(List<int> bytes, |
| 124 [int offset = 0, |
| 125 int length, |
113 bool stripBom = true, | 126 bool stripBom = true, |
114 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 127 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
115 List<int> codeunits = (new Utf16leBytesToCodeUnitsDecoder(bytes, offset, | 128 List<int> codeunits = (new Utf16leBytesToCodeUnitsDecoder( |
116 length, stripBom, replacementCodepoint)).decodeRest(); | 129 bytes, offset, length, stripBom, replacementCodepoint)) |
| 130 .decodeRest(); |
117 return new String.fromCharCodes( | 131 return new String.fromCharCodes( |
118 utf16CodeUnitsToCodepoints(codeunits, 0, null, replacementCodepoint)); | 132 utf16CodeUnitsToCodepoints(codeunits, 0, null, replacementCodepoint)); |
119 } | 133 } |
120 | 134 |
121 /** | 135 /** |
122 * Produce a list of UTF-16 encoded bytes. This method prefixes the resulting | 136 * Produce a list of UTF-16 encoded bytes. This method prefixes the resulting |
123 * bytes with a big-endian byte-order-marker. | 137 * bytes with a big-endian byte-order-marker. |
124 */ | 138 */ |
125 List<int> encodeUtf16(String str) => | 139 List<int> encodeUtf16(String str) => encodeUtf16be(str, true); |
126 encodeUtf16be(str, true); | |
127 | 140 |
128 /** | 141 /** |
129 * Produce a list of UTF-16BE encoded bytes. By default, this method produces | 142 * Produce a list of UTF-16BE encoded bytes. By default, this method produces |
130 * UTF-16BE bytes with no BOM. | 143 * UTF-16BE bytes with no BOM. |
131 */ | 144 */ |
132 List<int> encodeUtf16be(String str, [bool writeBOM = false]) { | 145 List<int> encodeUtf16be(String str, [bool writeBOM = false]) { |
133 List<int> utf16CodeUnits = _stringToUtf16CodeUnits(str); | 146 List<int> utf16CodeUnits = _stringToUtf16CodeUnits(str); |
134 List<int> encoding = | 147 List<int> encoding = |
135 new List<int>(2 * utf16CodeUnits.length + (writeBOM ? 2 : 0)); | 148 new List<int>(2 * utf16CodeUnits.length + (writeBOM ? 2 : 0)); |
136 int i = 0; | 149 int i = 0; |
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
209 */ | 222 */ |
210 // TODO(floitsch): Consider removing the extend and switch to implements since | 223 // TODO(floitsch): Consider removing the extend and switch to implements since |
211 // that's cheaper to allocate. | 224 // that's cheaper to allocate. |
212 class IterableUtf16Decoder extends IterableBase<int> { | 225 class IterableUtf16Decoder extends IterableBase<int> { |
213 final _CodeUnitsProvider codeunitsProvider; | 226 final _CodeUnitsProvider codeunitsProvider; |
214 final int replacementCodepoint; | 227 final int replacementCodepoint; |
215 | 228 |
216 IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint); | 229 IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint); |
217 | 230 |
218 Utf16CodeUnitDecoder get iterator => | 231 Utf16CodeUnitDecoder get iterator => |
219 new Utf16CodeUnitDecoder.fromListRangeIterator(codeunitsProvider(), | 232 new Utf16CodeUnitDecoder.fromListRangeIterator( |
220 replacementCodepoint); | 233 codeunitsProvider(), replacementCodepoint); |
221 } | 234 } |
222 | 235 |
223 /** | 236 /** |
224 * Convert UTF-16 encoded bytes to UTF-16 code units by grouping 1-2 bytes | 237 * Convert UTF-16 encoded bytes to UTF-16 code units by grouping 1-2 bytes |
225 * to produce the code unit (0-(2^16)-1). Relies on BOM to determine | 238 * to produce the code unit (0-(2^16)-1). Relies on BOM to determine |
226 * endian-ness, and defaults to BE. | 239 * endian-ness, and defaults to BE. |
227 */ | 240 */ |
228 abstract class Utf16BytesToCodeUnitsDecoder implements ListRangeIterator { | 241 abstract class Utf16BytesToCodeUnitsDecoder implements ListRangeIterator { |
229 // TODO(kevmoo): should this field be private? | 242 // TODO(kevmoo): should this field be private? |
230 final ListRangeIterator utf16EncodedBytesIterator; | 243 final ListRangeIterator utf16EncodedBytesIterator; |
231 final int replacementCodepoint; | 244 final int replacementCodepoint; |
232 int _current = null; | 245 int _current = null; |
233 | 246 |
234 Utf16BytesToCodeUnitsDecoder._fromListRangeIterator( | 247 Utf16BytesToCodeUnitsDecoder._fromListRangeIterator( |
235 this.utf16EncodedBytesIterator, this.replacementCodepoint); | 248 this.utf16EncodedBytesIterator, this.replacementCodepoint); |
236 | 249 |
237 factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [ | 250 factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, |
238 int offset = 0, int length, | 251 [int offset = 0, |
| 252 int length, |
239 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 253 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
240 if (length == null) { | 254 if (length == null) { |
241 length = utf16EncodedBytes.length - offset; | 255 length = utf16EncodedBytes.length - offset; |
242 } | 256 } |
243 if (hasUtf16beBom(utf16EncodedBytes, offset, length)) { | 257 if (hasUtf16beBom(utf16EncodedBytes, offset, length)) { |
244 return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2, | 258 return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2, |
245 length - 2, false, replacementCodepoint); | 259 length - 2, false, replacementCodepoint); |
246 } else if (hasUtf16leBom(utf16EncodedBytes, offset, length)) { | 260 } else if (hasUtf16leBom(utf16EncodedBytes, offset, length)) { |
247 return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2, | 261 return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2, |
248 length - 2, false, replacementCodepoint); | 262 length - 2, false, replacementCodepoint); |
249 } else { | 263 } else { |
250 return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset, | 264 return new Utf16beBytesToCodeUnitsDecoder( |
251 length, false, replacementCodepoint); | 265 utf16EncodedBytes, offset, length, false, replacementCodepoint); |
252 } | 266 } |
253 } | 267 } |
254 | 268 |
255 /** | 269 /** |
256 * Provides a fast way to decode the rest of the source bytes in a single | 270 * Provides a fast way to decode the rest of the source bytes in a single |
257 * call. This method trades memory for improved speed in that it potentially | 271 * call. This method trades memory for improved speed in that it potentially |
258 * over-allocates the List containing results. | 272 * over-allocates the List containing results. |
259 */ | 273 */ |
260 List<int> decodeRest() { | 274 List<int> decodeRest() { |
261 List<int> codeunits = new List<int>(remaining); | 275 List<int> codeunits = new List<int>(remaining); |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
308 } | 322 } |
309 | 323 |
310 int decode(); | 324 int decode(); |
311 } | 325 } |
312 | 326 |
313 /** | 327 /** |
314 * Convert UTF-16BE encoded bytes to utf16 code units by grouping 1-2 bytes | 328 * Convert UTF-16BE encoded bytes to utf16 code units by grouping 1-2 bytes |
315 * to produce the code unit (0-(2^16)-1). | 329 * to produce the code unit (0-(2^16)-1). |
316 */ | 330 */ |
317 class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder { | 331 class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder { |
318 Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [ | 332 Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, |
319 int offset = 0, int length, bool stripBom = true, | 333 [int offset = 0, |
320 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 334 int length, |
321 super._fromListRangeIterator( | 335 bool stripBom = true, |
322 (new ListRange(utf16EncodedBytes, offset, length)).iterator, | 336 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) |
323 replacementCodepoint) { | 337 : super._fromListRangeIterator( |
| 338 (new ListRange(utf16EncodedBytes, offset, length)).iterator, |
| 339 replacementCodepoint) { |
324 if (stripBom && hasUtf16beBom(utf16EncodedBytes, offset, length)) { | 340 if (stripBom && hasUtf16beBom(utf16EncodedBytes, offset, length)) { |
325 skip(); | 341 skip(); |
326 } | 342 } |
327 } | 343 } |
328 | 344 |
329 int decode() { | 345 int decode() { |
330 utf16EncodedBytesIterator.moveNext(); | 346 utf16EncodedBytesIterator.moveNext(); |
331 int hi = utf16EncodedBytesIterator.current; | 347 int hi = utf16EncodedBytesIterator.current; |
332 utf16EncodedBytesIterator.moveNext(); | 348 utf16EncodedBytesIterator.moveNext(); |
333 int lo = utf16EncodedBytesIterator.current; | 349 int lo = utf16EncodedBytesIterator.current; |
334 return (hi << 8) + lo; | 350 return (hi << 8) + lo; |
335 } | 351 } |
336 } | 352 } |
337 | 353 |
338 /** | 354 /** |
339 * Convert UTF-16LE encoded bytes to utf16 code units by grouping 1-2 bytes | 355 * Convert UTF-16LE encoded bytes to utf16 code units by grouping 1-2 bytes |
340 * to produce the code unit (0-(2^16)-1). | 356 * to produce the code unit (0-(2^16)-1). |
341 */ | 357 */ |
342 class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder { | 358 class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder { |
343 Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [ | 359 Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, |
344 int offset = 0, int length, bool stripBom = true, | 360 [int offset = 0, |
345 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 361 int length, |
346 super._fromListRangeIterator( | 362 bool stripBom = true, |
347 (new ListRange(utf16EncodedBytes, offset, length)).iterator, | 363 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) |
348 replacementCodepoint) { | 364 : super._fromListRangeIterator( |
| 365 (new ListRange(utf16EncodedBytes, offset, length)).iterator, |
| 366 replacementCodepoint) { |
349 if (stripBom && hasUtf16leBom(utf16EncodedBytes, offset, length)) { | 367 if (stripBom && hasUtf16leBom(utf16EncodedBytes, offset, length)) { |
350 skip(); | 368 skip(); |
351 } | 369 } |
352 } | 370 } |
353 | 371 |
354 int decode() { | 372 int decode() { |
355 utf16EncodedBytesIterator.moveNext(); | 373 utf16EncodedBytesIterator.moveNext(); |
356 int lo = utf16EncodedBytesIterator.current; | 374 int lo = utf16EncodedBytesIterator.current; |
357 utf16EncodedBytesIterator.moveNext(); | 375 utf16EncodedBytesIterator.moveNext(); |
358 int hi = utf16EncodedBytesIterator.current; | 376 int hi = utf16EncodedBytesIterator.current; |
359 return (hi << 8) + lo; | 377 return (hi << 8) + lo; |
360 } | 378 } |
361 } | 379 } |
OLD | NEW |