OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 part of utf; | |
6 | |
// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
/**
 * Returns the Unicode codepoints that make up [str].
 *
 * [String.codeUnits] exposes 16-bit UTF-16 code units rather than codepoints,
 * so the units are passed through [utf16CodeUnitsToCodepoints] to convert
 * them.
 */
List<int> stringToCodepoints(String str) =>
    utf16CodeUnitsToCodepoints(str.codeUnits);
16 | |
/**
 * Builds a string out of the given Unicode [codepoints].
 *
 * *Deprecated* Use [String.fromCharCodes] instead; this function simply
 * forwards to it.
 */
@deprecated
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);
/**
 * Decodes UTF-16 [bytes] lazily. The result is an iterable, so the consumer
 * converts only as much of the input as it actually needs.
 *
 * The byte order is taken from a leading BOM; without one, big-endian is
 * assumed. A leading BOM is always stripped.
 *
 * Set [replacementCodepoint] to null to have an ArgumentError thrown instead
 * of replacing a bad value. The default [replacementCodepoint] is U+FFFD.
 */
IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0,
    int length, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // A fresh byte decoder is created each time the iterable is iterated.
  ListRangeIterator makeCodeUnits() => new Utf16BytesToCodeUnitsDecoder(
      bytes, offset, length, replacementCodepoint);
  return new IterableUtf16Decoder._(makeCodeUnits, replacementCodepoint);
}
41 | |
/**
 * Decodes UTF-16BE [bytes] lazily. The result is an iterable, so the consumer
 * converts only as much of the input as it actually needs.
 *
 * A leading BOM is stripped by default; pass false for [stripBom] to keep it.
 *
 * Set [replacementCodepoint] to null to have an ArgumentError thrown instead
 * of replacing a bad value. The default [replacementCodepoint] is U+FFFD.
 */
IterableUtf16Decoder decodeUtf16beAsIterable(List<int> bytes, [int offset = 0,
    int length, bool stripBom = true, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // A fresh byte decoder is created each time the iterable is iterated.
  ListRangeIterator makeCodeUnits() => new Utf16beBytesToCodeUnitsDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  return new IterableUtf16Decoder._(makeCodeUnits, replacementCodepoint);
}
57 | |
/**
 * Decodes UTF-16LE [bytes] lazily. The result is an iterable, so the consumer
 * converts only as much of the input as it actually needs.
 *
 * A leading BOM is stripped by default; pass false for [stripBom] to keep it.
 *
 * Set [replacementCodepoint] to null to have an ArgumentError thrown instead
 * of replacing a bad value. The default [replacementCodepoint] is U+FFFD.
 */
IterableUtf16Decoder decodeUtf16leAsIterable(List<int> bytes, [int offset = 0,
    int length, bool stripBom = true, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // A fresh byte decoder is created each time the iterable is iterated.
  ListRangeIterator makeCodeUnits() => new Utf16leBytesToCodeUnitsDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  return new IterableUtf16Decoder._(makeCodeUnits, replacementCodepoint);
}
73 | |
/**
 * Produces a String from a sequence of UTF-16 encoded [bytes].
 *
 * A leading BOM is always stripped. Set [replacementCodepoint] to null to
 * have an ArgumentError thrown instead of replacing a bad value. The default
 * [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> 16-bit code units.
  List<int> units = (new Utf16BytesToCodeUnitsDecoder(bytes, offset, length,
      replacementCodepoint)).decodeRest();
  // Step 2: code units -> codepoints, covering the whole unit list.
  List<int> codepoints =
      utf16CodeUnitsToCodepoints(units, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}
88 | |
/**
 * Produces a String from a sequence of UTF-16BE encoded [bytes].
 *
 * A leading BOM is stripped by default; pass false for [stripBom] to keep it.
 * Set [replacementCodepoint] to null to have an ArgumentError thrown instead
 * of replacing a bad value. The default [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16be(List<int> bytes, [int offset = 0, int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> 16-bit code units.
  Utf16beBytesToCodeUnitsDecoder byteDecoder =
      new Utf16beBytesToCodeUnitsDecoder(bytes, offset, length, stripBom,
          replacementCodepoint);
  List<int> units = byteDecoder.decodeRest();
  // Step 2: code units -> codepoints, covering the whole unit list.
  List<int> codepoints =
      utf16CodeUnitsToCodepoints(units, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}
104 | |
/**
 * Produces a String from a sequence of UTF-16LE encoded [bytes].
 *
 * A leading BOM is stripped by default; pass false for [stripBom] to keep it.
 * Set [replacementCodepoint] to null to have an ArgumentError thrown instead
 * of replacing a bad value. The default [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16le(List<int> bytes, [int offset = 0, int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> 16-bit code units.
  Utf16leBytesToCodeUnitsDecoder byteDecoder =
      new Utf16leBytesToCodeUnitsDecoder(bytes, offset, length, stripBom,
          replacementCodepoint);
  List<int> units = byteDecoder.decodeRest();
  // Step 2: code units -> codepoints, covering the whole unit list.
  List<int> codepoints =
      utf16CodeUnitsToCodepoints(units, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}
120 | |
/**
 * Produces a list of UTF-16 encoded bytes for [str].
 *
 * The output is big-endian and starts with a big-endian byte-order marker;
 * this is [encodeUtf16be] with BOM writing switched on.
 */
List<int> encodeUtf16(String str) {
  return encodeUtf16be(str, true);
}
127 | |
/**
 * Produces a list of UTF-16BE encoded bytes for [str].
 *
 * No BOM is written unless [writeBOM] is true, in which case the two BOM
 * bytes come first. Each code unit then contributes two bytes: its upper
 * byte followed by its lower byte.
 */
List<int> encodeUtf16be(String str, [bool writeBOM = false]) {
  List<int> units = _stringToUtf16CodeUnits(str);
  int bomLength = writeBOM ? 2 : 0;
  List<int> bytes = new List<int>(2 * units.length + bomLength);
  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_HI;
    bytes[1] = UNICODE_UTF_BOM_LO;
  }
  // Payload starts right after the (optional) BOM.
  int out = bomLength;
  for (int k = 0; k < units.length; k++) {
    int unit = units[k];
    bytes[out] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
    bytes[out + 1] = unit & UNICODE_BYTE_ZERO_MASK;
    out += 2;
  }
  return bytes;
}
147 | |
/**
 * Produces a list of UTF-16LE encoded bytes for [str].
 *
 * No BOM is written unless [writeBOM] is true, in which case the two BOM
 * bytes come first (in little-endian order). Each code unit then contributes
 * two bytes: its lower byte followed by its upper byte.
 */
List<int> encodeUtf16le(String str, [bool writeBOM = false]) {
  List<int> units = _stringToUtf16CodeUnits(str);
  int bomLength = writeBOM ? 2 : 0;
  List<int> bytes = new List<int>(2 * units.length + bomLength);
  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_LO;
    bytes[1] = UNICODE_UTF_BOM_HI;
  }
  // Payload starts right after the (optional) BOM.
  int out = bomLength;
  for (int k = 0; k < units.length; k++) {
    int unit = units[k];
    bytes[out] = unit & UNICODE_BYTE_ZERO_MASK;
    bytes[out + 1] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
    out += 2;
  }
  return bytes;
}
167 | |
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * UTF-16 byte-order marker (BOM) of either endianness.
 *
 * [length], when given, limits how many bytes from [offset] are considered.
 * Returns true if either [hasUtf16beBom] or [hasUtf16leBom] matches.
 */
// The parameter was previously named `utf32EncodedBytes`; it is positional,
// so the rename to match the sibling BOM checks does not affect callers.
bool hasUtf16Bom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  return hasUtf16beBom(utf16EncodedBytes, offset, length) ||
      hasUtf16leBom(utf16EncodedBytes, offset, length);
}
176 | |
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * big-endian byte-order marker (BOM).
 *
 * [length], when given, limits the inspected range to `offset + length`;
 * otherwise the range runs to the end of the list. Returns false when fewer
 * than two bytes are available in that range.
 */
bool hasUtf16beBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  int end = (length == null) ? utf16EncodedBytes.length : offset + length;
  // Need room for both BOM bytes before looking at them.
  if (offset + 2 > end) return false;
  if (utf16EncodedBytes[offset] != UNICODE_UTF_BOM_HI) return false;
  return utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_LO;
}
187 | |
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * little-endian byte-order marker (BOM).
 *
 * [length], when given, limits the inspected range to `offset + length`;
 * otherwise the range runs to the end of the list. Returns false when fewer
 * than two bytes are available in that range.
 */
bool hasUtf16leBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  int end = (length == null) ? utf16EncodedBytes.length : offset + length;
  // Need room for both BOM bytes before looking at them.
  if (offset + 2 > end) return false;
  if (utf16EncodedBytes[offset] != UNICODE_UTF_BOM_LO) return false;
  return utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI;
}
198 | |
/**
 * Converts [str] into the list of UTF-16 code units consumed by the encoders
 * in this file.
 *
 * [String.codeUnits] yields 16-bit UTF-16 code units, not codepoints, so the
 * string is first turned into codepoints with [stringToCodepoints]. That way
 * a surrogate pair reaches [codepointsToUtf16CodeUnits] as one codepoint
 * instead of as two separate surrogate values.
 */
// NOTE(review): codepointsToUtf16CodeUnits is defined outside this section;
// confirm how it treats surrogate-range input before relying on this note.
List<int> _stringToUtf16CodeUnits(String str) {
  return codepointsToUtf16CodeUnits(stringToCodepoints(str));
}
202 | |
/// Factory callback that yields a fresh iterator over UTF-16 code units.
typedef ListRangeIterator _CodeUnitsProvider();

/**
 * Return type of [decodeUtf16AsIterable] and variants. Each request for
 * [iterator] asks the provider for a new code-unit source and wraps it, and
 * the iterator translates bytes only as its user asks for them.
 * (Note: results are not cached.)
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf16Decoder extends IterableBase<int> {
  /// Supplies the code-unit iterator wrapped by each new [iterator].
  final _CodeUnitsProvider codeunitsProvider;

  /// Handed on to every [Utf16CodeUnitDecoder] created by [iterator].
  final int replacementCodepoint;

  IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint);

  Utf16CodeUnitDecoder get iterator {
    ListRangeIterator codeUnits = codeunitsProvider();
    return new Utf16CodeUnitDecoder.fromListRangeIterator(
        codeUnits, replacementCodepoint);
  }
}
222 | |
/**
 * Converts UTF-16 encoded bytes to UTF-16 code units by grouping 1-2 bytes
 * to produce the code unit (0-(2^16)-1). The default factory relies on a BOM
 * to determine endian-ness, and defaults to BE.
 *
 * Positions, counts and skips exposed by this class are expressed in code
 * units; they are translated to byte counts (two per unit) on the wrapped
 * byte iterator.
 */
abstract class Utf16BytesToCodeUnitsDecoder implements ListRangeIterator {
  // TODO(kevmoo): should this field be private?
  final ListRangeIterator utf16EncodedBytesIterator;
  final int replacementCodepoint;
  int _current = null;

  Utf16BytesToCodeUnitsDecoder._fromListRangeIterator(
      this.utf16EncodedBytesIterator, this.replacementCodepoint);

  /**
   * Picks the concrete decoder from the BOM found at [offset]: big-endian BOM
   * gives a BE decoder, little-endian BOM gives an LE decoder, and no BOM
   * falls back to BE. A detected BOM is excluded from the decoded range.
   */
  factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
    int byteCount =
        (length != null) ? length : utf16EncodedBytes.length - offset;
    if (hasUtf16beBom(utf16EncodedBytes, offset, byteCount)) {
      // Step over the two BOM bytes; nothing is left for the decoder to strip.
      return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          byteCount - 2, false, replacementCodepoint);
    }
    if (hasUtf16leBom(utf16EncodedBytes, offset, byteCount)) {
      return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          byteCount - 2, false, replacementCodepoint);
    }
    return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset,
        byteCount, false, replacementCodepoint);
  }

  /**
   * Provides a fast way to decode the rest of the source bytes in a single
   * call. This method trades memory for improved speed in that it potentially
   * over-allocates the List containing results.
   */
  List<int> decodeRest() {
    List<int> buffer = new List<int>(remaining);
    int count = 0;
    while (moveNext()) {
      buffer[count] = current;
      count++;
    }
    if (count != buffer.length) {
      // Fewer units than reserved: hand back an exactly-sized copy.
      List<int> trimmed = new List<int>(count);
      trimmed.setRange(0, count, buffer);
      return trimmed;
    }
    return buffer;
  }

  int get current => _current;

  /**
   * Advances to the next code unit. Returns false when no bytes are left.
   * A single dangling byte yields [replacementCodepoint], or throws an
   * ArgumentError when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    int bytesLeft = utf16EncodedBytesIterator.remaining;
    if (bytesLeft == 0) {
      return false;
    }
    if (bytesLeft != 1) {
      _current = decode();
      return true;
    }
    // Only half of a code unit remains: consume it, then replace or fail.
    utf16EncodedBytesIterator.moveNext();
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF16 at ${utf16EncodedBytesIterator.position}");
    }
    _current = replacementCodepoint;
    return true;
  }

  int get position => utf16EncodedBytesIterator.position ~/ 2;

  void backup([int by = 1]) {
    utf16EncodedBytesIterator.backup(2 * by);
  }

  // Rounded up so that a trailing odd byte still counts as one unit.
  int get remaining => (utf16EncodedBytesIterator.remaining + 1) ~/ 2;

  void skip([int count = 1]) {
    utf16EncodedBytesIterator.skip(2 * count);
  }

  /// Reads the next two bytes and combines them into one code unit.
  int decode();
}
312 | |
/**
 * Converts UTF-16BE encoded bytes to UTF-16 code units by grouping 1-2 bytes
 * to produce the code unit (0-(2^16)-1).
 */
class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Decodes [length] bytes of [utf16EncodedBytes] starting at [offset]. When
   * [stripBom] is true and the range starts with a big-endian BOM, the BOM is
   * skipped.
   */
  Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new ListRange(utf16EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    if (stripBom) {
      if (hasUtf16beBom(utf16EncodedBytes, offset, length)) {
        // One code unit, i.e. the two BOM bytes.
        skip();
      }
    }
  }

  /// Reads two bytes, most significant first, and joins them into one unit.
  int decode() {
    ListRangeIterator source = utf16EncodedBytesIterator;
    source.moveNext();
    int highByte = source.current;
    source.moveNext();
    int lowByte = source.current;
    return (highByte << 8) + lowByte;
  }
}
337 | |
/**
 * Converts UTF-16LE encoded bytes to UTF-16 code units by grouping 1-2 bytes
 * to produce the code unit (0-(2^16)-1).
 */
class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Decodes [length] bytes of [utf16EncodedBytes] starting at [offset]. When
   * [stripBom] is true and the range starts with a little-endian BOM, the BOM
   * is skipped.
   */
  Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new ListRange(utf16EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    if (stripBom) {
      if (hasUtf16leBom(utf16EncodedBytes, offset, length)) {
        // One code unit, i.e. the two BOM bytes.
        skip();
      }
    }
  }

  /// Reads two bytes, least significant first, and joins them into one unit.
  int decode() {
    ListRangeIterator source = utf16EncodedBytesIterator;
    source.moveNext();
    int lowByte = source.current;
    source.moveNext();
    int highByte = source.current;
    return (highByte << 8) + lowByte;
  }
}
OLD | NEW |