OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 part of utf; | |
6 | |
/**
 * Returns a lazily-decoding iterable over the UTF-32 encoded [bytes], so the
 * consumer only decodes as much of the input as it actually iterates.
 *
 * Every call to `iterator` builds a fresh [Utf32BytesDecoder] over the range
 * described by [offset] and [length]. That decoder picks the byte order from
 * a leading BOM (which it strips), and falls back to big-endian when no BOM
 * is present.
 *
 * Set [replacementCodepoint] to null to get an ArgumentError on malformed
 * input instead of a replacement value.
 */
IterableUtf32Decoder decodeUtf32AsIterable(List<int> bytes, [
    int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Deferred factory: nothing is decoded until an iterator is requested.
  Utf32BytesDecoder createDecoder() {
    return new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint);
  }

  return new IterableUtf32Decoder._(createDecoder);
}
20 | |
/**
 * Returns a lazily-decoding iterable over the UTF-32BE encoded [bytes], so
 * the consumer only decodes as much of the input as it actually iterates.
 *
 * A leading big-endian BOM is stripped by default; pass false for [stripBom]
 * to keep it. Set [replacementCodepoint] to null to get an ArgumentError on
 * malformed input instead of a replacement value.
 */
IterableUtf32Decoder decodeUtf32beAsIterable(List<int> bytes, [
    int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Deferred factory: nothing is decoded until an iterator is requested.
  Utf32BytesDecoder createDecoder() {
    return new Utf32beBytesDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }

  return new IterableUtf32Decoder._(createDecoder);
}
35 | |
/**
 * Returns a lazily-decoding iterable over the UTF-32LE encoded [bytes], so
 * the consumer only decodes as much of the input as it actually iterates.
 *
 * A leading little-endian BOM is stripped by default; pass false for
 * [stripBom] to keep it. Set [replacementCodepoint] to null to get an
 * ArgumentError on malformed input instead of a replacement value.
 */
IterableUtf32Decoder decodeUtf32leAsIterable(List<int> bytes, [
    int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Deferred factory: nothing is decoded until an iterator is requested.
  Utf32BytesDecoder createDecoder() {
    return new Utf32leBytesDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }

  return new IterableUtf32Decoder._(createDecoder);
}
50 | |
/**
 * Produces a String from a sequence of UTF-32 encoded bytes.
 *
 * [offset] and [length] select the part of [bytes] to decode. The byte order
 * is chosen by [Utf32BytesDecoder]: taken from a leading BOM when present,
 * big-endian otherwise. [replacementCodepoint] overrides the default Unicode
 * replacement character; set it to null to throw an ArgumentError rather
 * than replace a bad value.
 */
String decodeUtf32(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  final Utf32BytesDecoder decoder =
      new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint);
  final List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
/**
 * Produces a String from a sequence of UTF-32BE encoded bytes.
 *
 * [offset] and [length] select the part of [bytes] to decode. When [stripBom]
 * is true (the default) a leading big-endian BOM is skipped.
 * [replacementCodepoint] overrides the default Unicode replacement character;
 * set it to null to throw an ArgumentError rather than replace a bad value.
 */
String decodeUtf32be(
    List<int> bytes, [int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  final Utf32beBytesDecoder decoder = new Utf32beBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  final List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
75 | |
/**
 * Produces a String from a sequence of UTF-32LE encoded bytes.
 *
 * [offset] and [length] select the part of [bytes] to decode. When [stripBom]
 * is true (the default) a leading little-endian BOM is skipped.
 * [replacementCodepoint] overrides the default Unicode replacement character;
 * set it to null to throw an ArgumentError rather than replace a bad value.
 */
String decodeUtf32le(
    List<int> bytes, [int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  final Utf32leBytesDecoder decoder = new Utf32leBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  final List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
88 | |
/**
 * Produces a list of UTF-32 encoded bytes for [str]. The result is
 * big-endian and is prefixed with a big-endian byte-order marker.
 */
List<int> encodeUtf32(String str) {
  // Plain "UTF-32" output is UTF-32BE with the BOM switched on.
  return encodeUtf32be(str, true);
}
95 | |
/**
 * Produces a list of UTF-32BE encoded bytes for [str]. By default no BOM is
 * written; pass true for [writeBOM] to prefix the output with a big-endian
 * byte-order marker.
 */
List<int> encodeUtf32be(String str, [bool writeBOM = false]) {
  final List<int> codepoints = stringToCodepoints(str);
  final int bomLength = writeBOM ? 4 : 0;
  final List<int> bytes = new List<int>(4 * codepoints.length + bomLength);
  int pos = 0;
  if (writeBOM) {
    // Big-endian BOM: two zero bytes followed by the marker's high/low bytes.
    bytes[pos++] = 0;
    bytes[pos++] = 0;
    bytes[pos++] = UNICODE_UTF_BOM_HI;
    bytes[pos++] = UNICODE_UTF_BOM_LO;
  }
  for (int codepoint in codepoints) {
    // Most significant byte first.
    for (int shift = 24; shift >= 0; shift -= 8) {
      bytes[pos++] = (codepoint >> shift) & UNICODE_BYTE_ZERO_MASK;
    }
  }
  return bytes;
}
119 | |
/**
 * Produces a list of UTF-32LE encoded bytes for [str]. By default no BOM is
 * written; pass true for [writeBOM] to prefix the output with a
 * little-endian byte-order marker.
 */
List<int> encodeUtf32le(String str, [bool writeBOM = false]) {
  final List<int> codepoints = stringToCodepoints(str);
  final int bomLength = writeBOM ? 4 : 0;
  final List<int> bytes = new List<int>(4 * codepoints.length + bomLength);
  int pos = 0;
  if (writeBOM) {
    // Little-endian BOM: the marker's low/high bytes followed by two zeros.
    bytes[pos++] = UNICODE_UTF_BOM_LO;
    bytes[pos++] = UNICODE_UTF_BOM_HI;
    bytes[pos++] = 0;
    bytes[pos++] = 0;
  }
  for (int codepoint in codepoints) {
    // Least significant byte first.
    for (int shift = 0; shift <= 24; shift += 8) {
      bytes[pos++] = (codepoint >> shift) & UNICODE_BYTE_ZERO_MASK;
    }
  }
  return bytes;
}
143 | |
/**
 * Reports whether the bytes starting at [offset] begin with a UTF-32
 * byte-order marker (BOM) of either endianness.
 */
bool hasUtf32Bom(
    List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  if (hasUtf32beBom(utf32EncodedBytes, offset, length)) {
    return true;
  }
  return hasUtf32leBom(utf32EncodedBytes, offset, length);
}
153 | |
/**
 * Reports whether the bytes starting at [offset] begin with a big-endian
 * UTF-32 byte-order marker (BOM). When [length] is given, the marker must fit
 * inside the first [length] bytes after [offset]; otherwise it must fit
 * before the end of the list.
 */
bool hasUtf32beBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  final int end =
      (length == null) ? utf32EncodedBytes.length : offset + length;
  if (offset + 4 > end) {
    // Fewer than four bytes in range: there is no room for a BOM.
    return false;
  }
  return utf32EncodedBytes[offset] == 0 &&
      utf32EncodedBytes[offset + 1] == 0 &&
      utf32EncodedBytes[offset + 2] == UNICODE_UTF_BOM_HI &&
      utf32EncodedBytes[offset + 3] == UNICODE_UTF_BOM_LO;
}
165 | |
/**
 * Reports whether the bytes starting at [offset] begin with a little-endian
 * UTF-32 byte-order marker (BOM). When [length] is given, the marker must fit
 * inside the first [length] bytes after [offset]; otherwise it must fit
 * before the end of the list.
 */
bool hasUtf32leBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  final int end =
      (length == null) ? utf32EncodedBytes.length : offset + length;
  if (offset + 4 > end) {
    // Fewer than four bytes in range: there is no room for a BOM.
    return false;
  }
  return utf32EncodedBytes[offset] == UNICODE_UTF_BOM_LO &&
      utf32EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI &&
      utf32EncodedBytes[offset + 2] == 0 &&
      utf32EncodedBytes[offset + 3] == 0;
}
177 | |
/**
 * Signature of a zero-argument factory that returns a [Utf32BytesDecoder].
 * [IterableUtf32Decoder] invokes one each time its iterator is requested.
 */
typedef Utf32BytesDecoder Utf32BytesDecoderProvider();
179 | |
/**
 * Return type of [decodeUtf32AsIterable] and variants. The iterable creates a
 * new decoder every time an iterator is requested, and that decoder only
 * translates bytes as the user of the iterator asks for them.
 * (Note: results are not cached.)
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf32Decoder extends IterableBase<int> {
  /** Factory invoked on every [iterator] access to build a fresh decoder. */
  final Utf32BytesDecoderProvider codeunitsProvider;

  IterableUtf32Decoder._(this.codeunitsProvider);

  /** A new decoder obtained from [codeunitsProvider]. */
  Utf32BytesDecoder get iterator {
    return codeunitsProvider();
  }
}
194 | |
/**
 * Abstract parent class that converts UTF-32 encoded bytes to codepoints.
 *
 * Subclasses supply [decode], which reads the next four bytes from
 * [utf32EncodedBytesIterator] in their byte order. Positions and counts
 * exposed by this class are measured in codepoints (groups of four bytes).
 */
abstract class Utf32BytesDecoder implements ListRangeIterator {
  // TODO(kevmoo): should this field be private?
  final ListRangeIterator utf32EncodedBytesIterator;

  /** Value emitted for malformed input; null means "throw instead". */
  final int replacementCodepoint;

  int _current;

  Utf32BytesDecoder._fromListRangeIterator(
      this.utf32EncodedBytesIterator, this.replacementCodepoint);

  /**
   * Creates a decoder whose byte order is taken from a leading BOM. The BOM,
   * when present, is excluded from the decoded range. Without a BOM the
   * bytes are treated as big-endian.
   */
  factory Utf32BytesDecoder(List<int> utf32EncodedBytes, [
      int offset = 0, int length,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
    final int byteCount =
        (length == null) ? utf32EncodedBytes.length - offset : length;

    if (hasUtf32beBom(utf32EncodedBytes, offset, byteCount)) {
      // Step over the 4-byte BOM ourselves, so the decoder must not strip.
      return new Utf32beBytesDecoder(utf32EncodedBytes, offset + 4,
          byteCount - 4, false, replacementCodepoint);
    }
    if (hasUtf32leBom(utf32EncodedBytes, offset, byteCount)) {
      return new Utf32leBytesDecoder(utf32EncodedBytes, offset + 4,
          byteCount - 4, false, replacementCodepoint);
    }
    // No BOM: default to big-endian.
    return new Utf32beBytesDecoder(utf32EncodedBytes, offset, byteCount, false,
        replacementCodepoint);
  }

  /** Decodes everything that is left and returns it as a list. */
  List<int> decodeRest() {
    final List<int> decoded = new List<int>(remaining);
    for (int index = 0; moveNext(); index++) {
      decoded[index] = current;
    }
    return decoded;
  }

  int get current => _current;

  /**
   * Advances to the next codepoint. Returns false once the input is
   * exhausted. A trailing group of fewer than four bytes, or a decoded value
   * rejected by the validity check, yields [replacementCodepoint]; when that
   * is null an ArgumentError is thrown instead.
   */
  bool moveNext() {
    _current = null;
    final int bytesLeft = utf32EncodedBytesIterator.remaining;
    if (bytesLeft == 0) {
      return false;
    }

    int codepoint;
    bool wellFormed = false;
    if (bytesLeft < 4) {
      // Truncated final group: consume it so iteration can terminate.
      utf32EncodedBytesIterator.skip(bytesLeft);
    } else {
      codepoint = decode();
      wellFormed = _validCodepoint(codepoint);
    }

    if (wellFormed) {
      _current = codepoint;
      return true;
    }
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF32 at ${utf32EncodedBytesIterator.position}");
    }
    _current = replacementCodepoint;
    return true;
  }

  /** Position in codepoints: the underlying byte position divided by four. */
  int get position {
    return utf32EncodedBytesIterator.position ~/ 4;
  }

  /** Moves back [by] codepoints, i.e. four bytes each. */
  void backup([int by = 1]) {
    utf32EncodedBytesIterator.backup(4 * by);
  }

  /**
   * Number of codepoints still to be produced. Rounded up, so a trailing
   * partial group counts as one.
   */
  int get remaining {
    return (utf32EncodedBytesIterator.remaining + 3) ~/ 4;
  }

  /** Moves forward [count] codepoints, i.e. four bytes each. */
  void skip([int count = 1]) {
    utf32EncodedBytesIterator.skip(4 * count);
  }

  /** Reads the next four bytes and combines them into one value. */
  int decode();
}
280 | |
/**
 * Converts UTF-32BE encoded bytes to codepoints by grouping 4 bytes,
 * most significant byte first, to produce the unicode codepoint.
 */
class Utf32beBytesDecoder extends Utf32BytesDecoder {
  /**
   * Decodes the range of [utf32EncodedBytes] given by [offset] and [length].
   * When [stripBom] is true and the range starts with a big-endian BOM, the
   * BOM is skipped.
   */
  Utf32beBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0,
      int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : super._fromListRangeIterator(
            (new ListRange(utf32EncodedBytes, offset, length)).iterator,
            replacementCodepoint) {
    if (stripBom && hasUtf32beBom(utf32EncodedBytes, offset, length)) {
      // One skip() advances a whole 4-byte group, which is the BOM here.
      skip();
    }
  }

  int decode() {
    utf32EncodedBytesIterator.moveNext();
    int codepoint = utf32EncodedBytesIterator.current;
    // Shift in the three remaining bytes, each less significant than the last.
    for (int i = 0; i < 3; i++) {
      utf32EncodedBytesIterator.moveNext();
      codepoint = (codepoint << 8) + utf32EncodedBytesIterator.current;
    }
    return codepoint;
  }
}
309 | |
/**
 * Converts UTF-32LE encoded bytes to codepoints by grouping 4 bytes,
 * least significant byte first, to produce the unicode codepoint.
 */
class Utf32leBytesDecoder extends Utf32BytesDecoder {
  /**
   * Decodes the range of [utf32EncodedBytes] given by [offset] and [length].
   * When [stripBom] is true and the range starts with a little-endian BOM,
   * the BOM is skipped.
   */
  Utf32leBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0,
      int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : super._fromListRangeIterator(
            (new ListRange(utf32EncodedBytes, offset, length)).iterator,
            replacementCodepoint) {
    if (stripBom && hasUtf32leBom(utf32EncodedBytes, offset, length)) {
      // One skip() advances a whole 4-byte group, which is the BOM here.
      skip();
    }
  }

  int decode() {
    utf32EncodedBytesIterator.moveNext();
    int codepoint = utf32EncodedBytesIterator.current;
    // Each following byte is 8 bits more significant than the previous one.
    for (int shift = 8; shift <= 24; shift += 8) {
      utf32EncodedBytesIterator.moveNext();
      codepoint += (utf32EncodedBytesIterator.current << shift);
    }
    return codepoint;
  }
}
338 | |
/**
 * Reports whether [codepoint] may be emitted by a UTF-32 decoder: it must be
 * non-negative, must not exceed the largest valid code point, and must lie
 * outside the range reserved for UTF-16 surrogates.
 */
bool _validCodepoint(int codepoint) {
  // NOTE(review): assumes UNICODE_VALID_RANGE_MAX is the last valid code point
  // (U+10FFFF), i.e. an inclusive bound as its name suggests. Confirm against
  // the constants file. The previous strict `<` rejected that code point.
  if (codepoint < 0 || codepoint > UNICODE_VALID_RANGE_MAX) {
    return false;
  }
  // Surrogate values are not scalar values and never appear in UTF-32.
  return codepoint < UNICODE_UTF16_RESERVED_LO ||
      codepoint > UNICODE_UTF16_RESERVED_HI;
}
OLD | NEW |