OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 part of utf; | |
6 | |
/**
 * Lazily decodes UTF-32 [bytes] into an iterable of codepoints, so a consumer
 * only converts as much of the input as it actually iterates over. A fresh
 * decoder is created every time an iterator is requested. The byte order is
 * taken from a leading BOM, which is then stripped; when no BOM is present
 * the input is read as big-endian.
 * Set [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace a bad value.
 */
IterableUtf32Decoder decodeUtf32AsIterable(List<int> bytes, [
    int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder makeDecoder() =>
      new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint);
  return new IterableUtf32Decoder._(makeDecoder);
}
20 | |
/**
 * Lazily decodes UTF-32BE [bytes] into an iterable of codepoints, so a
 * consumer only converts as much of the input as it actually iterates over.
 * A leading big-endian BOM is stripped by default; set the optional
 * parameter [stripBom] to false to keep it.
 * Set [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace a bad value.
 */
IterableUtf32Decoder decodeUtf32beAsIterable(List<int> bytes, [
    int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder makeDecoder() => new Utf32beBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  return new IterableUtf32Decoder._(makeDecoder);
}
35 | |
/**
 * Lazily decodes UTF-32LE [bytes] into an iterable of codepoints, so a
 * consumer only converts as much of the input as it actually iterates over.
 * A leading little-endian BOM is stripped by default; set the optional
 * parameter [stripBom] to false to keep it.
 * Set [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace a bad value.
 */
IterableUtf32Decoder decodeUtf32leAsIterable(List<int> bytes, [
    int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder makeDecoder() => new Utf32leBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  return new IterableUtf32Decoder._(makeDecoder);
}
50 | |
/**
 * Produces a String from a sequence of UTF-32 encoded bytes. The optional
 * parameters allow starting at an [offset] into the list of bytes, limiting
 * the [length] of the input that is decoded, and overriding the default
 * Unicode replacement character. The byte order is taken from a leading BOM
 * when present, otherwise big-endian is assumed.
 * Set [replacementCodepoint] to null to throw an ArgumentError rather than
 * replace a bad value.
 */
String decodeUtf32(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder decoder =
      new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
/**
 * Produces a String from a sequence of UTF-32BE encoded bytes. The optional
 * parameters allow starting at an [offset] into the list of bytes, limiting
 * the [length] of the input that is decoded, keeping a leading BOM (set
 * [stripBom] to false), and overriding the default Unicode replacement
 * character. Set [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace a bad value.
 */
String decodeUtf32be(
    List<int> bytes, [int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder decoder = new Utf32beBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  return new String.fromCharCodes(decoder.decodeRest());
}
75 | |
/**
 * Produces a String from a sequence of UTF-32LE encoded bytes. The optional
 * parameters allow starting at an [offset] into the list of bytes, limiting
 * the [length] of the input that is decoded, keeping a leading BOM (set
 * [stripBom] to false), and overriding the default Unicode replacement
 * character. Set [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace a bad value.
 */
String decodeUtf32le(
    List<int> bytes, [int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder decoder = new Utf32leBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  return new String.fromCharCodes(decoder.decodeRest());
}
88 | |
/**
 * Produces a list of UTF-32 encoded bytes for [str]. The result is the
 * big-endian encoding, prefixed with a big-endian byte-order marker.
 */
List<int> encodeUtf32(String str) {
  return encodeUtf32be(str, true);
}
95 | |
/**
 * Produces a list of UTF-32BE encoded bytes for [str]. By default no BOM is
 * written; pass true as [writeBOM] to prefix the result with the four
 * big-endian BOM bytes.
 */
List<int> encodeUtf32be(String str, [bool writeBOM = false]) {
  List<int> codepoints = stringToCodepoints(str);
  int bomLength = writeBOM ? 4 : 0;
  List<int> bytes = new List<int>(4 * codepoints.length + bomLength);
  int pos = 0;
  if (writeBOM) {
    bytes[pos] = 0;
    bytes[pos + 1] = 0;
    bytes[pos + 2] = UNICODE_UTF_BOM_HI;
    bytes[pos + 3] = UNICODE_UTF_BOM_LO;
    pos += 4;
  }
  for (int codepoint in codepoints) {
    // Emit the four bytes of the codepoint, most significant byte first.
    for (int shift = 24; shift >= 0; shift -= 8) {
      bytes[pos++] = (codepoint >> shift) & UNICODE_BYTE_ZERO_MASK;
    }
  }
  return bytes;
}
119 | |
/**
 * Produces a list of UTF-32LE encoded bytes for [str]. By default no BOM is
 * written; pass true as [writeBOM] to prefix the result with the four
 * little-endian BOM bytes.
 */
List<int> encodeUtf32le(String str, [bool writeBOM = false]) {
  List<int> codepoints = stringToCodepoints(str);
  int bomLength = writeBOM ? 4 : 0;
  List<int> bytes = new List<int>(4 * codepoints.length + bomLength);
  int pos = 0;
  if (writeBOM) {
    bytes[pos] = UNICODE_UTF_BOM_LO;
    bytes[pos + 1] = UNICODE_UTF_BOM_HI;
    bytes[pos + 2] = 0;
    bytes[pos + 3] = 0;
    pos += 4;
  }
  for (int codepoint in codepoints) {
    // Emit the four bytes of the codepoint, least significant byte first.
    for (int shift = 0; shift <= 24; shift += 8) {
      bytes[pos++] = (codepoint >> shift) & UNICODE_BYTE_ZERO_MASK;
    }
  }
  return bytes;
}
143 | |
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * UTF-32 byte-order marker (BOM) of either endianness. The big-endian form
 * is checked first.
 */
bool hasUtf32Bom(
    List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  if (hasUtf32beBom(utf32EncodedBytes, offset, length)) {
    return true;
  }
  return hasUtf32leBom(utf32EncodedBytes, offset, length);
}
153 | |
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * big-endian byte-order marker (BOM), i.e. the four bytes
 * 0, 0, UNICODE_UTF_BOM_HI, UNICODE_UTF_BOM_LO. When [length] is given, the
 * range considered ends at `offset + length`; otherwise it ends at the end
 * of the list. Returns false when fewer than four bytes are in range.
 */
bool hasUtf32beBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  int end;
  if (length == null) {
    end = utf32EncodedBytes.length;
  } else {
    end = offset + length;
  }
  if (offset + 4 > end) {
    return false;
  }
  List<int> expected = [0, 0, UNICODE_UTF_BOM_HI, UNICODE_UTF_BOM_LO];
  for (int i = 0; i < 4; i++) {
    if (utf32EncodedBytes[offset + i] != expected[i]) {
      return false;
    }
  }
  return true;
}
165 | |
/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * little-endian byte-order marker (BOM), i.e. the four bytes
 * UNICODE_UTF_BOM_LO, UNICODE_UTF_BOM_HI, 0, 0. When [length] is given, the
 * range considered ends at `offset + length`; otherwise it ends at the end
 * of the list. Returns false when fewer than four bytes are in range.
 */
bool hasUtf32leBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  int end;
  if (length == null) {
    end = utf32EncodedBytes.length;
  } else {
    end = offset + length;
  }
  if (offset + 4 > end) {
    return false;
  }
  List<int> expected = [UNICODE_UTF_BOM_LO, UNICODE_UTF_BOM_HI, 0, 0];
  for (int i = 0; i < 4; i++) {
    if (utf32EncodedBytes[offset + i] != expected[i]) {
      return false;
    }
  }
  return true;
}
177 | |
/**
 * Signature of a zero-argument callback that returns a [Utf32BytesDecoder].
 * [IterableUtf32Decoder] calls it each time an iterator is requested.
 */
typedef Utf32BytesDecoder Utf32BytesDecoderProvider();

/**
 * Return type of [decodeUtf32AsIterable] and variants. The Iterable type
 * hands out an iterator on demand, and that iterator only translates bytes
 * as the user of the iterator asks for them. (Note: results are not cached;
 * every access to [iterator] invokes the provider again.)
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf32Decoder extends IterableBase<int> {
  /** Callback invoked to obtain the decoder returned by [iterator]. */
  final Utf32BytesDecoderProvider codeunitsProvider;

  IterableUtf32Decoder._(this.codeunitsProvider);

  Utf32BytesDecoder get iterator {
    return codeunitsProvider();
  }
}
194 | |
/**
 * Abstract parent class that converts encoded bytes to codepoints. Concrete
 * subclasses supply [decode], which reads one 4-byte unit in their byte order.
 */
abstract class Utf32BytesDecoder implements _ListRangeIterator {
  /** Byte-level iterator over the encoded input. */
  final _ListRangeIterator utf32EncodedBytesIterator;

  /** Codepoint substituted for bad input; null means throw instead. */
  final int replacementCodepoint;

  int _current = null;

  Utf32BytesDecoder._fromListRangeIterator(
      this.utf32EncodedBytesIterator, this.replacementCodepoint);

  /**
   * Creates a decoder whose byte order is chosen from a leading BOM. A
   * big-endian or little-endian BOM selects the matching decoder and is
   * excluded from the decoded range; without a BOM the input is treated as
   * big-endian.
   */
  factory Utf32BytesDecoder(List<int> utf32EncodedBytes, [
      int offset = 0, int length,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
    if (length == null) {
      length = utf32EncodedBytes.length - offset;
    }
    if (hasUtf32beBom(utf32EncodedBytes, offset, length)) {
      // Step over the 4 BOM bytes ourselves, so no further stripping.
      return new Utf32beBytesDecoder(
          utf32EncodedBytes, offset + 4, length - 4, false,
          replacementCodepoint);
    }
    if (hasUtf32leBom(utf32EncodedBytes, offset, length)) {
      return new Utf32leBytesDecoder(
          utf32EncodedBytes, offset + 4, length - 4, false,
          replacementCodepoint);
    }
    return new Utf32beBytesDecoder(
        utf32EncodedBytes, offset, length, false, replacementCodepoint);
  }

  /**
   * Decodes everything that is left and returns it as a list of codepoints.
   */
  List<int> decodeRest() {
    List<int> codepoints = new List<int>(remaining);
    for (int index = 0; moveNext(); index++) {
      codepoints[index] = current;
    }
    return codepoints;
  }

  int get current {
    return _current;
  }

  /**
   * Advances to the next codepoint. Returns false once no bytes are left.
   * A trailing group of fewer than 4 bytes, or a decoded value rejected by
   * [_validCodepoint], yields [replacementCodepoint], or throws an
   * ArgumentError when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    int bytesLeft = utf32EncodedBytesIterator.remaining;
    if (bytesLeft == 0) {
      return false;
    }
    if (bytesLeft >= 4) {
      int codepoint = decode();
      if (_validCodepoint(codepoint)) {
        _current = codepoint;
        return true;
      }
    } else {
      // Truncated trailing unit: consume what is left of the input.
      utf32EncodedBytesIterator.skip(utf32EncodedBytesIterator.remaining);
    }
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF32 at ${utf32EncodedBytesIterator.position}");
    }
    _current = replacementCodepoint;
    return true;
  }

  /** Position of the byte iterator, expressed in 4-byte units. */
  int get position {
    return utf32EncodedBytesIterator.position ~/ 4;
  }

  /** Moves back by [by] 4-byte units. */
  void backup([int by = 1]) {
    utf32EncodedBytesIterator.backup(4 * by);
  }

  /**
   * Number of 4-byte units left, rounded up so that a trailing partial unit
   * is counted as one.
   */
  int get remaining {
    return (utf32EncodedBytesIterator.remaining + 3) ~/ 4;
  }

  /** Moves forward by [count] 4-byte units. */
  void skip([int count = 1]) {
    utf32EncodedBytesIterator.skip(4 * count);
  }

  /** Reads the next 4 bytes and combines them into a single value. */
  int decode();
}
279 | |
/**
 * Convert UTF-32BE encoded bytes to codepoints by grouping 4 bytes,
 * most significant byte first, to produce the unicode codepoint.
 */
class Utf32beBytesDecoder extends Utf32BytesDecoder {
  /**
   * Creates a big-endian decoder over the given range of [utf32EncodedBytes].
   * When [stripBom] is true and the range starts with a big-endian BOM, the
   * BOM is skipped.
   */
  Utf32beBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0,
      int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new _ListRange(utf32EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    if (stripBom) {
      if (hasUtf32beBom(utf32EncodedBytes, offset, length)) {
        skip();
      }
    }
  }

  int decode() {
    utf32EncodedBytesIterator.moveNext();
    int codepoint = utf32EncodedBytesIterator.current;
    // Each following byte becomes the new lowest 8 bits of the value.
    for (int i = 0; i < 3; i++) {
      utf32EncodedBytesIterator.moveNext();
      codepoint = (codepoint << 8) + utf32EncodedBytesIterator.current;
    }
    return codepoint;
  }
}
308 | |
/**
 * Convert UTF-32LE encoded bytes to codepoints by grouping 4 bytes,
 * least significant byte first, to produce the unicode codepoint.
 */
class Utf32leBytesDecoder extends Utf32BytesDecoder {
  /**
   * Creates a little-endian decoder over the given range of
   * [utf32EncodedBytes]. When [stripBom] is true and the range starts with a
   * little-endian BOM, the BOM is skipped.
   */
  Utf32leBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0,
      int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new _ListRange(utf32EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    if (stripBom) {
      if (hasUtf32leBom(utf32EncodedBytes, offset, length)) {
        skip();
      }
    }
  }

  int decode() {
    utf32EncodedBytesIterator.moveNext();
    int codepoint = utf32EncodedBytesIterator.current;
    // Each following byte lands 8 bits higher than the previous one.
    for (int shift = 8; shift <= 24; shift += 8) {
      utf32EncodedBytesIterator.moveNext();
      codepoint += (utf32EncodedBytesIterator.current << shift);
    }
    return codepoint;
  }
}
337 | |
/**
 * Reports whether [codepoint] is acceptable as a decoded UTF-32 value: it
 * must be non-negative, must lie outside the reserved range
 * UNICODE_UTF16_RESERVED_LO..UNICODE_UTF16_RESERVED_HI (both ends excluded
 * from the valid set), and must not exceed UNICODE_VALID_RANGE_MAX.
 */
bool _validCodepoint(int codepoint) {
  // NOTE(review): the upper bound is inclusive on the assumption that
  // UNICODE_VALID_RANGE_MAX is the last valid codepoint (U+10FFFF), as its
  // name suggests. The previous strict `<` rejected that value itself. The
  // constant is defined outside this file; confirm its value.
  if (codepoint < 0 || codepoint > UNICODE_VALID_RANGE_MAX) {
    return false;
  }
  return codepoint < UNICODE_UTF16_RESERVED_LO ||
      codepoint > UNICODE_UTF16_RESERVED_HI;
}
OLD | NEW |