OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.utf; | 5 part of dart.utf; |
6 | 6 |
7 /** | 7 /** |
8 * Decodes the UTF-32 bytes as an iterable. Thus, the consumer can only convert | 8 * Decodes the UTF-32 bytes as an iterable. Thus, the consumer can only convert |
9 * as much of the input as needed. Determines the byte order from the BOM, | 9 * as much of the input as needed. Determines the byte order from the BOM, |
10 * or uses big-endian as a default. This method always strips a leading BOM. | 10 * or uses big-endian as a default. This method always strips a leading BOM. |
(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
175 utf32EncodedBytes[offset + 2] == 0 && utf32EncodedBytes[offset + 3] == 0; | 175 utf32EncodedBytes[offset + 2] == 0 && utf32EncodedBytes[offset + 3] == 0; |
176 } | 176 } |
177 | 177 |
178 typedef Utf32BytesDecoder Utf32BytesDecoderProvider(); | 178 typedef Utf32BytesDecoder Utf32BytesDecoderProvider(); |
179 | 179 |
180 /** | 180 /** |
181 * Return type of [decodeUtf32AsIterable] and variants. The Iterable type | 181 * Return type of [decodeUtf32AsIterable] and variants. The Iterable type |
182 * provides an iterator on demand and the iterator will only translate bytes | 182 * provides an iterator on demand and the iterator will only translate bytes |
183 * as requested by the user of the iterator. (Note: results are not cached.) | 183 * as requested by the user of the iterator. (Note: results are not cached.) |
184 */ | 184 */ |
185 class IterableUtf32Decoder implements Iterable<int> { | 185 // TODO(floitsch): Consider removing the extend and switch to implements since |
| 186 // that's cheaper to allocate. |
| 187 class IterableUtf32Decoder extends Iterable<int> { |
186 final Utf32BytesDecoderProvider codeunitsProvider; | 188 final Utf32BytesDecoderProvider codeunitsProvider; |
187 | 189 |
188 IterableUtf32Decoder._(this.codeunitsProvider); | 190 IterableUtf32Decoder._(this.codeunitsProvider); |
189 | 191 |
190 Utf32BytesDecoder iterator() => codeunitsProvider(); | 192 Utf32BytesDecoder get iterator => codeunitsProvider(); |
191 } | 193 } |
192 | 194 |
193 /** | 195 /** |
194 * Abstract parent class that converts encoded bytes to codepoints. | 196 * Abstract parent class that converts encoded bytes to codepoints. |
195 */ | 197 */ |
196 class Utf32BytesDecoder implements _ListRangeIterator { | 198 class Utf32BytesDecoder implements _ListRangeIterator { |
197 final _ListRangeIterator utf32EncodedBytesIterator; | 199 final _ListRangeIterator utf32EncodedBytesIterator; |
198 final int replacementCodepoint; | 200 final int replacementCodepoint; |
| 201 int _current = null; |
199 | 202 |
200 Utf32BytesDecoder._fromListRangeIterator( | 203 Utf32BytesDecoder._fromListRangeIterator( |
201 this.utf32EncodedBytesIterator, this.replacementCodepoint); | 204 this.utf32EncodedBytesIterator, this.replacementCodepoint); |
202 | 205 |
203 factory Utf32BytesDecoder(List<int> utf32EncodedBytes, [ | 206 factory Utf32BytesDecoder(List<int> utf32EncodedBytes, [ |
204 int offset = 0, int length, | 207 int offset = 0, int length, |
205 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 208 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
206 if (length == null) { | 209 if (length == null) { |
207 length = utf32EncodedBytes.length - offset; | 210 length = utf32EncodedBytes.length - offset; |
208 } | 211 } |
209 if (hasUtf32beBom(utf32EncodedBytes, offset, length)) { | 212 if (hasUtf32beBom(utf32EncodedBytes, offset, length)) { |
210 return new Utf32beBytesDecoder(utf32EncodedBytes, offset + 4, length - 4, | 213 return new Utf32beBytesDecoder(utf32EncodedBytes, offset + 4, length - 4, |
211 false, replacementCodepoint); | 214 false, replacementCodepoint); |
212 } else if (hasUtf32leBom(utf32EncodedBytes, offset, length)) { | 215 } else if (hasUtf32leBom(utf32EncodedBytes, offset, length)) { |
213 return new Utf32leBytesDecoder(utf32EncodedBytes, offset + 4, length - 4, | 216 return new Utf32leBytesDecoder(utf32EncodedBytes, offset + 4, length - 4, |
214 false, replacementCodepoint); | 217 false, replacementCodepoint); |
215 } else { | 218 } else { |
216 return new Utf32beBytesDecoder(utf32EncodedBytes, offset, length, false, | 219 return new Utf32beBytesDecoder(utf32EncodedBytes, offset, length, false, |
217 replacementCodepoint); | 220 replacementCodepoint); |
218 } | 221 } |
219 } | 222 } |
220 | 223 |
221 List<int> decodeRest() { | 224 List<int> decodeRest() { |
222 List<int> codeunits = new List<int>(remaining); | 225 List<int> codeunits = new List<int>.fixedLength(remaining); |
223 int i = 0; | 226 int i = 0; |
224 while (hasNext) { | 227 while (moveNext()) { |
225 codeunits[i++] = next(); | 228 codeunits[i++] = current; |
226 } | 229 } |
227 return codeunits; | 230 return codeunits; |
228 } | 231 } |
229 | 232 |
230 bool get hasNext => utf32EncodedBytesIterator.hasNext; | 233 int get current => _current; |
231 | 234 |
232 int next() { | 235 bool moveNext() { |
| 236 _current = null; |
233 if (utf32EncodedBytesIterator.remaining < 4) { | 237 if (utf32EncodedBytesIterator.remaining < 4) { |
234 utf32EncodedBytesIterator.skip(utf32EncodedBytesIterator.remaining); | 238 utf32EncodedBytesIterator.skip(utf32EncodedBytesIterator.remaining); |
235 if (replacementCodepoint != null) { | 239 if (replacementCodepoint != null) { |
236 return replacementCodepoint; | 240 _current = replacementCodepoint; |
| 241 return true; |
237 } else { | 242 } else { |
238 throw new ArgumentError( | 243 throw new ArgumentError( |
239 "Invalid UTF32 at ${utf32EncodedBytesIterator.position}"); | 244 "Invalid UTF32 at ${utf32EncodedBytesIterator.position}"); |
240 } | 245 } |
241 } else { | 246 } else { |
242 int codepoint = decode(); | 247 int codepoint = decode(); |
243 if (_validCodepoint(codepoint)) { | 248 if (_validCodepoint(codepoint)) { |
244 return codepoint; | 249 _current = codepoint; |
| 250 return true; |
245 } else if (replacementCodepoint != null) { | 251 } else if (replacementCodepoint != null) { |
246 return replacementCodepoint; | 252 _current = replacementCodepoint; |
| 253 return true; |
247 } else { | 254 } else { |
248 throw new ArgumentError( | 255 throw new ArgumentError( |
249 "Invalid UTF32 at ${utf32EncodedBytesIterator.position}"); | 256 "Invalid UTF32 at ${utf32EncodedBytesIterator.position}"); |
250 } | 257 } |
251 } | 258 } |
252 } | 259 } |
253 | 260 |
254 int get position => utf32EncodedBytesIterator.position ~/ 4; | 261 int get position => utf32EncodedBytesIterator.position ~/ 4; |
255 | 262 |
256 void backup([int by = 1]) { | 263 void backup([int by = 1]) { |
(...skipping 10 matching lines...) Expand all Loading... |
267 } | 274 } |
268 | 275 |
269 /** | 276 /** |
270 * Convert UTF-32BE encoded bytes to codepoints by grouping 4 bytes | 277 * Convert UTF-32BE encoded bytes to codepoints by grouping 4 bytes |
271 * to produce the unicode codepoint. | 278 * to produce the unicode codepoint. |
272 */ | 279 */ |
273 class Utf32beBytesDecoder extends Utf32BytesDecoder { | 280 class Utf32beBytesDecoder extends Utf32BytesDecoder { |
274 Utf32beBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0, | 281 Utf32beBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0, |
275 int length, bool stripBom = true, | 282 int length, bool stripBom = true, |
276 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 283 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
277 super._fromListRangeIterator((new _ListRange(utf32EncodedBytes, offset, | 284 super._fromListRangeIterator( |
278 length)).iterator(), replacementCodepoint) { | 285 (new _ListRange(utf32EncodedBytes, offset, length)).iterator, |
| 286 replacementCodepoint) { |
279 if (stripBom && hasUtf32beBom(utf32EncodedBytes, offset, length)) { | 287 if (stripBom && hasUtf32beBom(utf32EncodedBytes, offset, length)) { |
280 skip(); | 288 skip(); |
281 } | 289 } |
282 } | 290 } |
283 | 291 |
284 int decode() { | 292 int decode() { |
285 int value = utf32EncodedBytesIterator.next(); | 293 utf32EncodedBytesIterator.moveNext(); |
286 value = (value << 8) + utf32EncodedBytesIterator.next(); | 294 int value = utf32EncodedBytesIterator.current; |
287 value = (value << 8) + utf32EncodedBytesIterator.next(); | 295 utf32EncodedBytesIterator.moveNext(); |
288 value = (value << 8) + utf32EncodedBytesIterator.next(); | 296 value = (value << 8) + utf32EncodedBytesIterator.current; |
| 297 utf32EncodedBytesIterator.moveNext(); |
| 298 value = (value << 8) + utf32EncodedBytesIterator.current; |
| 299 utf32EncodedBytesIterator.moveNext(); |
| 300 value = (value << 8) + utf32EncodedBytesIterator.current; |
289 return value; | 301 return value; |
290 } | 302 } |
291 } | 303 } |
292 | 304 |
293 /** | 305 /** |
294 * Convert UTF-32LE encoded bytes to codepoints by grouping 4 bytes | 306 * Convert UTF-32LE encoded bytes to codepoints by grouping 4 bytes |
295 * to produce the unicode codepoint. | 307 * to produce the unicode codepoint. |
296 */ | 308 */ |
297 class Utf32leBytesDecoder extends Utf32BytesDecoder { | 309 class Utf32leBytesDecoder extends Utf32BytesDecoder { |
298 Utf32leBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0, | 310 Utf32leBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0, |
299 int length, bool stripBom = true, | 311 int length, bool stripBom = true, |
300 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 312 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
301 super._fromListRangeIterator((new _ListRange(utf32EncodedBytes, offset, | 313 super._fromListRangeIterator( |
302 length)).iterator(), replacementCodepoint) { | 314 (new _ListRange(utf32EncodedBytes, offset, length)).iterator, |
| 315 replacementCodepoint) { |
303 if (stripBom && hasUtf32leBom(utf32EncodedBytes, offset, length)) { | 316 if (stripBom && hasUtf32leBom(utf32EncodedBytes, offset, length)) { |
304 skip(); | 317 skip(); |
305 } | 318 } |
306 } | 319 } |
307 | 320 |
308 int decode() { | 321 int decode() { |
309 int value = (utf32EncodedBytesIterator.next()); | 322 utf32EncodedBytesIterator.moveNext(); |
310 value += (utf32EncodedBytesIterator.next() << 8); | 323 int value = utf32EncodedBytesIterator.current; |
311 value += (utf32EncodedBytesIterator.next() << 16); | 324 utf32EncodedBytesIterator.moveNext(); |
312 value += (utf32EncodedBytesIterator.next() << 24); | 325 value += (utf32EncodedBytesIterator.current << 8); |
| 326 utf32EncodedBytesIterator.moveNext(); |
| 327 value += (utf32EncodedBytesIterator.current << 16); |
| 328 utf32EncodedBytesIterator.moveNext(); |
| 329 value += (utf32EncodedBytesIterator.current << 24); |
313 return value; | 330 return value; |
314 } | 331 } |
315 } | 332 } |
316 | 333 |
317 bool _validCodepoint(int codepoint) { | 334 bool _validCodepoint(int codepoint) { |
318 return (codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO) || | 335 return (codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO) || |
319 (codepoint > UNICODE_UTF16_RESERVED_HI && | 336 (codepoint > UNICODE_UTF16_RESERVED_HI && |
320 codepoint < UNICODE_VALID_RANGE_MAX); | 337 codepoint < UNICODE_VALID_RANGE_MAX); |
321 } | 338 } |
OLD | NEW |