OLD | NEW |
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.convert; | 5 part of dart.convert; |
6 | 6 |
7 /** The Unicode Replacement character `U+FFFD` (�). */ | 7 /** The Unicode Replacement character `U+FFFD` (�). */ |
8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; | 8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; |
9 | 9 |
10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ | 10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ |
(...skipping 24 matching lines...) Expand all Loading... |
35 * Instantiates a new [Utf8Codec]. | 35 * Instantiates a new [Utf8Codec]. |
36 * | 36 * |
37 * The optional [allowMalformed] argument defines how [decoder] (and [decode]) | 37 * The optional [allowMalformed] argument defines how [decoder] (and [decode]) |
38 * deal with invalid or unterminated character sequences. | 38 * deal with invalid or unterminated character sequences. |
39 * | 39 * |
40 * If it is `true` (and not overridden at the method invocation) [decode] and | 40 * If it is `true` (and not overridden at the method invocation) [decode] and |
41 * the [decoder] replace invalid (or unterminated) octet | 41 * the [decoder] replace invalid (or unterminated) octet |
42 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise | 42 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
43 * they throw a [FormatException]. | 43 * they throw a [FormatException]. |
44 */ | 44 */ |
45 const Utf8Codec({ bool allowMalformed: false }) | 45 const Utf8Codec({bool allowMalformed: false}) |
46 : _allowMalformed = allowMalformed; | 46 : _allowMalformed = allowMalformed; |
47 | 47 |
48 String get name => "utf-8"; | 48 String get name => "utf-8"; |
49 | 49 |
50 /** | 50 /** |
51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
52 * corresponding string. | 52 * corresponding string. |
53 * | 53 * |
54 * If the [codeUnits] start with the encoding of a | 54 * If the [codeUnits] start with the encoding of a |
55 * [UNICODE_BOM_CHARACTER_RUNE], that character is discarded. | 55 * [UNICODE_BOM_CHARACTER_RUNE], that character is discarded. |
56 * | 56 * |
57 * If [allowMalformed] is `true` the decoder replaces invalid (or | 57 * If [allowMalformed] is `true` the decoder replaces invalid (or |
58 * unterminated) character sequences with the Unicode Replacement character | 58 * unterminated) character sequences with the Unicode Replacement character |
59 * `U+FFFD` (�). Otherwise it throws a [FormatException]. | 59 * `U+FFFD` (�). Otherwise it throws a [FormatException]. |
60 * | 60 * |
61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that | 61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that |
62 * was used to instantiate `this`. | 62 * was used to instantiate `this`. |
63 */ | 63 */ |
64 String decode(List<int> codeUnits, { bool allowMalformed }) { | 64 String decode(List<int> codeUnits, {bool allowMalformed}) { |
65 if (allowMalformed == null) allowMalformed = _allowMalformed; | 65 if (allowMalformed == null) allowMalformed = _allowMalformed; |
66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); | 66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); |
67 } | 67 } |
68 | 68 |
69 Utf8Encoder get encoder => const Utf8Encoder(); | 69 Utf8Encoder get encoder => const Utf8Encoder(); |
70 Utf8Decoder get decoder { | 70 Utf8Decoder get decoder { |
71 return new Utf8Decoder(allowMalformed: _allowMalformed); | 71 return new Utf8Decoder(allowMalformed: _allowMalformed); |
72 } | 72 } |
73 } | 73 } |
74 | 74 |
75 /** | 75 /** |
76 * This class converts strings to their UTF-8 code units (a list of | 76 * This class converts strings to their UTF-8 code units (a list of |
77 * unsigned 8-bit integers). | 77 * unsigned 8-bit integers). |
78 */ | 78 */ |
79 class Utf8Encoder extends Converter<String, List<int>> | 79 class Utf8Encoder extends Converter<String, List<int>> |
80 implements ChunkedConverter<String, List<int>, String, List<int>> { | 80 implements ChunkedConverter<String, List<int>, String, List<int>> { |
81 | |
82 const Utf8Encoder(); | 81 const Utf8Encoder(); |
83 | 82 |
84 /** | 83 /** |
85 * Converts [string] to its UTF-8 code units (a list of | 84 * Converts [string] to its UTF-8 code units (a list of |
86 * unsigned 8-bit integers). | 85 * unsigned 8-bit integers). |
87 * | 86 * |
88 * If [start] and [end] are provided, only the substring | 87 * If [start] and [end] are provided, only the substring |
89 * `string.substring(start, end)` is converted. | 88 * `string.substring(start, end)` is converted. |
90 */ | 89 */ |
91 List<int> convert(String string, [int start = 0, int end]) { | 90 List<int> convert(String string, [int start = 0, int end]) { |
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
233 } | 232 } |
234 return stringIndex; | 233 return stringIndex; |
235 } | 234 } |
236 } | 235 } |
237 | 236 |
238 /** | 237 /** |
239 * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit | 238 * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit |
240 * integers). | 239 * integers). |
241 */ | 240 */ |
242 class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin { | 241 class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin { |
243 | |
244 final ByteConversionSink _sink; | 242 final ByteConversionSink _sink; |
245 | 243 |
246 _Utf8EncoderSink(this._sink); | 244 _Utf8EncoderSink(this._sink); |
247 | 245 |
248 void close() { | 246 void close() { |
249 if (_carry != 0) { | 247 if (_carry != 0) { |
250 // addSlice will call close again, but then the carry must be equal to 0. | 248 // addSlice will call close again, but then the carry must be equal to 0. |
251 addSlice("", 0, 0, true); | 249 addSlice("", 0, 0, true); |
252 return; | 250 return; |
253 } | 251 } |
254 _sink.close(); | 252 _sink.close(); |
255 } | 253 } |
256 | 254 |
257 void addSlice(String str, int start, int end, bool isLast) { | 255 void addSlice(String str, int start, int end, bool isLast) { |
258 _bufferIndex = 0; | 256 _bufferIndex = 0; |
259 | 257 |
260 if (start == end && !isLast) { | 258 if (start == end && !isLast) { |
261 return; | 259 return; |
262 } | 260 } |
263 | 261 |
264 if (_carry != 0) { | 262 if (_carry != 0) { |
265 int nextCodeUnit = 0; | 263 int nextCodeUnit = 0; |
266 if (start != end) { | 264 if (start != end) { |
267 nextCodeUnit = str.codeUnitAt(start); | 265 nextCodeUnit = str.codeUnitAt(start); |
268 } else { | 266 } else { |
269 assert(isLast); | 267 assert(isLast); |
270 } | 268 } |
271 bool wasCombined = _writeSurrogate(_carry, nextCodeUnit); | 269 bool wasCombined = _writeSurrogate(_carry, nextCodeUnit); |
272 // Either we got a non-empty string, or we must not have been combined. | 270 // Either we got a non-empty string, or we must not have been combined. |
273 assert(!wasCombined || start != end ); | 271 assert(!wasCombined || start != end); |
274 if (wasCombined) start++; | 272 if (wasCombined) start++; |
275 _carry = 0; | 273 _carry = 0; |
276 } | 274 } |
277 do { | 275 do { |
278 start = _fillBuffer(str, start, end); | 276 start = _fillBuffer(str, start, end); |
279 bool isLastSlice = isLast && (start == end); | 277 bool isLastSlice = isLast && (start == end); |
280 if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) { | 278 if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) { |
281 if (isLast && _bufferIndex < _buffer.length - 3) { | 279 if (isLast && _bufferIndex < _buffer.length - 3) { |
282 // There is still space for the last incomplete surrogate. | 280 // There is still space for the last incomplete surrogate. |
283 // We use a non-surrogate as second argument. This way the | 281 // We use a non-surrogate as second argument. This way the |
(...skipping 16 matching lines...) Expand all Loading... |
300 // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it | 298 // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it |
301 // needs to deal with malformed input. | 299 // needs to deal with malformed input. |
302 } | 300 } |
303 | 301 |
304 /** | 302 /** |
305 * This class converts UTF-8 code units (lists of unsigned 8-bit integers) | 303 * This class converts UTF-8 code units (lists of unsigned 8-bit integers) |
306 * to a string. | 304 * to a string. |
307 */ | 305 */ |
308 class Utf8Decoder extends Converter<List<int>, String> | 306 class Utf8Decoder extends Converter<List<int>, String> |
309 implements ChunkedConverter<List<int>, String, List<int>, String> { | 307 implements ChunkedConverter<List<int>, String, List<int>, String> { |
310 | |
311 final bool _allowMalformed; | 308 final bool _allowMalformed; |
312 | 309 |
313 /** | 310 /** |
314 * Instantiates a new [Utf8Decoder]. | 311 * Instantiates a new [Utf8Decoder]. |
315 * | 312 * |
316 * The optional [allowMalformed] argument defines how [convert] deals | 313 * The optional [allowMalformed] argument defines how [convert] deals |
317 * with invalid or unterminated character sequences. | 314 * with invalid or unterminated character sequences. |
318 * | 315 * |
319 * If it is `true` [convert] replaces invalid (or unterminated) character | 316 * If it is `true` [convert] replaces invalid (or unterminated) character |
320 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise | 317 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
321 * it throws a [FormatException]. | 318 * it throws a [FormatException]. |
322 */ | 319 */ |
323 const Utf8Decoder({ bool allowMalformed: false }) | 320 const Utf8Decoder({bool allowMalformed: false}) |
324 : this._allowMalformed = allowMalformed; | 321 : this._allowMalformed = allowMalformed; |
325 | 322 |
326 /** | 323 /** |
327 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 324 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
328 * corresponding string. | 325 * corresponding string. |
329 * | 326 * |
330 * Uses the code units from [start] to, but not including, [end]. | 327 * Uses the code units from [start] to, but not including, [end]. |
331 * If [end] is omitted, it defaults to `codeUnits.length`. | 328 * If [end] is omitted, it defaults to `codeUnits.length`. |
332 * | 329 * |
333 * If the [codeUnits] start with the encoding of a | 330 * If the [codeUnits] start with the encoding of a |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
370 // Override the base class's bind, to provide a better type. | 367 // Override the base class's bind, to provide a better type. |
371 Stream<String> bind(Stream<List<int>> stream) => super.bind(stream); | 368 Stream<String> bind(Stream<List<int>> stream) => super.bind(stream); |
372 | 369 |
373 external Converter<List<int>, T> fuse<T>(Converter<String, T> next); | 370 external Converter<List<int>, T> fuse<T>(Converter<String, T> next); |
374 | 371 |
375 external static String _convertIntercepted( | 372 external static String _convertIntercepted( |
376 bool allowMalformed, List<int> codeUnits, int start, int end); | 373 bool allowMalformed, List<int> codeUnits, int start, int end); |
377 } | 374 } |
378 | 375 |
379 // UTF-8 constants. | 376 // UTF-8 constants. |
380 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits | 377 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits |
381 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits | 378 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits |
382 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits | 379 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
383 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. | 380 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
384 | 381 |
385 // UTF-16 constants. | 382 // UTF-16 constants. |
386 const int _SURROGATE_MASK = 0xF800; | 383 const int _SURROGATE_MASK = 0xF800; |
387 const int _SURROGATE_TAG_MASK = 0xFC00; | 384 const int _SURROGATE_TAG_MASK = 0xFC00; |
388 const int _SURROGATE_VALUE_MASK = 0x3FF; | 385 const int _SURROGATE_VALUE_MASK = 0x3FF; |
389 const int _LEAD_SURROGATE_MIN = 0xD800; | 386 const int _LEAD_SURROGATE_MIN = 0xD800; |
390 const int _TAIL_SURROGATE_MIN = 0xDC00; | 387 const int _TAIL_SURROGATE_MIN = 0xDC00; |
391 | 388 |
392 bool _isLeadSurrogate(int codeUnit) => | 389 bool _isLeadSurrogate(int codeUnit) => |
393 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; | 390 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; |
394 bool _isTailSurrogate(int codeUnit) => | 391 bool _isTailSurrogate(int codeUnit) => |
395 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; | 392 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; |
396 int _combineSurrogatePair(int lead, int tail) => | 393 int _combineSurrogatePair(int lead, int tail) => |
397 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) | 394 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) | |
398 | (tail & _SURROGATE_VALUE_MASK); | 395 (tail & _SURROGATE_VALUE_MASK); |
399 | 396 |
400 /** | 397 /** |
401 * Decodes UTF-8. | 398 * Decodes UTF-8. |
402 * | 399 * |
403 * The decoder handles chunked input. | 400 * The decoder handles chunked input. |
404 */ | 401 */ |
405 // TODO(floitsch): make this class public. | 402 // TODO(floitsch): make this class public. |
406 class _Utf8Decoder { | 403 class _Utf8Decoder { |
407 final bool _allowMalformed; | 404 final bool _allowMalformed; |
408 final StringSink _stringSink; | 405 final StringSink _stringSink; |
409 bool _isFirstCharacter = true; | 406 bool _isFirstCharacter = true; |
410 int _value = 0; | 407 int _value = 0; |
411 int _expectedUnits = 0; | 408 int _expectedUnits = 0; |
412 int _extraUnits = 0; | 409 int _extraUnits = 0; |
413 | 410 |
414 _Utf8Decoder(this._stringSink, this._allowMalformed); | 411 _Utf8Decoder(this._stringSink, this._allowMalformed); |
415 | 412 |
416 bool get hasPartialInput => _expectedUnits > 0; | 413 bool get hasPartialInput => _expectedUnits > 0; |
417 | 414 |
418 // Limits of one through four byte encodings. | 415 // Limits of one through four byte encodings. |
419 static const List<int> _LIMITS = const <int>[ | 416 static const List<int> _LIMITS = const <int>[ |
420 _ONE_BYTE_LIMIT, | 417 _ONE_BYTE_LIMIT, |
421 _TWO_BYTE_LIMIT, | 418 _TWO_BYTE_LIMIT, |
422 _THREE_BYTE_LIMIT, | 419 _THREE_BYTE_LIMIT, |
423 _FOUR_BYTE_LIMIT ]; | 420 _FOUR_BYTE_LIMIT |
| 421 ]; |
424 | 422 |
425 void close() { | 423 void close() { |
426 flush(); | 424 flush(); |
427 } | 425 } |
428 | 426 |
429 /** | 427 /** |
430 * Flushes this decoder as if closed. | 428 * Flushes this decoder as if closed. |
431 * | 429 * |
432 * This method throws if the input was partial and the decoder was | 430 * This method throws if the input was partial and the decoder was |
433 * constructed with `allowMalformed` set to `false`. | 431 * constructed with `allowMalformed` set to `false`. |
434 * | 432 * |
435 * The [source] and [offset] of the current position may be provided, | 433 * The [source] and [offset] of the current position may be provided, |
436 * and are included in the exception if one is thrown. | 434 * and are included in the exception if one is thrown. |
437 */ | 435 */ |
438 void flush([List<int> source, int offset]) { | 436 void flush([List<int> source, int offset]) { |
439 if (hasPartialInput) { | 437 if (hasPartialInput) { |
440 if (!_allowMalformed) { | 438 if (!_allowMalformed) { |
441 throw new FormatException("Unfinished UTF-8 octet sequence", | 439 throw new FormatException( |
442 source, offset); | 440 "Unfinished UTF-8 octet sequence", source, offset); |
443 } | 441 } |
444 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | 442 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
445 _value = 0; | 443 _value = 0; |
446 _expectedUnits = 0; | 444 _expectedUnits = 0; |
447 _extraUnits = 0; | 445 _extraUnits = 0; |
448 } | 446 } |
449 } | 447 } |
450 | 448 |
451 void convert(List<int> codeUnits, int startIndex, int endIndex) { | 449 void convert(List<int> codeUnits, int startIndex, int endIndex) { |
452 int value = _value; | 450 int value = _value; |
(...skipping 13 matching lines...) Expand all Loading... |
466 return to - from; | 464 return to - from; |
467 } | 465 } |
468 | 466 |
469 void addSingleBytes(int from, int to) { | 467 void addSingleBytes(int from, int to) { |
470 assert(from >= startIndex && from <= endIndex); | 468 assert(from >= startIndex && from <= endIndex); |
471 assert(to >= startIndex && to <= endIndex); | 469 assert(to >= startIndex && to <= endIndex); |
472 _stringSink.write(new String.fromCharCodes(codeUnits, from, to)); | 470 _stringSink.write(new String.fromCharCodes(codeUnits, from, to)); |
473 } | 471 } |
474 | 472 |
475 int i = startIndex; | 473 int i = startIndex; |
476 loop: while (true) { | 474 loop: |
477 multibyte: if (expectedUnits > 0) { | 475 while (true) { |
| 476 multibyte: |
| 477 if (expectedUnits > 0) { |
478 do { | 478 do { |
479 if (i == endIndex) { | 479 if (i == endIndex) { |
480 break loop; | 480 break loop; |
481 } | 481 } |
482 int unit = codeUnits[i]; | 482 int unit = codeUnits[i]; |
483 if ((unit & 0xC0) != 0x80) { | 483 if ((unit & 0xC0) != 0x80) { |
484 expectedUnits = 0; | 484 expectedUnits = 0; |
485 if (!_allowMalformed) { | 485 if (!_allowMalformed) { |
486 throw new FormatException( | 486 throw new FormatException( |
487 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", | 487 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", |
488 codeUnits, i); | 488 codeUnits, |
| 489 i); |
489 } | 490 } |
490 _isFirstCharacter = false; | 491 _isFirstCharacter = false; |
491 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | 492 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
492 break multibyte; | 493 break multibyte; |
493 } else { | 494 } else { |
494 value = (value << 6) | (unit & 0x3f); | 495 value = (value << 6) | (unit & 0x3f); |
495 expectedUnits--; | 496 expectedUnits--; |
496 i++; | 497 i++; |
497 } | 498 } |
498 } while (expectedUnits > 0); | 499 } while (expectedUnits > 0); |
499 if (value <= _LIMITS[extraUnits - 1]) { | 500 if (value <= _LIMITS[extraUnits - 1]) { |
500 // Overly long encoding. The value could be encoded with a shorter | 501 // Overly long encoding. The value could be encoded with a shorter |
501 // encoding. | 502 // encoding. |
502 if (!_allowMalformed) { | 503 if (!_allowMalformed) { |
503 throw new FormatException( | 504 throw new FormatException( |
504 "Overlong encoding of 0x${value.toRadixString(16)}", | 505 "Overlong encoding of 0x${value.toRadixString(16)}", |
505 codeUnits, i - extraUnits - 1); | 506 codeUnits, |
| 507 i - extraUnits - 1); |
506 } | 508 } |
507 expectedUnits = extraUnits = 0; | 509 expectedUnits = extraUnits = 0; |
508 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 510 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
509 } | 511 } |
510 if (value > _FOUR_BYTE_LIMIT) { | 512 if (value > _FOUR_BYTE_LIMIT) { |
511 if (!_allowMalformed) { | 513 if (!_allowMalformed) { |
512 throw new FormatException("Character outside valid Unicode range: " | 514 throw new FormatException( |
513 "0x${value.toRadixString(16)}", | 515 "Character outside valid Unicode range: " |
514 codeUnits, i - extraUnits - 1); | 516 "0x${value.toRadixString(16)}", |
| 517 codeUnits, |
| 518 i - extraUnits - 1); |
515 } | 519 } |
516 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 520 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
517 } | 521 } |
518 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { | 522 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { |
519 _stringSink.writeCharCode(value); | 523 _stringSink.writeCharCode(value); |
520 } | 524 } |
521 _isFirstCharacter = false; | 525 _isFirstCharacter = false; |
522 } | 526 } |
523 | 527 |
524 while (i < endIndex) { | 528 while (i < endIndex) { |
525 int oneBytes = scanOneByteCharacters(codeUnits, i); | 529 int oneBytes = scanOneByteCharacters(codeUnits, i); |
526 if (oneBytes > 0) { | 530 if (oneBytes > 0) { |
527 _isFirstCharacter = false; | 531 _isFirstCharacter = false; |
528 addSingleBytes(i, i + oneBytes); | 532 addSingleBytes(i, i + oneBytes); |
529 i += oneBytes; | 533 i += oneBytes; |
530 if (i == endIndex) break; | 534 if (i == endIndex) break; |
531 } | 535 } |
532 int unit = codeUnits[i++]; | 536 int unit = codeUnits[i++]; |
533 // TODO(floitsch): the way we test we could potentially allow | 537 // TODO(floitsch): the way we test we could potentially allow |
534 // units that are too large, if they happen to have the | 538 // units that are too large, if they happen to have the |
535 // right bit-pattern. (Same is true for the multibyte loop above). | 539 // right bit-pattern. (Same is true for the multibyte loop above). |
536 // TODO(floitsch): optimize this loop. See: | 540 // TODO(floitsch): optimize this loop. See: |
537 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 | 541 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 |
538 if (unit < 0) { | 542 if (unit < 0) { |
539 // TODO(floitsch): should this be unit <= 0 ? | 543 // TODO(floitsch): should this be unit <= 0 ? |
540 if (!_allowMalformed) { | 544 if (!_allowMalformed) { |
541 throw new FormatException( | 545 throw new FormatException( |
542 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}", | 546 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}", |
543 codeUnits, i - 1); | 547 codeUnits, |
| 548 i - 1); |
544 } | 549 } |
545 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | 550 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
546 } else { | 551 } else { |
547 assert(unit > _ONE_BYTE_LIMIT); | 552 assert(unit > _ONE_BYTE_LIMIT); |
548 if ((unit & 0xE0) == 0xC0) { | 553 if ((unit & 0xE0) == 0xC0) { |
549 value = unit & 0x1F; | 554 value = unit & 0x1F; |
550 expectedUnits = extraUnits = 1; | 555 expectedUnits = extraUnits = 1; |
551 continue loop; | 556 continue loop; |
552 } | 557 } |
553 if ((unit & 0xF0) == 0xE0) { | 558 if ((unit & 0xF0) == 0xE0) { |
554 value = unit & 0x0F; | 559 value = unit & 0x0F; |
555 expectedUnits = extraUnits = 2; | 560 expectedUnits = extraUnits = 2; |
556 continue loop; | 561 continue loop; |
557 } | 562 } |
558 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. | 563 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. |
559 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { | 564 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { |
560 value = unit & 0x07; | 565 value = unit & 0x07; |
561 expectedUnits = extraUnits = 3; | 566 expectedUnits = extraUnits = 3; |
562 continue loop; | 567 continue loop; |
563 } | 568 } |
564 if (!_allowMalformed) { | 569 if (!_allowMalformed) { |
565 throw new FormatException( | 570 throw new FormatException( |
566 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", | 571 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", |
567 codeUnits, i - 1); | 572 codeUnits, |
| 573 i - 1); |
568 } | 574 } |
569 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 575 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
570 expectedUnits = extraUnits = 0; | 576 expectedUnits = extraUnits = 0; |
571 _isFirstCharacter = false; | 577 _isFirstCharacter = false; |
572 _stringSink.writeCharCode(value); | 578 _stringSink.writeCharCode(value); |
573 } | 579 } |
574 } | 580 } |
575 break loop; | 581 break loop; |
576 } | 582 } |
577 if (expectedUnits > 0) { | 583 if (expectedUnits > 0) { |
578 _value = value; | 584 _value = value; |
579 _expectedUnits = expectedUnits; | 585 _expectedUnits = expectedUnits; |
580 _extraUnits = extraUnits; | 586 _extraUnits = extraUnits; |
581 } | 587 } |
582 } | 588 } |
583 } | 589 } |
OLD | NEW |