| OLD | NEW |
| 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 part of dart.convert; | 5 part of dart.convert; |
| 6 | 6 |
| 7 /** The Unicode Replacement character `U+FFFD` (�). */ | 7 /** The Unicode Replacement character `U+FFFD` (�). */ |
| 8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; | 8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; |
| 9 | 9 |
| 10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ | 10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ |
| (...skipping 24 matching lines...) Expand all Loading... |
| 35 * Instantiates a new [Utf8Codec]. | 35 * Instantiates a new [Utf8Codec]. |
| 36 * | 36 * |
| 37 * The optional [allowMalformed] argument defines how [decoder] (and [decode]) | 37 * The optional [allowMalformed] argument defines how [decoder] (and [decode]) |
| 38 * deal with invalid or unterminated character sequences. | 38 * deal with invalid or unterminated character sequences. |
| 39 * | 39 * |
| 40 * If it is `true` (and not overridden at the method invocation) [decode] and | 40 * If it is `true` (and not overridden at the method invocation) [decode] and |
| 41 * the [decoder] replace invalid (or unterminated) octet | 41 * the [decoder] replace invalid (or unterminated) octet |
| 42 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise | 42 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
| 43 * they throw a [FormatException]. | 43 * they throw a [FormatException]. |
| 44 */ | 44 */ |
| 45 const Utf8Codec({ bool allowMalformed: false }) | 45 const Utf8Codec({bool allowMalformed: false}) |
| 46 : _allowMalformed = allowMalformed; | 46 : _allowMalformed = allowMalformed; |
| 47 | 47 |
| 48 String get name => "utf-8"; | 48 String get name => "utf-8"; |
| 49 | 49 |
| 50 /** | 50 /** |
| 51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
| 52 * corresponding string. | 52 * corresponding string. |
| 53 * | 53 * |
| 54 * If the [codeUnits] start with the encoding of a | 54 * If the [codeUnits] start with the encoding of a |
| 55 * [UNICODE_BOM_CHARACTER_RUNE], that character is discarded. | 55 * [UNICODE_BOM_CHARACTER_RUNE], that character is discarded. |
| 56 * | 56 * |
| 57 * If [allowMalformed] is `true` the decoder replaces invalid (or | 57 * If [allowMalformed] is `true` the decoder replaces invalid (or |
| 58 * unterminated) character sequences with the Unicode Replacement character | 58 * unterminated) character sequences with the Unicode Replacement character |
| 59 * `U+FFFD` (�). Otherwise it throws a [FormatException]. | 59 * `U+FFFD` (�). Otherwise it throws a [FormatException]. |
| 60 * | 60 * |
| 61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that | 61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that |
| 62 * was used to instantiate `this`. | 62 * was used to instantiate `this`. |
| 63 */ | 63 */ |
| 64 String decode(List<int> codeUnits, { bool allowMalformed }) { | 64 String decode(List<int> codeUnits, {bool allowMalformed}) { |
| 65 if (allowMalformed == null) allowMalformed = _allowMalformed; | 65 if (allowMalformed == null) allowMalformed = _allowMalformed; |
| 66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); | 66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); |
| 67 } | 67 } |
| 68 | 68 |
| 69 Utf8Encoder get encoder => const Utf8Encoder(); | 69 Utf8Encoder get encoder => const Utf8Encoder(); |
| 70 Utf8Decoder get decoder { | 70 Utf8Decoder get decoder { |
| 71 return new Utf8Decoder(allowMalformed: _allowMalformed); | 71 return new Utf8Decoder(allowMalformed: _allowMalformed); |
| 72 } | 72 } |
| 73 } | 73 } |
| 74 | 74 |
| 75 /** | 75 /** |
| 76 * This class converts strings to their UTF-8 code units (a list of | 76 * This class converts strings to their UTF-8 code units (a list of |
| 77 * unsigned 8-bit integers). | 77 * unsigned 8-bit integers). |
| 78 */ | 78 */ |
| 79 class Utf8Encoder extends Converter<String, List<int>> | 79 class Utf8Encoder extends Converter<String, List<int>> |
| 80 implements ChunkedConverter<String, List<int>, String, List<int>> { | 80 implements ChunkedConverter<String, List<int>, String, List<int>> { |
| 81 | |
| 82 const Utf8Encoder(); | 81 const Utf8Encoder(); |
| 83 | 82 |
| 84 /** | 83 /** |
| 85 * Converts [string] to its UTF-8 code units (a list of | 84 * Converts [string] to its UTF-8 code units (a list of |
| 86 * unsigned 8-bit integers). | 85 * unsigned 8-bit integers). |
| 87 * | 86 * |
| 88 * If [start] and [end] are provided, only the substring | 87 * If [start] and [end] are provided, only the substring |
| 89 * `string.substring(start, end)` is converted. | 88 * `string.substring(start, end)` is converted. |
| 90 */ | 89 */ |
| 91 List<int> convert(String string, [int start = 0, int end]) { | 90 List<int> convert(String string, [int start = 0, int end]) { |
| (...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 233 } | 232 } |
| 234 return stringIndex; | 233 return stringIndex; |
| 235 } | 234 } |
| 236 } | 235 } |
| 237 | 236 |
| 238 /** | 237 /** |
| 239 * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit | 238 * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit |
| 240 * integers). | 239 * integers). |
| 241 */ | 240 */ |
| 242 class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin { | 241 class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin { |
| 243 | |
| 244 final ByteConversionSink _sink; | 242 final ByteConversionSink _sink; |
| 245 | 243 |
| 246 _Utf8EncoderSink(this._sink); | 244 _Utf8EncoderSink(this._sink); |
| 247 | 245 |
| 248 void close() { | 246 void close() { |
| 249 if (_carry != 0) { | 247 if (_carry != 0) { |
| 250 // addSlice will call close again, but then the carry must be equal to 0. | 248 // addSlice will call close again, but then the carry must be equal to 0. |
| 251 addSlice("", 0, 0, true); | 249 addSlice("", 0, 0, true); |
| 252 return; | 250 return; |
| 253 } | 251 } |
| 254 _sink.close(); | 252 _sink.close(); |
| 255 } | 253 } |
| 256 | 254 |
| 257 void addSlice(String str, int start, int end, bool isLast) { | 255 void addSlice(String str, int start, int end, bool isLast) { |
| 258 _bufferIndex = 0; | 256 _bufferIndex = 0; |
| 259 | 257 |
| 260 if (start == end && !isLast) { | 258 if (start == end && !isLast) { |
| 261 return; | 259 return; |
| 262 } | 260 } |
| 263 | 261 |
| 264 if (_carry != 0) { | 262 if (_carry != 0) { |
| 265 int nextCodeUnit = 0; | 263 int nextCodeUnit = 0; |
| 266 if (start != end) { | 264 if (start != end) { |
| 267 nextCodeUnit = str.codeUnitAt(start); | 265 nextCodeUnit = str.codeUnitAt(start); |
| 268 } else { | 266 } else { |
| 269 assert(isLast); | 267 assert(isLast); |
| 270 } | 268 } |
| 271 bool wasCombined = _writeSurrogate(_carry, nextCodeUnit); | 269 bool wasCombined = _writeSurrogate(_carry, nextCodeUnit); |
| 272 // Either we got a non-empty string, or we must not have been combined. | 270 // Either we got a non-empty string, or we must not have been combined. |
| 273 assert(!wasCombined || start != end ); | 271 assert(!wasCombined || start != end); |
| 274 if (wasCombined) start++; | 272 if (wasCombined) start++; |
| 275 _carry = 0; | 273 _carry = 0; |
| 276 } | 274 } |
| 277 do { | 275 do { |
| 278 start = _fillBuffer(str, start, end); | 276 start = _fillBuffer(str, start, end); |
| 279 bool isLastSlice = isLast && (start == end); | 277 bool isLastSlice = isLast && (start == end); |
| 280 if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) { | 278 if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) { |
| 281 if (isLast && _bufferIndex < _buffer.length - 3) { | 279 if (isLast && _bufferIndex < _buffer.length - 3) { |
| 282 // There is still space for the last incomplete surrogate. | 280 // There is still space for the last incomplete surrogate. |
| 283 // We use a non-surrogate as second argument. This way the | 281 // We use a non-surrogate as second argument. This way the |
| (...skipping 16 matching lines...) Expand all Loading... |
| 300 // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it | 298 // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it |
| 301 // needs to deal with malformed input. | 299 // needs to deal with malformed input. |
| 302 } | 300 } |
| 303 | 301 |
| 304 /** | 302 /** |
| 305 * This class converts UTF-8 code units (lists of unsigned 8-bit integers) | 303 * This class converts UTF-8 code units (lists of unsigned 8-bit integers) |
| 306 * to a string. | 304 * to a string. |
| 307 */ | 305 */ |
| 308 class Utf8Decoder extends Converter<List<int>, String> | 306 class Utf8Decoder extends Converter<List<int>, String> |
| 309 implements ChunkedConverter<List<int>, String, List<int>, String> { | 307 implements ChunkedConverter<List<int>, String, List<int>, String> { |
| 310 | |
| 311 final bool _allowMalformed; | 308 final bool _allowMalformed; |
| 312 | 309 |
| 313 /** | 310 /** |
| 314 * Instantiates a new [Utf8Decoder]. | 311 * Instantiates a new [Utf8Decoder]. |
| 315 * | 312 * |
| 316 * The optional [allowMalformed] argument defines how [convert] deals | 313 * The optional [allowMalformed] argument defines how [convert] deals |
| 317 * with invalid or unterminated character sequences. | 314 * with invalid or unterminated character sequences. |
| 318 * | 315 * |
| 319 * If it is `true` [convert] replaces invalid (or unterminated) character | 316 * If it is `true` [convert] replaces invalid (or unterminated) character |
| 320 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise | 317 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
| 321 * it throws a [FormatException]. | 318 * it throws a [FormatException]. |
| 322 */ | 319 */ |
| 323 const Utf8Decoder({ bool allowMalformed: false }) | 320 const Utf8Decoder({bool allowMalformed: false}) |
| 324 : this._allowMalformed = allowMalformed; | 321 : this._allowMalformed = allowMalformed; |
| 325 | 322 |
| 326 /** | 323 /** |
| 327 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 324 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
| 328 * corresponding string. | 325 * corresponding string. |
| 329 * | 326 * |
| 330 * Uses the code units from [start] to, but not including, [end]. | 327 * Uses the code units from [start] to, but not including, [end]. |
| 331 * If [end] is omitted, it defaults to `codeUnits.length`. | 328 * If [end] is omitted, it defaults to `codeUnits.length`. |
| 332 * | 329 * |
| 333 * If the [codeUnits] start with the encoding of a | 330 * If the [codeUnits] start with the encoding of a |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 370 // Override the base class's bind to provide a better type. | 367 // Override the base class's bind to provide a better type. |
| 371 Stream<String> bind(Stream<List<int>> stream) => super.bind(stream); | 368 Stream<String> bind(Stream<List<int>> stream) => super.bind(stream); |
| 372 | 369 |
| 373 external Converter<List<int>, T> fuse<T>(Converter<String, T> next); | 370 external Converter<List<int>, T> fuse<T>(Converter<String, T> next); |
| 374 | 371 |
| 375 external static String _convertIntercepted( | 372 external static String _convertIntercepted( |
| 376 bool allowMalformed, List<int> codeUnits, int start, int end); | 373 bool allowMalformed, List<int> codeUnits, int start, int end); |
| 377 } | 374 } |
| 378 | 375 |
| 379 // UTF-8 constants. | 376 // UTF-8 constants. |
| 380 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits | 377 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits |
| 381 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits | 378 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits |
| 382 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits | 379 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
| 383 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. | 380 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
| 384 | 381 |
| 385 // UTF-16 constants. | 382 // UTF-16 constants. |
| 386 const int _SURROGATE_MASK = 0xF800; | 383 const int _SURROGATE_MASK = 0xF800; |
| 387 const int _SURROGATE_TAG_MASK = 0xFC00; | 384 const int _SURROGATE_TAG_MASK = 0xFC00; |
| 388 const int _SURROGATE_VALUE_MASK = 0x3FF; | 385 const int _SURROGATE_VALUE_MASK = 0x3FF; |
| 389 const int _LEAD_SURROGATE_MIN = 0xD800; | 386 const int _LEAD_SURROGATE_MIN = 0xD800; |
| 390 const int _TAIL_SURROGATE_MIN = 0xDC00; | 387 const int _TAIL_SURROGATE_MIN = 0xDC00; |
| 391 | 388 |
| 392 bool _isLeadSurrogate(int codeUnit) => | 389 bool _isLeadSurrogate(int codeUnit) => |
| 393 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; | 390 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; |
| 394 bool _isTailSurrogate(int codeUnit) => | 391 bool _isTailSurrogate(int codeUnit) => |
| 395 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; | 392 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; |
| 396 int _combineSurrogatePair(int lead, int tail) => | 393 int _combineSurrogatePair(int lead, int tail) => |
| 397 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) | 394 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) | |
| 398 | (tail & _SURROGATE_VALUE_MASK); | 395 (tail & _SURROGATE_VALUE_MASK); |
| 399 | 396 |
| 400 /** | 397 /** |
| 401 * Decodes UTF-8. | 398 * Decodes UTF-8. |
| 402 * | 399 * |
| 403 * The decoder handles chunked input. | 400 * The decoder handles chunked input. |
| 404 */ | 401 */ |
| 405 // TODO(floitsch): make this class public. | 402 // TODO(floitsch): make this class public. |
| 406 class _Utf8Decoder { | 403 class _Utf8Decoder { |
| 407 final bool _allowMalformed; | 404 final bool _allowMalformed; |
| 408 final StringSink _stringSink; | 405 final StringSink _stringSink; |
| 409 bool _isFirstCharacter = true; | 406 bool _isFirstCharacter = true; |
| 410 int _value = 0; | 407 int _value = 0; |
| 411 int _expectedUnits = 0; | 408 int _expectedUnits = 0; |
| 412 int _extraUnits = 0; | 409 int _extraUnits = 0; |
| 413 | 410 |
| 414 _Utf8Decoder(this._stringSink, this._allowMalformed); | 411 _Utf8Decoder(this._stringSink, this._allowMalformed); |
| 415 | 412 |
| 416 bool get hasPartialInput => _expectedUnits > 0; | 413 bool get hasPartialInput => _expectedUnits > 0; |
| 417 | 414 |
| 418 // Limits of one through four byte encodings. | 415 // Limits of one through four byte encodings. |
| 419 static const List<int> _LIMITS = const <int>[ | 416 static const List<int> _LIMITS = const <int>[ |
| 420 _ONE_BYTE_LIMIT, | 417 _ONE_BYTE_LIMIT, |
| 421 _TWO_BYTE_LIMIT, | 418 _TWO_BYTE_LIMIT, |
| 422 _THREE_BYTE_LIMIT, | 419 _THREE_BYTE_LIMIT, |
| 423 _FOUR_BYTE_LIMIT ]; | 420 _FOUR_BYTE_LIMIT |
| 421 ]; |
| 424 | 422 |
| 425 void close() { | 423 void close() { |
| 426 flush(); | 424 flush(); |
| 427 } | 425 } |
| 428 | 426 |
| 429 /** | 427 /** |
| 430 * Flushes this decoder as if closed. | 428 * Flushes this decoder as if closed. |
| 431 * | 429 * |
| 432 * This method throws if the input was partial and the decoder was | 430 * This method throws if the input was partial and the decoder was |
| 433 * constructed with `allowMalformed` set to `false`. | 431 * constructed with `allowMalformed` set to `false`. |
| 434 * | 432 * |
| 435 * The [source] and [offset] of the current position may be provided, | 433 * The [source] and [offset] of the current position may be provided, |
| 436 * and are included in the exception if one is thrown. | 434 * and are included in the exception if one is thrown. |
| 437 */ | 435 */ |
| 438 void flush([List<int> source, int offset]) { | 436 void flush([List<int> source, int offset]) { |
| 439 if (hasPartialInput) { | 437 if (hasPartialInput) { |
| 440 if (!_allowMalformed) { | 438 if (!_allowMalformed) { |
| 441 throw new FormatException("Unfinished UTF-8 octet sequence", | 439 throw new FormatException( |
| 442 source, offset); | 440 "Unfinished UTF-8 octet sequence", source, offset); |
| 443 } | 441 } |
| 444 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | 442 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
| 445 _value = 0; | 443 _value = 0; |
| 446 _expectedUnits = 0; | 444 _expectedUnits = 0; |
| 447 _extraUnits = 0; | 445 _extraUnits = 0; |
| 448 } | 446 } |
| 449 } | 447 } |
| 450 | 448 |
| 451 void convert(List<int> codeUnits, int startIndex, int endIndex) { | 449 void convert(List<int> codeUnits, int startIndex, int endIndex) { |
| 452 int value = _value; | 450 int value = _value; |
| (...skipping 13 matching lines...) Expand all Loading... |
| 466 return to - from; | 464 return to - from; |
| 467 } | 465 } |
| 468 | 466 |
| 469 void addSingleBytes(int from, int to) { | 467 void addSingleBytes(int from, int to) { |
| 470 assert(from >= startIndex && from <= endIndex); | 468 assert(from >= startIndex && from <= endIndex); |
| 471 assert(to >= startIndex && to <= endIndex); | 469 assert(to >= startIndex && to <= endIndex); |
| 472 _stringSink.write(new String.fromCharCodes(codeUnits, from, to)); | 470 _stringSink.write(new String.fromCharCodes(codeUnits, from, to)); |
| 473 } | 471 } |
| 474 | 472 |
| 475 int i = startIndex; | 473 int i = startIndex; |
| 476 loop: while (true) { | 474 loop: |
| 477 multibyte: if (expectedUnits > 0) { | 475 while (true) { |
| 476 multibyte: |
| 477 if (expectedUnits > 0) { |
| 478 do { | 478 do { |
| 479 if (i == endIndex) { | 479 if (i == endIndex) { |
| 480 break loop; | 480 break loop; |
| 481 } | 481 } |
| 482 int unit = codeUnits[i]; | 482 int unit = codeUnits[i]; |
| 483 if ((unit & 0xC0) != 0x80) { | 483 if ((unit & 0xC0) != 0x80) { |
| 484 expectedUnits = 0; | 484 expectedUnits = 0; |
| 485 if (!_allowMalformed) { | 485 if (!_allowMalformed) { |
| 486 throw new FormatException( | 486 throw new FormatException( |
| 487 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", | 487 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", |
| 488 codeUnits, i); | 488 codeUnits, |
| 489 i); |
| 489 } | 490 } |
| 490 _isFirstCharacter = false; | 491 _isFirstCharacter = false; |
| 491 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | 492 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
| 492 break multibyte; | 493 break multibyte; |
| 493 } else { | 494 } else { |
| 494 value = (value << 6) | (unit & 0x3f); | 495 value = (value << 6) | (unit & 0x3f); |
| 495 expectedUnits--; | 496 expectedUnits--; |
| 496 i++; | 497 i++; |
| 497 } | 498 } |
| 498 } while (expectedUnits > 0); | 499 } while (expectedUnits > 0); |
| 499 if (value <= _LIMITS[extraUnits - 1]) { | 500 if (value <= _LIMITS[extraUnits - 1]) { |
| 500 // Overly long encoding. The value could be encoded with a shorter | 501 // Overly long encoding. The value could be encoded with a shorter |
| 501 // encoding. | 502 // encoding. |
| 502 if (!_allowMalformed) { | 503 if (!_allowMalformed) { |
| 503 throw new FormatException( | 504 throw new FormatException( |
| 504 "Overlong encoding of 0x${value.toRadixString(16)}", | 505 "Overlong encoding of 0x${value.toRadixString(16)}", |
| 505 codeUnits, i - extraUnits - 1); | 506 codeUnits, |
| 507 i - extraUnits - 1); |
| 506 } | 508 } |
| 507 expectedUnits = extraUnits = 0; | 509 expectedUnits = extraUnits = 0; |
| 508 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 510 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
| 509 } | 511 } |
| 510 if (value > _FOUR_BYTE_LIMIT) { | 512 if (value > _FOUR_BYTE_LIMIT) { |
| 511 if (!_allowMalformed) { | 513 if (!_allowMalformed) { |
| 512 throw new FormatException("Character outside valid Unicode range: " | 514 throw new FormatException( |
| 513 "0x${value.toRadixString(16)}", | 515 "Character outside valid Unicode range: " |
| 514 codeUnits, i - extraUnits - 1); | 516 "0x${value.toRadixString(16)}", |
| 517 codeUnits, |
| 518 i - extraUnits - 1); |
| 515 } | 519 } |
| 516 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 520 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
| 517 } | 521 } |
| 518 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { | 522 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { |
| 519 _stringSink.writeCharCode(value); | 523 _stringSink.writeCharCode(value); |
| 520 } | 524 } |
| 521 _isFirstCharacter = false; | 525 _isFirstCharacter = false; |
| 522 } | 526 } |
| 523 | 527 |
| 524 while (i < endIndex) { | 528 while (i < endIndex) { |
| 525 int oneBytes = scanOneByteCharacters(codeUnits, i); | 529 int oneBytes = scanOneByteCharacters(codeUnits, i); |
| 526 if (oneBytes > 0) { | 530 if (oneBytes > 0) { |
| 527 _isFirstCharacter = false; | 531 _isFirstCharacter = false; |
| 528 addSingleBytes(i, i + oneBytes); | 532 addSingleBytes(i, i + oneBytes); |
| 529 i += oneBytes; | 533 i += oneBytes; |
| 530 if (i == endIndex) break; | 534 if (i == endIndex) break; |
| 531 } | 535 } |
| 532 int unit = codeUnits[i++]; | 536 int unit = codeUnits[i++]; |
| 533 // TODO(floitsch): the way we test we could potentially allow | 537 // TODO(floitsch): the way we test we could potentially allow |
| 534 // units that are too large, if they happen to have the | 538 // units that are too large, if they happen to have the |
| 535 // right bit-pattern. (Same is true for the multibyte loop above). | 539 // right bit-pattern. (Same is true for the multibyte loop above). |
| 536 // TODO(floitsch): optimize this loop. See: | 540 // TODO(floitsch): optimize this loop. See: |
| 537 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 | 541 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 |
| 538 if (unit < 0) { | 542 if (unit < 0) { |
| 539 // TODO(floitsch): should this be unit <= 0 ? | 543 // TODO(floitsch): should this be unit <= 0 ? |
| 540 if (!_allowMalformed) { | 544 if (!_allowMalformed) { |
| 541 throw new FormatException( | 545 throw new FormatException( |
| 542 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}", | 546 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}", |
| 543 codeUnits, i - 1); | 547 codeUnits, |
| 548 i - 1); |
| 544 } | 549 } |
| 545 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | 550 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
| 546 } else { | 551 } else { |
| 547 assert(unit > _ONE_BYTE_LIMIT); | 552 assert(unit > _ONE_BYTE_LIMIT); |
| 548 if ((unit & 0xE0) == 0xC0) { | 553 if ((unit & 0xE0) == 0xC0) { |
| 549 value = unit & 0x1F; | 554 value = unit & 0x1F; |
| 550 expectedUnits = extraUnits = 1; | 555 expectedUnits = extraUnits = 1; |
| 551 continue loop; | 556 continue loop; |
| 552 } | 557 } |
| 553 if ((unit & 0xF0) == 0xE0) { | 558 if ((unit & 0xF0) == 0xE0) { |
| 554 value = unit & 0x0F; | 559 value = unit & 0x0F; |
| 555 expectedUnits = extraUnits = 2; | 560 expectedUnits = extraUnits = 2; |
| 556 continue loop; | 561 continue loop; |
| 557 } | 562 } |
| 558 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. | 563 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. |
| 559 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { | 564 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { |
| 560 value = unit & 0x07; | 565 value = unit & 0x07; |
| 561 expectedUnits = extraUnits = 3; | 566 expectedUnits = extraUnits = 3; |
| 562 continue loop; | 567 continue loop; |
| 563 } | 568 } |
| 564 if (!_allowMalformed) { | 569 if (!_allowMalformed) { |
| 565 throw new FormatException( | 570 throw new FormatException( |
| 566 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", | 571 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}", |
| 567 codeUnits, i - 1); | 572 codeUnits, |
| 573 i - 1); |
| 568 } | 574 } |
| 569 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 575 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
| 570 expectedUnits = extraUnits = 0; | 576 expectedUnits = extraUnits = 0; |
| 571 _isFirstCharacter = false; | 577 _isFirstCharacter = false; |
| 572 _stringSink.writeCharCode(value); | 578 _stringSink.writeCharCode(value); |
| 573 } | 579 } |
| 574 } | 580 } |
| 575 break loop; | 581 break loop; |
| 576 } | 582 } |
| 577 if (expectedUnits > 0) { | 583 if (expectedUnits > 0) { |
| 578 _value = value; | 584 _value = value; |
| 579 _expectedUnits = expectedUnits; | 585 _expectedUnits = expectedUnits; |
| 580 _extraUnits = extraUnits; | 586 _extraUnits = extraUnits; |
| 581 } | 587 } |
| 582 } | 588 } |
| 583 } | 589 } |
| OLD | NEW |