| Index: sdk/lib/convert/utf.dart
|
| diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart
|
| index 09692f0ec87b6102aead2cd6037b8a2988fc51f9..b7fc365265b3f99770a965a424871340b5f07665 100644
|
| --- a/sdk/lib/convert/utf.dart
|
| +++ b/sdk/lib/convert/utf.dart
|
| @@ -21,11 +21,187 @@ class Utf8Encoder extends Converter<String, List<int>> {
|
| * to a string.
|
| */
|
| class Utf8Decoder extends Converter<List<int>, String> {
|
| + // Whether malformed input is replaced with U+FFFD instead of causing a
|
| + // [FormatException]; handed to the internal decoder by [convert].
|
| + final bool _allowMalformed;
|
| +
|
| + /**
|
| + * Instantiates a new [Utf8Decoder].
|
| + *
|
| + * The optional [allowMalformed] argument defines how [convert] deals
|
| + * with invalid or unterminated character sequences.
|
| + *
|
| + * If it is `true` [convert] replaces invalid (or unterminated) character
|
| + * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
|
| + * it throws a [FormatException].
|
| + */
|
| + Utf8Decoder({ bool allowMalformed: false })
|
| + : this._allowMalformed = allowMalformed;
|
| +
|
| /**
|
| * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
|
| * corresponding string.
|
| */
|
| - // TODO(floitsch): allow to configure the decoder (for example the replacement
|
| - // character).
|
| - String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits);
|
| + String convert(List<int> codeUnits) {
|
| + // A fresh decoder is created for every call, so no decoding state is
|
| + // carried over from one call to the next.
|
| + StringBuffer buffer = new StringBuffer();
|
| + _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);
|
| + decoder.convert(codeUnits, 0, codeUnits.length, buffer);
|
| + // Closing deals with a multi-byte sequence cut off at the end of the input.
|
| + decoder.close(buffer);
|
| + return buffer.toString();
|
| + }
|
| +}
|
| +
|
| +// UTF-8 constants.
|
| +// Largest value that fits in the payload bits of each encoding length.
|
| +const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
|
| +const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
|
| +const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
|
| +const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.
|
| +
|
| +// UTF-16 constants.
|
| +// A code unit is a surrogate if masking it with _SURROGATE_MASK gives
|
| +// _LEAD_SURROGATE_MIN; _SURROGATE_TAG_MASK tells lead from tail surrogates.
|
| +const int _SURROGATE_MASK = 0xF800;
|
| +const int _SURROGATE_TAG_MASK = 0xFC00;
|
| +// The low ten bits of a surrogate, which carry code point data.
|
| +const int _SURROGATE_VALUE_MASK = 0x3FF;
|
| +const int _LEAD_SURROGATE_MIN = 0xD800;
|
| +const int _TAIL_SURROGATE_MIN = 0xDC00;
|
| +
|
| +// U+FFFD, written in place of malformed input when that is allowed.
|
| +const int _REPLACEMENT_CHARACTER = 0xFFFD;
|
| +// U+FEFF, the byte order mark.
|
| +const int _BOM_CHARACTER = 0xFEFF;
|
| +
|
| +// Whether [codeUnit] is a UTF-16 surrogate, lead or tail: the bits selected
|
| +// by _SURROGATE_MASK must equal _LEAD_SURROGATE_MIN.
|
| +bool _isSurrogate(int codeUnit) {
|
| +  int maskedBits = codeUnit & _SURROGATE_MASK;
|
| +  return maskedBits == _LEAD_SURROGATE_MIN;
|
| +}
|
| +// Whether [codeUnit] is a lead surrogate: the bits selected by
|
| +// _SURROGATE_TAG_MASK must equal _LEAD_SURROGATE_MIN.
|
| +bool _isLeadSurrogate(int codeUnit) {
|
| +  int tagBits = codeUnit & _SURROGATE_TAG_MASK;
|
| +  return tagBits == _LEAD_SURROGATE_MIN;
|
| +}
|
| +// Whether [codeUnit] is a tail surrogate: the bits selected by
|
| +// _SURROGATE_TAG_MASK must equal _TAIL_SURROGATE_MIN.
|
| +bool _isTailSurrogate(int codeUnit) {
|
| +  int tagBits = codeUnit & _SURROGATE_TAG_MASK;
|
| +  return tagBits == _TAIL_SURROGATE_MIN;
|
| +}
|
| +// Combines the surrogate pair [lead], [tail] into the code point it encodes.
|
| +//
|
| +// The ten payload bits of [lead] and the ten payload bits of [tail] form a
|
| +// 20-bit offset from 0x10000. The base has to be added, not OR-ed: the
|
| +// shifted lead payload covers bits 10 to 19 and can itself have bit 16 set
|
| +// (lead 0xD840 with tail 0xDC00 is 0x20000), which an OR would lose.
|
| +int _combineSurrogatePair(int lead, int tail) =>
|
| +    0x10000 + (((lead & _SURROGATE_VALUE_MASK) << 10)
|
| +        | (tail & _SURROGATE_VALUE_MASK));
|
| +
|
| +
|
| +/**
|
| + * Decodes UTF-8.
|
| + *
|
| + * The decoder handles chunked input: a multi-byte sequence may be split over
|
| + * several [convert] calls, and [close] must be called after the last chunk.
|
| + *
|
| + * Malformed input either throws a [FormatException] or, if the decoder was
|
| + * created to allow malformed input, is replaced by U+FFFD. A BOM (U+FEFF)
|
| + * that is the very first decoded character is not written to the sink.
|
| + */
|
| +// TODO(floitsch): make this class public.
|
| +class _Utf8Decoder {
|
| +  // Whether malformed input is replaced by U+FFFD instead of throwing.
|
| +  final bool _allowMalformed;
|
| +  // True until the first character has been decoded (or replaced).
|
| +  bool _isFirstCharacter = true;
|
| +  // State of a multi-byte sequence cut off at the end of a chunk: the payload
|
| +  // bits read so far, the number of continuation bytes still missing, and the
|
| +  // total number of continuation bytes of the sequence.
|
| +  int _value = 0;
|
| +  int _expectedUnits = 0;
|
| +  int _extraUnits = 0;
|
| +
|
| +  _Utf8Decoder(this._allowMalformed);
|
| +
|
| +  // Whether the last chunk ended in the middle of a multi-byte sequence.
|
| +  bool get hasPartialInput => _expectedUnits > 0;
|
| +
|
| +  // Limits of one through four byte encodings.
|
| +  static const List<int> _LIMITS = const <int>[
|
| +      _ONE_BYTE_LIMIT,
|
| +      _TWO_BYTE_LIMIT,
|
| +      _THREE_BYTE_LIMIT,
|
| +      _FOUR_BYTE_LIMIT ];
|
| +
|
| +  /**
|
| +   * Finishes decoding after the last chunk.
|
| +   *
|
| +   * If a multi-byte sequence is still unfinished, throws a [FormatException],
|
| +   * or, when malformed input is allowed, writes U+FFFD to [sink] and discards
|
| +   * the unfinished sequence.
|
| +   */
|
| +  void close(StringSink sink) {
|
| +    if (hasPartialInput) {
|
| +      if (!_allowMalformed) {
|
| +        throw new FormatException("Unfinished UTF-8 octet sequence");
|
| +      }
|
| +      sink.writeCharCode(_REPLACEMENT_CHARACTER);
|
| +      // The unfinished sequence has been reported; forget it so that a later
|
| +      // call to [convert] or [close] does not see it as still pending.
|
| +      _value = 0;
|
| +      _expectedUnits = 0;
|
| +      _extraUnits = 0;
|
| +    }
|
| +  }
|
| +
|
| +  /**
|
| +   * Decodes the bytes of [codeUnits] from [startIndex] (inclusive) to
|
| +   * [endIndex] (exclusive) and writes the resulting characters to [sink].
|
| +   *
|
| +   * A sequence left unfinished by a previous call is continued first. A
|
| +   * sequence cut off at [endIndex] is remembered for the next call.
|
| +   *
|
| +   * Throws a [FormatException] on malformed input, unless malformed input is
|
| +   * allowed, in which case U+FFFD is written instead.
|
| +   */
|
| +  void convert(List<int> codeUnits, int startIndex, int endIndex,
|
| +               StringSink sink) {
|
| +    // Continue from the saved state using locals. The fields are cleared here
|
| +    // and only written back at the end if a sequence is still unfinished.
|
| +    int value = _value;
|
| +    int expectedUnits = _expectedUnits;
|
| +    int extraUnits = _extraUnits;
|
| +    _value = 0;
|
| +    _expectedUnits = 0;
|
| +    _extraUnits = 0;
|
| +
|
| +    int i = startIndex;
|
| +    loop: while (true) {
|
| +      // Read the continuation bytes of a sequence whose lead byte has already
|
| +      // been seen, either in this call or in a previous one.
|
| +      multibyte: if (expectedUnits > 0) {
|
| +        do {
|
| +          if (i == endIndex) {
|
| +            // Out of input in mid-sequence: the state is saved below.
|
| +            break loop;
|
| +          }
|
| +          int unit = codeUnits[i];
|
| +          if ((unit & 0xC0) != 0x80) {
|
| +            // Not a continuation byte (10xxxxxx).
|
| +            expectedUnits = 0;
|
| +            if (!_allowMalformed) {
|
| +              throw new FormatException(
|
| +                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
|
| +            }
|
| +            _isFirstCharacter = false;
|
| +            sink.writeCharCode(_REPLACEMENT_CHARACTER);
|
| +            // i is not advanced: the offending byte is looked at again by the
|
| +            // loop below, as the possible start of a new character.
|
| +            break multibyte;
|
| +          } else {
|
| +            value = (value << 6) | (unit & 0x3f);
|
| +            expectedUnits--;
|
| +            i++;
|
| +          }
|
| +        } while (expectedUnits > 0);
|
| +        // _LIMITS[extraUnits - 1] is the largest value of the encoding that is
|
| +        // one byte shorter than the one just read.
|
| +        if (value <= _LIMITS[extraUnits - 1]) {
|
| +          // Overly long encoding. The value could be encoded with a shorter
|
| +          // encoding.
|
| +          if (!_allowMalformed) {
|
| +            throw new FormatException(
|
| +                "Overlong encoding of 0x${value.toRadixString(16)}");
|
| +          }
|
| +          expectedUnits = extraUnits = 0;
|
| +          value = _REPLACEMENT_CHARACTER;
|
| +        }
|
| +        if (value > _FOUR_BYTE_LIMIT) {
|
| +          if (!_allowMalformed) {
|
| +            throw new FormatException("Character outside valid Unicode range: "
|
| +                                      "0x${value.toRadixString(16)}");
|
| +          }
|
| +          value = _REPLACEMENT_CHARACTER;
|
| +        }
|
| +        // NOTE(review): three-byte encodings of surrogate code points
|
| +        // (U+D800..U+DFFF) pass both checks above and are written as they
|
| +        // are; confirm that this is intended.
|
| +        // A BOM is dropped only if it is the very first character.
|
| +        if (!_isFirstCharacter || value != _BOM_CHARACTER) {
|
| +          sink.writeCharCode(value);
|
| +        }
|
| +        _isFirstCharacter = false;
|
| +      }
|
| +
|
| +      // Read single-byte characters until a lead byte or the end of input.
|
| +      while (i < endIndex) {
|
| +        int unit = codeUnits[i++];
|
| +        if (unit <= _ONE_BYTE_LIMIT) {
|
| +          _isFirstCharacter = false;
|
| +          sink.writeCharCode(unit);
|
| +        } else {
|
| +          // Lead byte 110xxxxx: one continuation byte follows.
|
| +          if ((unit & 0xE0) == 0xC0) {
|
| +            value = unit & 0x1F;
|
| +            expectedUnits = extraUnits = 1;
|
| +            continue loop;
|
| +          }
|
| +          // Lead byte 1110xxxx: two continuation bytes follow.
|
| +          if ((unit & 0xF0) == 0xE0) {
|
| +            value = unit & 0x0F;
|
| +            expectedUnits = extraUnits = 2;
|
| +            continue loop;
|
| +          }
|
| +          // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
|
| +          if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
|
| +            value = unit & 0x07;
|
| +            expectedUnits = extraUnits = 3;
|
| +            continue loop;
|
| +          }
|
| +          // Anything else cannot start a character, for example a
|
| +          // continuation byte without a lead byte.
|
| +          if (!_allowMalformed) {
|
| +            throw new FormatException(
|
| +                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
|
| +          }
|
| +          value = _REPLACEMENT_CHARACTER;
|
| +          expectedUnits = extraUnits = 0;
|
| +          _isFirstCharacter = false;
|
| +          sink.writeCharCode(value);
|
| +        }
|
| +      }
|
| +      break loop;
|
| +    }
|
| +    // Remember an unfinished sequence for the next chunk, or for [close].
|
| +    if (expectedUnits > 0) {
|
| +      _value = value;
|
| +      _expectedUnits = expectedUnits;
|
| +      _extraUnits = extraUnits;
|
| +    }
|
| +  }
|
| }
|
|
|