Chromium Code Reviews| Index: sdk/lib/convert/utf.dart |
| diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart |
| index 09692f0ec87b6102aead2cd6037b8a2988fc51f9..451bf95063c57463aa8a66c45f880afd1df79513 100644 |
| --- a/sdk/lib/convert/utf.dart |
| +++ b/sdk/lib/convert/utf.dart |
| @@ -21,11 +21,167 @@ class Utf8Encoder extends Converter<String, List<int>> { |
| * to a string. |
| */ |
| class Utf8Decoder extends Converter<List<int>, String> { |
| + final bool _allowMalformed; |
| + |
| + /** |
| + * Instantiates a new [Utf8Decoder]. |
| + * |
| + * The optional [allowMalformed] argument defines how [convert] deals |
| + * with invalid or unterminated character sequences. |
| + * |
| + * If it is `true`, [convert] replaces invalid (or unterminated) character |
| + * sequences with the Unicode Replacement Character `U+FFFD` (�). Otherwise |
|
Lasse Reichstein Nielsen
2013/07/16 12:23:03
U+FFFD
floitsch
2013/07/16 14:25:24
Done.
|
| + * it throws a [FormatException]. |
| + */ |
| + Utf8Decoder({bool allowMalformed: false}) |
| +     : _allowMalformed = allowMalformed; |
| + |
| /** |
| * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
| * corresponding string. |
| */ |
| - // TODO(floitsch): allow to configure the decoder (for example the replacement |
| - // character). |
| - String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits); |
| + String convert(List<int> codeUnits) { |
| +   // Decode the whole list as one chunk, then close the decoder so that a |
| +   // sequence cut off at the end of the list is reported or replaced. |
| +   var result = new StringBuffer(); |
| +   new _Utf8Decoder(_allowMalformed) |
| +     ..convert(codeUnits, 0, codeUnits.length, result) |
| +     ..close(result); |
| +   return result.toString(); |
| + } |
| +} |
| + |
| +// UTF-8 constants: the largest value each encoding length can represent. |
| +const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits |
| +const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits |
| +const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
| +const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
| + |
| +// UTF-16 constants: masks and range starts for surrogate code units. |
| +const int _SURROGATE_MASK = 0xF800; |
| +const int _SURROGATE_TAG_MASK = 0xFC00; |
| +const int _SURROGATE_VALUE_MASK = 0x3FF; |
| +const int _LEAD_SURROGATE_MIN = 0xD800; |
| +const int _TAIL_SURROGATE_MIN = 0xDC00; |
| + |
| +// U+FFFD, written in place of malformed input when that is allowed. |
| +const int _REPLACEMENT_CHARACTER = 0xFFFD; |
| + |
| +/** |
| + * Whether [codeUnit] is a UTF-16 surrogate, lead or tail. |
| + * |
| + * Only the bits selected by [_SURROGATE_MASK] are compared. |
| + */ |
| +bool _isSurrogate(int codeUnit) { |
| +  return _LEAD_SURROGATE_MIN == (codeUnit & _SURROGATE_MASK); |
| +} |
| +/** |
| + * Whether [codeUnit] carries the lead (first) surrogate tag. |
| + */ |
| +bool _isLeadSurrogate(int codeUnit) { |
| +  return _LEAD_SURROGATE_MIN == (codeUnit & _SURROGATE_TAG_MASK); |
| +} |
| +/** |
| + * Whether [codeUnit] carries the tail (second) surrogate tag. |
| + */ |
| +bool _isTailSurrogate(int codeUnit) { |
| +  return _TAIL_SURROGATE_MIN == (codeUnit & _SURROGATE_TAG_MASK); |
| +} |
| +/** |
| + * Combines a [lead] and a [tail] surrogate into the code point they encode. |
| + * |
| + * The ten payload bits of [lead] are shifted into bits 10 through 19, so |
| + * they can overlap bit 16. The 0x10000 offset therefore has to be added; |
| + * or-ing it in would drop the carry (0xD840 0xDC00 must give 0x20000, not |
| + * 0x10000). |
| + */ |
| +int _combineSurrogatePair(int lead, int tail) => |
| +    0x10000 + (((lead & _SURROGATE_VALUE_MASK) << 10) |
| +               | (tail & _SURROGATE_VALUE_MASK)); |
| + |
| + |
| +/** |
| + * Decodes UTF-8. |
|
Lasse Reichstein Nielsen
2013/07/16 12:23:03
UTF-8.
floitsch
2013/07/16 14:25:24
Done.
|
| + * |
| + * The decoder handles chunked input. |
| + */ |
| +// TODO(floitsch): do we want to make this class public? |
|
Lasse Reichstein Nielsen
2013/07/16 12:23:03
Sure, why not?
floitsch
2013/07/16 14:25:24
Later. But rephrased TODO.
|
| +class _Utf8Decoder { |
| + // Whether malformed input is tolerated (replaced by the replacement |
| + // character) instead of being reported with a [FormatException]. |
| + final bool _allowMalformed; |
| + // State of a multi-byte sequence that is split across [convert] calls: |
| + // the bits gathered so far, the number of continuation bytes still |
| + // missing, and the total number of continuation bytes of the sequence. |
| + int _value = 0; |
| + int _expectedUnits = 0; |
| + int _extraUnits = 0; |
| + |
| + _Utf8Decoder(this._allowMalformed); |
| + |
| + // Whether a multi-byte sequence has been started but not yet completed. |
| + bool get hasPartialInput => _expectedUnits > 0; |
| + |
| + // Limits of one through four byte encodings. |
| + // `_LIMITS[n - 1]` is the largest value an n-byte sequence can encode; |
| + // [convert] indexes it with the number of continuation bytes to detect |
| + // overlong encodings. |
| + static const List<int> _LIMITS = const <int>[ |
| + _ONE_BYTE_LIMIT, |
| + _TWO_BYTE_LIMIT, |
| + _THREE_BYTE_LIMIT, |
| + _FOUR_BYTE_LIMIT ]; |
| + |
| + /** |
| +  * Signals the end of the input. |
| +  * |
| +  * If a multi-byte sequence is still incomplete, throws a [FormatException] |
| +  * unless malformed input is allowed; in that case the replacement |
| +  * character is written to [sink] instead. |
| +  */ |
| + void close(StringSink sink) { |
| +   if (hasPartialInput) { |
| +     _throwIfNecessary("Unfinished UTF-8 encoding"); |
| +     sink.writeCharCode(_REPLACEMENT_CHARACTER); |
| +     // Forget the abandoned sequence so that it is reported only once and |
| +     // cannot be continued by a later call to [convert]. |
| +     _value = 0; |
| +     _expectedUnits = 0; |
| +     _extraUnits = 0; |
| +   } |
| + } |
| + |
| + /** |
| +  * Decodes the bytes of [codeUnits] from [startIndex] (inclusive) to |
| +  * [endIndex] (exclusive) and writes the resulting characters to [sink]. |
| +  * |
| +  * A multi-byte sequence that is cut off at [endIndex] is remembered and |
| +  * continued by the next call; this is what makes chunked input work. |
| +  * |
| +  * Bad lead or continuation bytes, overlong encodings and values above |
| +  * U+10FFFF throw a [FormatException], or are replaced by U+FFFD when |
| +  * malformed input is allowed. |
| +  */ |
| + void convert(List<int> codeUnits, int startIndex, int endIndex, |
| +              StringSink sink) { |
| +   // Work on local copies. The fields are cleared here and written back at |
| +   // the end only if a sequence is still in progress. |
| +   int value = _value; |
| +   int expectedUnits = _expectedUnits; |
| +   int extraUnits = _extraUnits; |
| +   _value = 0; |
| +   _expectedUnits = 0; |
| +   _extraUnits = 0; |
| + |
| +   int i = startIndex; |
| +   loop: while (true) { |
| +     // Finish the multi-byte sequence in progress, if there is one. |
| +     multibyte: if (expectedUnits > 0) { |
| +       do { |
| +         if (i == endIndex) { |
| +           // Out of input in the middle of a sequence. |
| +           break loop; |
| +         } |
| +         int unit = codeUnits[i]; |
| +         if ((unit & 0xC0) != 0x80) { |
| +           // Not a continuation byte. Drop the sequence and leave `i` in |
| +           // place so that this byte is decoded again as a fresh start. |
| +           expectedUnits = 0; |
| +           // The message is only built when it is actually thrown. |
| +           if (!_allowMalformed) { |
| +             throw new FormatException( |
| +                 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
| +           } |
| +           sink.writeCharCode(_REPLACEMENT_CHARACTER); |
| +           break multibyte; |
| +         } else { |
| +           value = (value << 6) | (unit & 0x3f); |
| +           expectedUnits--; |
| +           i++; |
| +         } |
| +       } while (expectedUnits > 0); |
| +       if (value <= _LIMITS[extraUnits - 1]) { |
| +         // Overly long encoding. The value could be encoded with a shorter |
| +         // encoding. |
| +         if (!_allowMalformed) { |
| +           throw new FormatException( |
| +               "Overlong encoding of 0x${value.toRadixString(16)}"); |
| +         } |
| +         value = _REPLACEMENT_CHARACTER; |
| +       } |
| +       if (value > _FOUR_BYTE_LIMIT) { |
| +         // A four-byte sequence has room for 21 bits, which is more than |
| +         // the Unicode range; such values must not reach the sink. |
| +         if (!_allowMalformed) { |
| +           throw new FormatException( |
| +               "Character outside valid Unicode range: " |
| +               "0x${value.toRadixString(16)}"); |
| +         } |
| +         value = _REPLACEMENT_CHARACTER; |
| +       } |
| +       // NOTE(review): encoded surrogates (U+D800..U+DFFF) are written |
| +       // unchanged -- confirm that this is intended. |
| +       sink.writeCharCode(value); |
| +     } |
| + |
| +     // Decode single bytes and the lead bytes of new sequences. |
| +     while (i < endIndex) { |
| +       int unit = codeUnits[i++]; |
| +       if (unit <= _ONE_BYTE_LIMIT) { |
| +         sink.writeCharCode(unit); |
| +       } else { |
| +         if ((unit & 0xE0) == 0xC0) { |
| +           value = unit & 0x1F; |
| +           expectedUnits = extraUnits = 1; |
| +           continue loop; |
| +         } |
| +         if ((unit & 0xF0) == 0xE0) { |
| +           value = unit & 0x0F; |
| +           expectedUnits = extraUnits = 2; |
| +           continue loop; |
| +         } |
| +         if ((unit & 0xF8) == 0xF0) { |
| +           value = unit & 0x07; |
| +           expectedUnits = extraUnits = 3; |
| +           continue loop; |
| +         } |
| +         // Neither ASCII nor a valid lead byte. |
| +         if (!_allowMalformed) { |
| +           throw new FormatException( |
| +               "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
| +         } |

Lasse Reichstein Nielsen
2013/07/16 12:23:03
Seems inefficient to create the string and not use
floitsch
2013/07/16 14:25:24
inlined.
|
| +         value = _REPLACEMENT_CHARACTER; |
| +         expectedUnits = extraUnits = 0; |
| +         sink.writeCharCode(value); |
| +       } |
| +     } |
| +     break loop; |
| +   } |
| +   if (expectedUnits > 0) { |
| +     // Keep the unfinished sequence for the next chunk. |
| +     _value = value; |
| +     _expectedUnits = expectedUnits; |
| +     _extraUnits = extraUnits; |
| +   } |
| + } |
| + |
| + /** |
| +  * Throws a [FormatException] carrying [message], unless this decoder |
| +  * tolerates malformed input, in which case nothing happens. |
| +  */ |
| + void _throwIfNecessary(String message) { |
| +   if (_allowMalformed) return; |
| +   throw new FormatException(message); |
| + } |
| } |