Index: sdk/lib/convert/utf.dart |
diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart |
index 09692f0ec87b6102aead2cd6037b8a2988fc51f9..b7fc365265b3f99770a965a424871340b5f07665 100644 |
--- a/sdk/lib/convert/utf.dart |
+++ b/sdk/lib/convert/utf.dart |
@@ -21,11 +21,187 @@ class Utf8Encoder extends Converter<String, List<int>> { |
* to a string. |
*/ |
class Utf8Decoder extends Converter<List<int>, String> { |
+ final bool _allowMalformed; |
+ |
+ /** |
+ * Instantiates a new [Utf8Decoder]. |
+ * |
+ * The optional [allowMalformed] argument (default `false`) defines how |
+ * [convert] deals with invalid or unterminated character sequences. |
+ * |
+ * If it is `true`, [convert] replaces invalid (or unterminated) character |
+ * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
+ * it throws a [FormatException]. |
+ */ |
+ Utf8Decoder({ bool allowMalformed: false }) |
+ : this._allowMalformed = allowMalformed; |
+ |
/** |
* Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
* corresponding string. |
*/ |
- // TODO(floitsch): allow to configure the decoder (for example the replacement |
- // character). |
- String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits); |
+ String convert(List<int> codeUnits) { |
+ StringBuffer buffer = new StringBuffer(); |
+ _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed); |
+ decoder.convert(codeUnits, 0, codeUnits.length, buffer); |
+ decoder.close(buffer); |
+ return buffer.toString(); |
+ } |
+} |
+ |
+// UTF-8 constants: the largest code point encodable in one to four bytes. |
+const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits |
+const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits |
+const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
+const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
+ |
+// UTF-16 constants. |
+const int _SURROGATE_MASK = 0xF800; |
+const int _SURROGATE_TAG_MASK = 0xFC00; |
+const int _SURROGATE_VALUE_MASK = 0x3FF; |
+const int _LEAD_SURROGATE_MIN = 0xD800; |
+const int _TAIL_SURROGATE_MIN = 0xDC00; |
+ |
+const int _REPLACEMENT_CHARACTER = 0xFFFD; |
+const int _BOM_CHARACTER = 0xFEFF; |
+ |
+/** Whether [codeUnit] is any UTF-16 surrogate (0xD800 through 0xDFFF). */ |
+bool _isSurrogate(int codeUnit) { |
+  return _LEAD_SURROGATE_MIN == (codeUnit & _SURROGATE_MASK); |
+} |
+/** Whether [codeUnit] is a lead surrogate (0xD800 through 0xDBFF). */ |
+bool _isLeadSurrogate(int codeUnit) { |
+  return _LEAD_SURROGATE_MIN == (codeUnit & _SURROGATE_TAG_MASK); |
+} |
+/** Whether [codeUnit] is a tail surrogate (0xDC00 through 0xDFFF). */ |
+bool _isTailSurrogate(int codeUnit) { |
+  return _TAIL_SURROGATE_MIN == (codeUnit & _SURROGATE_TAG_MASK); |
+} |
+/** |
+ * Combines a [lead] and a [tail] surrogate into the code point they encode. |
+ * |
+ * The ten payload bits of [lead] form an offset that must be *added* to |
+ * 0x10000. Or-ing them in loses the offset whenever the shifted payload has |
+ * bit 16 set: 0xD840 0xDC00 would yield 0x10000 instead of 0x20000. |
+ */ |
+int _combineSurrogatePair(int lead, int tail) => |
+    0x10000 + |
+    ((lead & _SURROGATE_VALUE_MASK) << 10) + |
+    (tail & _SURROGATE_VALUE_MASK); |
+ |
+ |
+/** |
+ * Decodes UTF-8. |
+ * |
+ * The decoder handles chunked input: a multi-byte sequence that is cut off |
+ * at the end of one [convert] call is remembered and completed by the next |
+ * call. [close] must be called after the last chunk so that a sequence that |
+ * was never completed is reported. |
+ * |
+ * Malformed input either throws a [FormatException] or, if the decoder was |
+ * created with `allowMalformed` set to `true`, is replaced by U+FFFD. |
+ */ |
+// TODO(floitsch): make this class public. |
+class _Utf8Decoder { |
+  // Whether malformed input is replaced by U+FFFD instead of throwing. |
+  final bool _allowMalformed; |
+  // True until the first character has been handled. A BOM is only dropped |
+  // while this is still true. |
+  bool _isFirstCharacter = true; |
+  // State of a multi-byte sequence that spans two [convert] calls: the bits |
+  // gathered so far, the number of continuation bytes still missing, and the |
+  // total number of continuation bytes of the sequence. |
+  int _value = 0; |
+  int _expectedUnits = 0; |
+  int _extraUnits = 0; |
+ |
+  _Utf8Decoder(this._allowMalformed); |
+ |
+  /** Whether a multi-byte sequence is still waiting for continuation bytes. */ |
+  bool get hasPartialInput => _expectedUnits > 0; |
+ |
+  // Limits of one through four byte encodings. |
+  // A sequence with n continuation bytes is overlong if its value does not |
+  // exceed _LIMITS[n - 1], the limit of the next shorter encoding. |
+  static const List<int> _LIMITS = const <int>[ |
+      _ONE_BYTE_LIMIT, |
+      _TWO_BYTE_LIMIT, |
+      _THREE_BYTE_LIMIT, |
+      _FOUR_BYTE_LIMIT ]; |
+ |
+  /** |
+   * Signals the end of the input. |
+   * |
+   * If a multi-byte sequence is still unfinished, throws a [FormatException], |
+   * or, when malformed input is allowed, writes U+FFFD to [sink] and discards |
+   * the unfinished sequence. |
+   */ |
+  void close(StringSink sink) { |
+    if (hasPartialInput) { |
+      if (!_allowMalformed) { |
+        throw new FormatException("Unfinished UTF-8 octet sequence"); |
+      } |
+      sink.writeCharCode(_REPLACEMENT_CHARACTER); |
+      // The dangling sequence has been reported. Forget it, so that a repeated |
+      // [close] does not report it again and a later [convert] does not try |
+      // to continue it. |
+      _value = 0; |
+      _expectedUnits = 0; |
+      _extraUnits = 0; |
+    } |
+  } |
+ |
+  /** |
+   * Decodes the bytes of [codeUnits] from [startIndex] (inclusive) to |
+   * [endIndex] (exclusive) and writes the decoded characters to [sink]. |
+   * |
+   * A sequence left unfinished by the previous call is continued first. A |
+   * sequence that is unfinished at [endIndex] is stored for the next call. |
+   * |
+   * Bad continuation bytes, bad lead bytes, overlong encodings and values |
+   * above U+10FFFF throw a [FormatException], or are replaced by U+FFFD when |
+   * malformed input is allowed. A BOM decoded as the first character is |
+   * dropped. |
+   */ |
+  void convert(List<int> codeUnits, int startIndex, int endIndex, |
+               StringSink sink) { |
+    // Work on local copies. The fields are cleared here and only written back |
+    // at the end if a sequence is still pending. |
+    int value = _value; |
+    int expectedUnits = _expectedUnits; |
+    int extraUnits = _extraUnits; |
+    _value = 0; |
+    _expectedUnits = 0; |
+    _extraUnits = 0; |
+ |
+    int i = startIndex; |
+    // Each iteration first finishes a pending multi-byte sequence (if any) and |
+    // then scans forward until the next lead byte or the end of the input. |
+    loop: while (true) { |
+      multibyte: if (expectedUnits > 0) { |
+        do { |
+          if (i == endIndex) { |
+            // Out of input in the middle of a sequence; keep the state. |
+            break loop; |
+          } |
+          int unit = codeUnits[i]; |
+          if ((unit & 0xC0) != 0x80) { |
+            // Not a continuation byte: the pending sequence is broken. |
+            expectedUnits = 0; |
+            if (!_allowMalformed) { |
+              throw new FormatException( |
+                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
+            } |
+            _isFirstCharacter = false; |
+            sink.writeCharCode(_REPLACEMENT_CHARACTER); |
+            // `i` is not advanced, so the offending byte is examined again |
+            // below as the possible start of a new character. |
+            break multibyte; |
+          } else { |
+            value = (value << 6) | (unit & 0x3f); |
+            expectedUnits--; |
+            i++; |
+          } |
+        } while (expectedUnits > 0); |
+        if (value <= _LIMITS[extraUnits - 1]) { |
+          // Overly long encoding. The value could be encoded with a shorter |
+          // encoding. |
+          if (!_allowMalformed) { |
+            throw new FormatException( |
+                "Overlong encoding of 0x${value.toRadixString(16)}"); |
+          } |
+          expectedUnits = extraUnits = 0; |
+          value = _REPLACEMENT_CHARACTER; |
+        } |
+        if (value > _FOUR_BYTE_LIMIT) { |
+          if (!_allowMalformed) { |
+            throw new FormatException("Character outside valid Unicode range: " |
+                                      "0x${value.toRadixString(16)}"); |
+          } |
+          value = _REPLACEMENT_CHARACTER; |
+        } |
+        // A BOM is dropped only when it is the very first character. |
+        if (!_isFirstCharacter || value != _BOM_CHARACTER) { |
+          sink.writeCharCode(value); |
+        } |
+        _isFirstCharacter = false; |
+      } |
+ |
+      while (i < endIndex) { |
+        int unit = codeUnits[i++]; |
+        if (unit <= _ONE_BYTE_LIMIT) { |
+          // Single-byte (ASCII) character. |
+          _isFirstCharacter = false; |
+          sink.writeCharCode(unit); |
+        } else { |
+          // Lead byte: its high bits give the number of continuation bytes, |
+          // its low bits the first payload bits. |
+          if ((unit & 0xE0) == 0xC0) { |
+            value = unit & 0x1F; |
+            expectedUnits = extraUnits = 1; |
+            continue loop; |
+          } |
+          if ((unit & 0xF0) == 0xE0) { |
+            value = unit & 0x0F; |
+            expectedUnits = extraUnits = 2; |
+            continue loop; |
+          } |
+          // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. |
+          if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { |
+            value = unit & 0x07; |
+            expectedUnits = extraUnits = 3; |
+            continue loop; |
+          } |
+          // Neither ASCII nor a valid lead byte. |
+          if (!_allowMalformed) { |
+            throw new FormatException( |
+                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
+          } |
+          value = _REPLACEMENT_CHARACTER; |
+          expectedUnits = extraUnits = 0; |
+          _isFirstCharacter = false; |
+          sink.writeCharCode(value); |
+        } |
+      } |
+      break loop; |
+    } |
+    if (expectedUnits > 0) { |
+      // Remember the unfinished sequence for the next chunk (or for [close]). |
+      _value = value; |
+      _expectedUnits = expectedUnits; |
+      _extraUnits = extraUnits; |
+    } |
+  } |
} |