Index: sdk/lib/convert/utf.dart |
diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart |
index 09692f0ec87b6102aead2cd6037b8a2988fc51f9..451bf95063c57463aa8a66c45f880afd1df79513 100644 |
--- a/sdk/lib/convert/utf.dart |
+++ b/sdk/lib/convert/utf.dart |
@@ -21,11 +21,167 @@ class Utf8Encoder extends Converter<String, List<int>> { |
* to a string. |
*/ |
class Utf8Decoder extends Converter<List<int>, String> { |
+ final bool _allowMalformed; |
+ |
+ /** |
+ * Instantiates a new [Utf8Decoder]. |
+ * |
+ * The optional [allowMalformed] argument defines how [convert] deals |
+ * with invalid or unterminated character sequences. |
+ * |
+ * If it is `true` [convert] replaces invalid (or unterminated) character |
+ * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
Lasse Reichstein Nielsen
2013/07/16 12:23:03
U+FFFD
floitsch
2013/07/16 14:25:24
Done.
|
+ * it throws a [FormatException]. |
+ */ |
+ Utf8Decoder({ bool allowMalformed: false }) |
+ : this._allowMalformed = allowMalformed; |
+ |
/** |
* Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
* corresponding string. |
+ * |
+ * The whole list is decoded in one go and the decoder is closed afterwards, |
+ * so a multi-byte sequence that is cut off at the end of [codeUnits] is |
+ * treated as unterminated input (see the constructor's `allowMalformed`). |
*/ |
- // TODO(floitsch): allow to configure the decoder (for example the replacement |
- // character). |
- String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits); |
+ String convert(List<int> codeUnits) { |
+ StringBuffer buffer = new StringBuffer(); |
+ _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed); |
+ decoder.convert(codeUnits, 0, codeUnits.length, buffer); |
+ decoder.close(buffer); |
+ return buffer.toString(); |
+ } |
+} |
+ |
+// UTF-8 constants: largest value encodable with one to four bytes. |
+const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits |
+const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits |
+const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
+const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
+ |
+// UTF-16 constants: surrogate bit masks and range starts. |
+const int _SURROGATE_MASK = 0xF800; |
+const int _SURROGATE_TAG_MASK = 0xFC00; |
+const int _SURROGATE_VALUE_MASK = 0x3FF; |
+const int _LEAD_SURROGATE_MIN = 0xD800; |
+const int _TAIL_SURROGATE_MIN = 0xDC00; |
+ |
+const int _REPLACEMENT_CHARACTER = 0xFFFD; |
+ |
+// Whether [codeUnit] is a surrogate of either kind: its bits selected by |
+// [_SURROGATE_MASK] equal [_LEAD_SURROGATE_MIN]. |
+bool _isSurrogate(int codeUnit) { |
+  final int highBits = codeUnit & _SURROGATE_MASK; |
+  return highBits == _LEAD_SURROGATE_MIN; |
+} |
+// Whether [codeUnit] carries the lead-surrogate tag: its bits selected by |
+// [_SURROGATE_TAG_MASK] equal [_LEAD_SURROGATE_MIN]. |
+bool _isLeadSurrogate(int codeUnit) { |
+  final int tag = codeUnit & _SURROGATE_TAG_MASK; |
+  return tag == _LEAD_SURROGATE_MIN; |
+} |
+// Whether [codeUnit] carries the tail-surrogate tag: its bits selected by |
+// [_SURROGATE_TAG_MASK] equal [_TAIL_SURROGATE_MIN]. |
+bool _isTailSurrogate(int codeUnit) { |
+  final int tag = codeUnit & _SURROGATE_TAG_MASK; |
+  return tag == _TAIL_SURROGATE_MIN; |
+} |
+// Combines a UTF-16 [lead] and [tail] surrogate into the code point they |
+// encode. |
+// |
+// The 0x10000 offset must be added rather than OR-ed in: the shifted lead |
+// payload occupies bits 10-19 and therefore overlaps bit 16. With an OR, |
+// lead 0xD840 / tail 0xDC00 would give 0x10000 instead of 0x20000. |
+int _combineSurrogatePair(int lead, int tail) => |
+    0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) |
+        + (tail & _SURROGATE_VALUE_MASK); |
+ |
+ |
+/** |
+ * Decodes UTF-8. |
Lasse Reichstein Nielsen
2013/07/16 12:23:03
UTF-8.
floitsch
2013/07/16 14:25:24
Done.
|
+ * |
+ * The decoder handles chunked input. |
+ */ |
+// TODO(floitsch): do we want to make this class public? |
Lasse Reichstein Nielsen
2013/07/16 12:23:03
Sure, why not?
floitsch
2013/07/16 14:25:24
Later. But rephrased TODO.
|
+class _Utf8Decoder { |
+  // Whether malformed input is replaced by U+FFFD instead of throwing. |
+  final bool _allowMalformed; |
+  // Partially decoded code point carried over between [convert] calls. |
+  int _value = 0; |
+  // Number of continuation bytes still missing for [_value]. |
+  int _expectedUnits = 0; |
+  // Total number of continuation bytes of the sequence being decoded; used |
+  // to index [_LIMITS] for the overlong check. |
+  int _extraUnits = 0; |
+ |
+  _Utf8Decoder(this._allowMalformed); |
+ |
+  // Whether a multi-byte sequence has been started but not yet completed. |
+  bool get hasPartialInput => _expectedUnits > 0; |
+ |
+  // Limits of one through four byte encodings. |
+  static const List<int> _LIMITS = const <int>[ |
+      _ONE_BYTE_LIMIT, |
+      _TWO_BYTE_LIMIT, |
+      _THREE_BYTE_LIMIT, |
+      _FOUR_BYTE_LIMIT ]; |
+ |
+  // Signals the end of the input. |
+  // |
+  // If a multi-byte sequence is still incomplete this throws a |
+  // [FormatException] or, when malformed input is allowed, writes U+FFFD to |
+  // [sink]. The pending state is cleared afterwards so that a repeated call |
+  // does not report the same truncated sequence a second time. |
+  void close(StringSink sink) { |
+    if (hasPartialInput) { |
+      _throwIfNecessary("Unfinished UTF-8 encoding"); |
+      sink.writeCharCode(_REPLACEMENT_CHARACTER); |
+      _value = 0; |
+      _expectedUnits = 0; |
+      _extraUnits = 0; |
+    } |
+  } |
+ |
+  // Decodes the bytes of [codeUnits] from [startIndex] up to, but not |
+  // including, [endIndex] and writes the decoded characters to [sink]. |
+  // |
+  // A multi-byte sequence that is cut off by [endIndex] is remembered and |
+  // completed by the next call. Malformed input (unexpected or invalid |
+  // bytes, overlong encodings, values above 0x10FFFF) throws a |
+  // [FormatException], or is replaced by U+FFFD when malformed input is |
+  // allowed. Error messages are only built when they are actually thrown. |
+  void convert(List<int> codeUnits, int startIndex, int endIndex, |
+               StringSink sink) { |
+    // Work on locals and clear the fields; they are written back at the end |
+    // only if a sequence is still pending. |
+    int value = _value; |
+    int expectedUnits = _expectedUnits; |
+    int extraUnits = _extraUnits; |
+    _value = 0; |
+    _expectedUnits = 0; |
+    _extraUnits = 0; |
+ |
+    int i = startIndex; |
+    loop: while (true) { |
+      multibyte: if (expectedUnits > 0) { |
+        // Consume the continuation bytes of the current sequence. |
+        do { |
+          if (i == endIndex) { |
+            // Input ends mid-sequence; the state is saved below. |
+            break loop; |
+          } |
+          int unit = codeUnits[i]; |
+          if ((unit & 0xC0) != 0x80) { |
+            // Not a continuation byte. The sequence is abandoned and [unit] |
+            // stays unconsumed so the loop below handles it as a fresh byte. |
+            expectedUnits = 0; |
+            if (!_allowMalformed) { |
+              throw new FormatException( |
+                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
+            } |
+            sink.writeCharCode(_REPLACEMENT_CHARACTER); |
+            break multibyte; |
+          } else { |
+            value = (value << 6) | (unit & 0x3f); |
+            expectedUnits--; |
+            i++; |
+          } |
+        } while (expectedUnits > 0); |
+        if (value <= _LIMITS[extraUnits - 1]) { |
+          // Overly long encoding. The value could be encoded with a shorter |
+          // encoding. |
+          if (!_allowMalformed) { |
+            throw new FormatException( |
+                "Overlong encoding of 0x${value.toRadixString(16)}"); |
+          } |
+          value = _REPLACEMENT_CHARACTER; |
+        } else if (value > _FOUR_BYTE_LIMIT) { |
+          // A four-byte sequence carries 21 bits, so it can decode to a |
+          // value beyond the Unicode range. Such a value is malformed input |
+          // and must follow the same policy instead of reaching the sink. |
+          if (!_allowMalformed) { |
+            throw new FormatException( |
+                "Character outside valid Unicode range: " |
+                "0x${value.toRadixString(16)}"); |
+          } |
+          value = _REPLACEMENT_CHARACTER; |
+        } |
+        sink.writeCharCode(value); |
+      } |
+ |
+      while (i < endIndex) { |
+        int unit = codeUnits[i++]; |
+        if (unit <= _ONE_BYTE_LIMIT) { |
+          sink.writeCharCode(unit); |
+        } else { |
+          // A lead byte: its high bits give the number of continuation |
+          // bytes, the remaining low bits are the first payload bits. |
+          if ((unit & 0xE0) == 0xC0) { |
+            value = unit & 0x1F; |
+            expectedUnits = extraUnits = 1; |
+            continue loop; |
+          } |
+          if ((unit & 0xF0) == 0xE0) { |
+            value = unit & 0x0F; |
+            expectedUnits = extraUnits = 2; |
+            continue loop; |
+          } |
+          if ((unit & 0xF8) == 0xF0) { |
+            value = unit & 0x07; |
+            expectedUnits = extraUnits = 3; |
+            continue loop; |
+          } |
+          // Neither a one-byte character nor a valid lead byte. |
+          if (!_allowMalformed) { |
+            throw new FormatException( |
+                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
Lasse Reichstein Nielsen
2013/07/16 12:23:03
Seems inefficient to create the string and not use
floitsch
2013/07/16 14:25:24
inlined.
|
+          } |
+          value = _REPLACEMENT_CHARACTER; |
+          expectedUnits = extraUnits = 0; |
+          sink.writeCharCode(value); |
+        } |
+      } |
+      break loop; |
+    } |
+    if (expectedUnits > 0) { |
+      // Remember the unfinished sequence for the next chunk (or [close]). |
+      _value = value; |
+      _expectedUnits = expectedUnits; |
+      _extraUnits = extraUnits; |
+    } |
+  } |
+ |
+  // Throws a [FormatException] with [message] unless malformed input is |
+  // allowed. |
+  void _throwIfNecessary(String message) { |
+    if (!_allowMalformed) { |
+      throw new FormatException(message); |
+    } |
+  } |
} |