sdk/lib/convert/utf.dart - Issue 19187002: Replace old utf8 decoder with new one.

Unified Diff: sdk/lib/convert/utf.dart

Issue 19187002: Replace old utf8 decoder with new one. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Add comments. Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: sdk/lib/convert/utf.dart

diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart

index 09692f0ec87b6102aead2cd6037b8a2988fc51f9..451bf95063c57463aa8a66c45f880afd1df79513 100644

--- a/sdk/lib/convert/utf.dart

+++ b/sdk/lib/convert/utf.dart

@@ -21,11 +21,167 @@ class Utf8Encoder extends Converter<String, List<int>> {

* to a string.

class Utf8Decoder extends Converter<List<int>, String> {

+ final bool _allowMalformed;

+ /**

+ * Instantiates a new [Utf8Decoder].

+ *

+ * The optional [allowMalformed] argument defines how [convert] deals

+ * with invalid or unterminated character sequences.

+ *

+ * If it is `true` [convert] replaces invalid (or unterminated) character

+ * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise

Lasse Reichstein Nielsen 2013/07/16 12:23:03 U+FFFD

floitsch 2013/07/16 14:25:24 Done.

+ * it throws a [FormatException].

+ */

+  // Stores the optional [allowMalformed] flag (default `false`) for [convert].
+  Utf8Decoder({bool allowMalformed: false})
+      : _allowMalformed = allowMalformed;

/**

* Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the

* corresponding string.

- // TODO(floitsch): allow to configure the decoder (for example the replacement

- // character).

- String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits);

+  String convert(List<int> codeUnits) {
+    // Feed the whole list to a fresh chunk decoder as a single chunk, then
+    // close it so that an unfinished trailing sequence is handled as well.
+    var sink = new StringBuffer();
+    new _Utf8Decoder(_allowMalformed)
+      ..convert(codeUnits, 0, codeUnits.length, sink)
+      ..close(sink);
+    return sink.toString();
+  }

+// UTF-8 constants.

+const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits

+const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits

+const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits

+const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

+// UTF-16 constants.

+const int _SURROGATE_MASK = 0xF800; // Top 5 bits, shared by all surrogates.

+const int _SURROGATE_TAG_MASK = 0xFC00; // Top 6 bits, lead vs. tail tag.

+const int _SURROGATE_VALUE_MASK = 0x3FF; // Low 10 payload bits.

+const int _LEAD_SURROGATE_MIN = 0xD800;

+const int _TAIL_SURROGATE_MIN = 0xDC00;

+const int _REPLACEMENT_CHARACTER = 0xFFFD;

+// Whether [codeUnit] is a UTF-16 surrogate, lead or tail: both kinds share
+// the bits selected by [_SURROGATE_MASK].
+bool _isSurrogate(int codeUnit) {
+  int tag = codeUnit & _SURROGATE_MASK;
+  return tag == _LEAD_SURROGATE_MIN;
+}

+// Whether the tag bits of [codeUnit] mark it as a lead (first) surrogate.
+bool _isLeadSurrogate(int codeUnit) {
+  int tag = codeUnit & _SURROGATE_TAG_MASK;
+  return tag == _LEAD_SURROGATE_MIN;
+}

+// Whether the tag bits of [codeUnit] mark it as a tail (second) surrogate.
+bool _isTailSurrogate(int codeUnit) {
+  int tag = codeUnit & _SURROGATE_TAG_MASK;
+  return tag == _TAIL_SURROGATE_MIN;
+}

+// Combines a [lead] and [tail] surrogate into the code point they encode.
+//
+// The 0x10000 offset has to be added, not or-ed: the shifted lead bits reach
+// up to bit 19 and can already have bit 16 set, in which case `|` would drop
+// the offset (lead 0xD840, tail 0xDC00 must give 0x20000, not 0x10000).
+int _combineSurrogatePair(int lead, int tail) =>
+    (0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10))
+    | (tail & _SURROGATE_VALUE_MASK);

+/**

+ * Decodes UTF-8.

Lasse Reichstein Nielsen 2013/07/16 12:23:03 UTF-8.

floitsch 2013/07/16 14:25:24 Done.

+ *

+ * The decoder handles chunked input.

+ */

+// TODO(floitsch): do we want to make this class public?

Lasse Reichstein Nielsen 2013/07/16 12:23:03 Sure, why not?

floitsch 2013/07/16 14:25:24 Later. But rephrased TODO.

+class _Utf8Decoder {
+  // Whether malformed input is replaced by U+FFFD instead of throwing.
+  final bool _allowMalformed;
+  // Bits of the code point collected so far for an unfinished sequence.
+  int _value = 0;
+  // Number of continuation bytes still missing from the current sequence.
+  int _expectedUnits = 0;
+  // Total number of continuation bytes of the current sequence; used to pick
+  // the overlong-encoding limit from [_LIMITS].
+  int _extraUnits = 0;
+
+  _Utf8Decoder(this._allowMalformed);
+
+  // Whether a multi-byte sequence has been started but not yet completed.
+  bool get hasPartialInput => _expectedUnits > 0;
+
+  // Limits of one through four byte encodings.
+  static const List<int> _LIMITS = const <int>[
+      _ONE_BYTE_LIMIT,
+      _TWO_BYTE_LIMIT,
+      _THREE_BYTE_LIMIT,
+      _FOUR_BYTE_LIMIT ];
+
+  /**
+   * Signals the end of the input.
+   *
+   * If a multi-byte sequence is still unfinished, throws a [FormatException]
+   * unless malformed input is allowed; in that case U+FFFD is written to
+   * [sink] and the pending state is cleared, so that a repeated [close] or a
+   * later [convert] does not act on the stale partial sequence.
+   */
+  void close(StringSink sink) {
+    if (hasPartialInput) {
+      _throwIfNecessary("Unfinished UTF-8 encoding");
+      sink.writeCharCode(_REPLACEMENT_CHARACTER);
+      _value = 0;
+      _expectedUnits = 0;
+      _extraUnits = 0;
+    }
+  }
+
+  /**
+   * Decodes the bytes of [codeUnits] from [startIndex] (inclusive) to
+   * [endIndex] (exclusive) and writes the resulting characters to [sink].
+   *
+   * A multi-byte sequence that is cut off at [endIndex] is remembered and
+   * continued by the next call. Invalid bytes, overlong encodings and values
+   * above U+10FFFF throw a [FormatException], or are replaced by U+FFFD when
+   * malformed input is allowed.
+   */
+  void convert(List<int> codeUnits, int startIndex, int endIndex,
+               StringSink sink) {
+    // Work on local copies; the fields are only written back at the end if a
+    // sequence is still unfinished.
+    int value = _value;
+    int expectedUnits = _expectedUnits;
+    int extraUnits = _extraUnits;
+    _value = 0;
+    _expectedUnits = 0;
+    _extraUnits = 0;
+
+    int i = startIndex;
+    loop: while (true) {
+      multibyte: if (expectedUnits > 0) {
+        do {
+          if (i == endIndex) {
+            break loop;
+          }
+          int unit = codeUnits[i];
+          if ((unit & 0xC0) != 0x80) {
+            // Not a continuation byte. It is not consumed here, so the scan
+            // below looks at it again as a potential start of a character.
+            expectedUnits = 0;
+            _throwIfNecessary(
+                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
+            sink.writeCharCode(_REPLACEMENT_CHARACTER);
+            break multibyte;
+          } else {
+            value = (value << 6) | (unit & 0x3f);
+            expectedUnits--;
+            i++;
+          }
+        } while (expectedUnits > 0);
+        if (value <= _LIMITS[extraUnits - 1]) {
+          // Overly long encoding. The value could be encoded with a shorter
+          // encoding.
+          _throwIfNecessary(
+              "Overlong encoding of 0x${value.toRadixString(16)}");
+          value = _REPLACEMENT_CHARACTER;
+        } else if (value > _FOUR_BYTE_LIMIT) {
+          // A four-byte sequence can carry 21 bits, which is more than the
+          // Unicode maximum. Such a value is not a valid character and must
+          // not be handed to the sink.
+          _throwIfNecessary(
+              "Character outside valid Unicode range: "
+              "0x${value.toRadixString(16)}");
+          value = _REPLACEMENT_CHARACTER;
+        }
+        sink.writeCharCode(value);
+      }
+
+      while (i < endIndex) {
+        int unit = codeUnits[i++];
+        if (unit <= _ONE_BYTE_LIMIT) {
+          sink.writeCharCode(unit);
+        } else {
+          // Lead bytes: start a sequence and let the multibyte branch above
+          // collect its continuation bytes.
+          if ((unit & 0xE0) == 0xC0) {
+            value = unit & 0x1F;
+            expectedUnits = extraUnits = 1;
+            continue loop;
+          }
+          if ((unit & 0xF0) == 0xE0) {
+            value = unit & 0x0F;
+            expectedUnits = extraUnits = 2;
+            continue loop;
+          }
+          if ((unit & 0xF8) == 0xF0) {
+            value = unit & 0x07;
+            expectedUnits = extraUnits = 3;
+            continue loop;
+          }
+          _throwIfNecessary("Bad UTF-8 encoding 0x${unit.toRadixString(16)}");

Lasse Reichstein Nielsen 2013/07/16 12:23:03 Seems inefficient to create the string and not use

floitsch 2013/07/16 14:25:24 inlined.

+          value = _REPLACEMENT_CHARACTER;
+          expectedUnits = extraUnits = 0;
+          sink.writeCharCode(value);
+        }
+      }
+      break loop;
+    }
+    if (expectedUnits > 0) {
+      // The chunk ended inside a sequence; keep it for the next call.
+      _value = value;
+      _expectedUnits = expectedUnits;
+      _extraUnits = extraUnits;
+    }
+  }
+
+  // Throws a [FormatException] with [message] unless malformed input is
+  // allowed, in which case it returns and the caller emits a replacement.
+  void _throwIfNecessary(String message) {
+    if (!_allowMalformed) {
+      throw new FormatException(message);
+    }
+  }
}

« sdk/lib/codec/encoding.dart ('K') | « sdk/lib/codec/encoding.dart ('k') | tests/lib/convert/utf82_test.dart » ('j') | tests/lib/convert/utf82_test.dart » ('J')