Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(286)

Unified Diff: sdk/lib/convert/utf.dart

Issue 19187002: Replace old utf8 decoder with new one. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Address comments. Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « sdk/lib/codec/encoding.dart ('k') | tests/lib/convert/utf82_test.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: sdk/lib/convert/utf.dart
diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart
index 09692f0ec87b6102aead2cd6037b8a2988fc51f9..b7fc365265b3f99770a965a424871340b5f07665 100644
--- a/sdk/lib/convert/utf.dart
+++ b/sdk/lib/convert/utf.dart
@@ -21,11 +21,187 @@ class Utf8Encoder extends Converter<String, List<int>> {
* to a string.
*/
class Utf8Decoder extends Converter<List<int>, String> {
+ final bool _allowMalformed; // true: substitute U+FFFD for bad input; false: throw.
+
+ /**
+ * Instantiates a new [Utf8Decoder].
+ *
+ * The optional [allowMalformed] argument defines how [convert] deals
+ * with invalid or unterminated character sequences.
+ *
+ * If it is `true` [convert] replaces invalid (or unterminated) character
+ * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
+ * it throws a [FormatException].
+ */
+ Utf8Decoder({ bool allowMalformed: false })
+ : this._allowMalformed = allowMalformed;
+
/**
* Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
* corresponding string.
+ *
+ * The whole list is decoded in a single pass by a freshly created internal
+ * decoder, so no state is shared between calls. A byte order mark (U+FEFF)
+ * is dropped when it is the first character decoded.
+ *
+ * Throws a [FormatException] on malformed or truncated input unless this
+ * decoder was created with `allowMalformed: true`, in which case U+FFFD is
+ * written instead.
*/
- // TODO(floitsch): allow to configure the decoder (for example the replacement
- // character).
- String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits);
+ String convert(List<int> codeUnits) {
+ StringBuffer buffer = new StringBuffer();
+ _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed); // Fresh state per call.
+ decoder.convert(codeUnits, 0, codeUnits.length, buffer); // Entire list as one chunk.
+ decoder.close(buffer); // Reports a sequence cut off at the end of the input.
+ return buffer.toString();
+ }
+}
+
+// UTF-8 constants: largest value representable by an encoding of that length.
+const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
+const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
+const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
+const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.
+
+// UTF-16 constants.
+const int _SURROGATE_MASK = 0xF800; // Bits shared by lead and tail surrogates.
+const int _SURROGATE_TAG_MASK = 0xFC00; // Bits that tell lead from tail.
+const int _SURROGATE_VALUE_MASK = 0x3FF; // Low ten payload bits of a surrogate.
+const int _LEAD_SURROGATE_MIN = 0xD800;
+const int _TAIL_SURROGATE_MIN = 0xDC00;
+
+const int _REPLACEMENT_CHARACTER = 0xFFFD; // Substituted for malformed input when allowed.
+const int _BOM_CHARACTER = 0xFEFF; // Byte order mark; dropped when decoded first.
+
+/**
+ * Whether [codeUnit] is a UTF-16 surrogate, lead or tail.
+ *
+ * Only the bits selected by [_SURROGATE_MASK] are inspected; both halves of
+ * the surrogate range share the same value there.
+ */
+bool _isSurrogate(int codeUnit) {
+  int sharedBits = codeUnit & _SURROGATE_MASK;
+  return _LEAD_SURROGATE_MIN == sharedBits;
+}
+/**
+ * Whether [codeUnit] is a lead (first-of-pair) UTF-16 surrogate.
+ *
+ * Compares the tag bits selected by [_SURROGATE_TAG_MASK] against
+ * [_LEAD_SURROGATE_MIN].
+ */
+bool _isLeadSurrogate(int codeUnit) {
+  int tag = codeUnit & _SURROGATE_TAG_MASK;
+  return _LEAD_SURROGATE_MIN == tag;
+}
+/**
+ * Whether [codeUnit] is a tail (second-of-pair) UTF-16 surrogate.
+ *
+ * Compares the tag bits selected by [_SURROGATE_TAG_MASK] against
+ * [_TAIL_SURROGATE_MIN].
+ */
+bool _isTailSurrogate(int codeUnit) {
+  int tag = codeUnit & _SURROGATE_TAG_MASK;
+  return _TAIL_SURROGATE_MIN == tag;
+}
+/**
+ * Combines a UTF-16 [lead] and [tail] surrogate into the code point they
+ * encode.
+ *
+ * The ten payload bits of [lead] become the high half and the ten payload
+ * bits of [tail] the low half of a 20-bit offset, which is then added to
+ * 0x10000. The offset has to be added rather than OR-ed in: the shifted lead
+ * payload can itself have bit 16 set (for example lead 0xD840), and an OR
+ * would silently drop the 0x10000 in that case.
+ *
+ * The arguments are not validated; callers are expected to pass a genuine
+ * lead/tail pair.
+ */
+int _combineSurrogatePair(int lead, int tail) =>
+    0x10000 + (((lead & _SURROGATE_VALUE_MASK) << 10)
+        | (tail & _SURROGATE_VALUE_MASK));
+
+
+/**
+ * Decodes UTF-8.
+ *
+ * The decoder handles chunked input: a multi-byte sequence may be split
+ * across successive [convert] calls, and the partially decoded character is
+ * carried over between calls. Call [close] after the last chunk so that a
+ * sequence that was never completed gets reported.
+ *
+ * A byte order mark (U+FEFF) is dropped when it is the first character
+ * decoded by an instance.
+ */
+// TODO(floitsch): make this class public.
+class _Utf8Decoder {
+  // If true, malformed input is replaced by U+FFFD; otherwise a
+  // FormatException is thrown.
+  final bool _allowMalformed;
+  // True until the first character (or replacement character) has been
+  // handled; used to drop a leading byte order mark.
+  bool _isFirstCharacter = true;
+  // State of a multi-byte sequence left unfinished by the previous chunk:
+  // the bits accumulated so far, the number of continuation bytes still
+  // missing, and the total number of continuation bytes of the sequence.
+  int _value = 0;
+  int _expectedUnits = 0;
+  int _extraUnits = 0;
+
+  _Utf8Decoder(this._allowMalformed);
+
+  // Whether a multi-byte sequence is still waiting for continuation bytes.
+  bool get hasPartialInput => _expectedUnits > 0;
+
+  // Limits of one through four byte encodings.
+  // Indexed by (number of continuation bytes - 1), this yields the largest
+  // value that the next shorter encoding could already represent.
+  static const List<int> _LIMITS = const <int>[
+      _ONE_BYTE_LIMIT,
+      _TWO_BYTE_LIMIT,
+      _THREE_BYTE_LIMIT,
+      _FOUR_BYTE_LIMIT ];
+
+  /**
+   * Signals the end of the input.
+   *
+   * If a multi-byte sequence is still incomplete, throws a [FormatException],
+   * or, when malformed input is allowed, writes U+FFFD to [sink] and clears
+   * the pending state so that the dangling sequence is reported only once.
+   */
+  void close(StringSink sink) {
+    if (hasPartialInput) {
+      if (!_allowMalformed) {
+        throw new FormatException("Unfinished UTF-8 octet sequence");
+      }
+      sink.writeCharCode(_REPLACEMENT_CHARACTER);
+      // Forget the dangling sequence; otherwise hasPartialInput stays true
+      // and a repeated close() would emit another replacement character.
+      _value = 0;
+      _expectedUnits = 0;
+      _extraUnits = 0;
+    }
+  }
+
+  /**
+   * Decodes the bytes of [codeUnits] from [startIndex] (inclusive) to
+   * [endIndex] (exclusive) and writes the resulting characters to [sink].
+   *
+   * A sequence left unfinished by a previous call is resumed first, and a
+   * sequence still unfinished at [endIndex] is stored for the next call.
+   *
+   * Bad lead or continuation bytes, overlong encodings and values above
+   * 0x10FFFF throw a [FormatException], or are replaced by U+FFFD when
+   * malformed input is allowed.
+   */
+  void convert(List<int> codeUnits, int startIndex, int endIndex,
+               StringSink sink) {
+    // Work on local copies and clear the fields; they are written back at the
+    // end only if this chunk again stops in the middle of a sequence.
+    int value = _value;
+    int expectedUnits = _expectedUnits;
+    int extraUnits = _extraUnits;
+    _value = 0;
+    _expectedUnits = 0;
+    _extraUnits = 0;
+
+    int i = startIndex;
+    loop: while (true) {
+      // Phase 1: consume the continuation bytes of a pending sequence.
+      multibyte: if (expectedUnits > 0) {
+        do {
+          if (i == endIndex) {
+            // Chunk exhausted mid-sequence; the state is saved below.
+            break loop;
+          }
+          int unit = codeUnits[i];
+          if ((unit & 0xC0) != 0x80) {
+            // Not a continuation byte (10xxxxxx): abandon the sequence.
+            expectedUnits = 0;
+            if (!_allowMalformed) {
+              throw new FormatException(
+                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
+            }
+            _isFirstCharacter = false;
+            sink.writeCharCode(_REPLACEMENT_CHARACTER);
+            // i is not advanced, so the offending byte is looked at again by
+            // the scanning loop below.
+            break multibyte;
+          } else {
+            value = (value << 6) | (unit & 0x3f);
+            expectedUnits--;
+            i++;
+          }
+        } while (expectedUnits > 0);
+        if (value <= _LIMITS[extraUnits - 1]) {
+          // Overly long encoding. The value could be encoded with a shorter
+          // encoding.
+          if (!_allowMalformed) {
+            throw new FormatException(
+                "Overlong encoding of 0x${value.toRadixString(16)}");
+          }
+          expectedUnits = extraUnits = 0;
+          value = _REPLACEMENT_CHARACTER;
+        }
+        if (value > _FOUR_BYTE_LIMIT) {
+          if (!_allowMalformed) {
+            throw new FormatException("Character outside valid Unicode range: "
+                                      "0x${value.toRadixString(16)}");
+          }
+          value = _REPLACEMENT_CHARACTER;
+        }
+        // Skip a byte order mark, but only as the very first character.
+        if (!_isFirstCharacter || value != _BOM_CHARACTER) {
+          sink.writeCharCode(value);
+        }
+        _isFirstCharacter = false;
+      }
+
+      // Phase 2: copy single-byte characters through until a lead byte
+      // starts a new multi-byte sequence.
+      while (i < endIndex) {
+        int unit = codeUnits[i++];
+        if (unit <= _ONE_BYTE_LIMIT) {
+          _isFirstCharacter = false;
+          sink.writeCharCode(unit);
+        } else {
+          // 110xxxxx: lead byte of a two-byte sequence.
+          if ((unit & 0xE0) == 0xC0) {
+            value = unit & 0x1F;
+            expectedUnits = extraUnits = 1;
+            continue loop;
+          }
+          // 1110xxxx: lead byte of a three-byte sequence.
+          if ((unit & 0xF0) == 0xE0) {
+            value = unit & 0x0F;
+            expectedUnits = extraUnits = 2;
+            continue loop;
+          }
+          // 11110xxx: lead byte of a four-byte sequence.
+          // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
+          if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
+            value = unit & 0x07;
+            expectedUnits = extraUnits = 3;
+            continue loop;
+          }
+          // Anything else (a stray continuation byte or an invalid lead
+          // byte) is malformed.
+          if (!_allowMalformed) {
+            throw new FormatException(
+                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
+          }
+          value = _REPLACEMENT_CHARACTER;
+          expectedUnits = extraUnits = 0;
+          _isFirstCharacter = false;
+          sink.writeCharCode(value);
+        }
+      }
+      break loop;
+    }
+    // Remember an unfinished sequence for the next chunk (or for close()).
+    if (expectedUnits > 0) {
+      _value = value;
+      _expectedUnits = expectedUnits;
+      _extraUnits = extraUnits;
+    }
+  }
}
« no previous file with comments | « sdk/lib/codec/encoding.dart ('k') | tests/lib/convert/utf82_test.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698