Chromium Code Reviews| Index: sdk/lib/convert/utf.dart |
| diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart |
| index 6ce09a99e7703b9e6a78a3245434935bdcc66205..1fd9ddcf4e531abc42b54e2cbc95d0ac70e09745 100644 |
| --- a/sdk/lib/convert/utf.dart |
| +++ b/sdk/lib/convert/utf.dart |
| @@ -62,7 +62,7 @@ class Utf8Codec extends Encoding { |
| } |
| /** |
| - * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of |
| + * This class converts strings to their UTF-8 code units (a list of |
| * unsigned 8-bit integers). |
| */ |
| class Utf8Encoder extends Converter<String, List<int>> { |
| @@ -86,6 +86,16 @@ class Utf8Encoder extends Converter<String, List<int>> { |
| } |
| return encoder._buffer.sublist(0, encoder._bufferIndex); |
| } |
| + |
| + /** |
| + * Starts a chunked conversion that encodes incoming strings to UTF-8. |
| + * |
| + * The given [sink] is first adapted to the [outputInterface]. The adapted |
| + * sink, not the raw [sink], is handed to the encoder sink, because the |
| + * encoder sink calls `addSlice` on it. |
| + */ |
| + StringConversionSink startChunkedConversion(ChunkedConversionSink sink) { |
| + ByteConversionSink byteSink = sink.adaptTo(outputInterface); |
| + return new _Utf8EncoderSink(byteSink); |
| + } |
| + |
| + ChunkedConversionInterface get inputInterface => |
| + StringConversionSink.INTERFACE; |
| + ChunkedConversionInterface get outputInterface => |
| + ByteConversionSink.INTERFACE; |
| } |
| /** |
| @@ -110,8 +120,11 @@ class _Utf8Encoder { |
| * writes it to [_buffer]. |
| * |
| * Returns true if the [nextCodeUnit] was combined with the |
| - * [leadingSurrogate]. If it wasn't then nextCodeUnit has not been written |
| - * yet. |
| + * [leadingSurrogate]. If it wasn't then nextCodeUnit was not a trailing |
| + * surrogate and has not been written yet. |
| + * |
| + * It is safe to pass 0 for [nextCodeUnit] in which case only the leading |
| + * surrogate is written. |
| */ |
| bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) { |
| if (_isTailSurrogate(nextCodeUnit)) { |
| @@ -187,6 +200,72 @@ class _Utf8Encoder { |
| } |
| /** |
| + * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit |
| + * integers). |
| + */ |
| +class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin { |
| + |
| + final ByteConversionSink _sink; |
| + |
| + _Utf8EncoderSink(this._sink); |
| + |
| + void close() { |
| + if (_carry != 0) { |
| + // addSlice will call close again, but then the carry must be equal to 0. |
| + addSlice("", 0, 0, true); |
| + return; |
| + } |
| + _sink.close(); |
| + } |
| + |
| + void addSlice(String str, int start, int end, bool isLast) { |
| + _bufferIndex = 0; |
| + |
| + if (start == end && !isLast) { |
| + return; |
| + } |
| + |
| + if (_carry != 0) { |
| + int nextCodeUnit = 0; |
| + if (start != end) { |
| + nextCodeUnit = str.codeUnitAt(start); |
| + } else { |
|
Søren Gjesse
2013/07/24 09:26:41
Two spaces before else.
floitsch
2013/07/24 18:31:15
good eye.
done.
|
| + assert(isLast); |
| + } |
| + bool wasCombined = _writeSurrogate(_carry, nextCodeUnit); |
| + // Either we got a non-empty string, or we must not have been combined. |
| + assert(!wasCombined || start != end ); |
| + if (wasCombined) start++; |
| + _carry = 0; |
| + } |
| + do { |
| + start = _fillBuffer(str, start, end); |
| + bool isLastSlice = isLast && (start == end); |
| + if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) { |
| + if (isLast && _bufferIndex < _buffer.length - 3) { |
| + // There is still space for the last incomplete surrogate. |
| + // We use a non-surrogate as second argument. This way the |
| + // function will just add the surrogate-half to the buffer. |
| + bool hasBeenCombined = _writeSurrogate(str.codeUnitAt(start), 0); |
| + assert(!hasBeenCombined); |
| + } else { |
| + // Otherwise store it in the carry. If [isLast] is true, then |
|
Søren Gjesse
2013/07/24 09:26:41
No need for markup in non-dartdoc comment.
floitsch
2013/07/24 18:31:15
Done.
|
| + // close will flush the last carry. |
| + _carry = str.codeUnitAt(start); |
| + } |
| + start++; |
| + } |
| + _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice); |
| + _bufferIndex = 0; |
| + } while (start < end); |
| + if (isLast) close(); |
| + } |
| + |
| + // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it |
| + // needs to deal with malformed input. |
| +} |
| + |
| +/** |
| * This class converts UTF-8 code units (lists of unsigned 8-bit integers) |
| * to a string. |
| */ |
| @@ -212,18 +291,28 @@ class Utf8Decoder extends Converter<List<int>, String> { |
| */ |
| String convert(List<int> codeUnits) { |
| StringBuffer buffer = new StringBuffer(); |
| - _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed); |
| - decoder.convert(codeUnits, 0, codeUnits.length, buffer); |
| - decoder.close(buffer); |
| + _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed); |
| + decoder.convert(codeUnits, 0, codeUnits.length); |
| + decoder.close(); |
| return buffer.toString(); |
| } |
| + |
| + ByteConversionSink startChunkedConversion(ChunkedConversionSink sink) { |
| + StringConversionSink stringSink = sink.adaptTo(outputInterface); |
| + return stringSink.asUtf8Sink(_allowMalformed); |
| + } |
| + |
| + ChunkedConversionInterface get inputInterface => |
| + ByteConversionSink.INTERFACE; |
| + ChunkedConversionInterface get outputInterface => |
| + StringConversionSink.INTERFACE; |
| } |
| // UTF-8 constants. |
| -const int _ONE_BYTE_LIMIT = 0x7f; // 7 bytes |
| -const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bytes |
| -const int _THREE_BYTE_LIMIT = 0xffff; // 16 bytes |
| -const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bytes, truncated to Unicode max. |
| +const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits |
| +const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits |
| +const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
| +const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
| // UTF-16 constants. |
| const int _SURROGATE_MASK = 0xF800; |
| @@ -254,12 +343,13 @@ int _combineSurrogatePair(int lead, int tail) => |
| // TODO(floitsch): make this class public. |
| class _Utf8Decoder { |
| final bool _allowMalformed; |
| + final StringSink _stringSink; |
| bool _isFirstCharacter = true; |
| int _value = 0; |
| int _expectedUnits = 0; |
| int _extraUnits = 0; |
| - _Utf8Decoder(this._allowMalformed); |
| + _Utf8Decoder(this._stringSink, this._allowMalformed); |
| bool get hasPartialInput => _expectedUnits > 0; |
| @@ -270,17 +360,29 @@ class _Utf8Decoder { |
| _THREE_BYTE_LIMIT, |
| _FOUR_BYTE_LIMIT ]; |
| - void close(StringSink sink) { |
| + void close() { |
| + flush(); |
| + } |
| + |
| + /** |
| + * Flushes this decoder as if closed. |
| + * |
| + * This method throws if the input was partial and the decoder was |
| + * constructed with `allowMalformed` set to `false`. |
| + */ |
| + void flush() { |
| if (hasPartialInput) { |
| if (!_allowMalformed) { |
| throw new FormatException("Unfinished UTF-8 octet sequence"); |
| } |
| - sink.writeCharCode(_REPLACEMENT_CHARACTER); |
| + _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); |
| + _value = 0; |
| + _expectedUnits = 0; |
| + _extraUnits = 0; |
| } |
| } |
| - void convert(List<int> codeUnits, int startIndex, int endIndex, |
| - StringSink sink) { |
| + void convert(List<int> codeUnits, int startIndex, int endIndex) { |
| int value = _value; |
| int expectedUnits = _expectedUnits; |
| int extraUnits = _extraUnits; |
| @@ -303,7 +405,7 @@ class _Utf8Decoder { |
| "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
| } |
| _isFirstCharacter = false; |
| - sink.writeCharCode(_REPLACEMENT_CHARACTER); |
| + _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); |
| break multibyte; |
| } else { |
| value = (value << 6) | (unit & 0x3f); |
| @@ -329,7 +431,7 @@ class _Utf8Decoder { |
| value = _REPLACEMENT_CHARACTER; |
| } |
| if (!_isFirstCharacter || value != _BOM_CHARACTER) { |
| - sink.writeCharCode(value); |
| + _stringSink.writeCharCode(value); |
| } |
| _isFirstCharacter = false; |
| } |
| @@ -338,7 +440,7 @@ class _Utf8Decoder { |
| int unit = codeUnits[i++]; |
| if (unit <= _ONE_BYTE_LIMIT) { |
| _isFirstCharacter = false; |
| - sink.writeCharCode(unit); |
| + _stringSink.writeCharCode(unit); |
| } else { |
| if ((unit & 0xE0) == 0xC0) { |
| value = unit & 0x1F; |
| @@ -363,7 +465,7 @@ class _Utf8Decoder { |
| value = _REPLACEMENT_CHARACTER; |
| expectedUnits = extraUnits = 0; |
| _isFirstCharacter = false; |
| - sink.writeCharCode(value); |
| + _stringSink.writeCharCode(value); |
| } |
| } |
| break loop; |