| Index: sdk/lib/convert/utf.dart
|
| diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart
|
| index 6ce09a99e7703b9e6a78a3245434935bdcc66205..e9a4aa894537432c2a693c1b46210370d0d8b316 100644
|
| --- a/sdk/lib/convert/utf.dart
|
| +++ b/sdk/lib/convert/utf.dart
|
| @@ -62,7 +62,7 @@ class Utf8Codec extends Encoding {
|
| }
|
|
|
| /**
|
| - * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of
|
| + * This class converts strings to their UTF-8 code units (a list of
|
| * unsigned 8-bit integers).
|
| */
|
| class Utf8Encoder extends Converter<String, List<int>> {
|
| @@ -86,6 +86,20 @@ class Utf8Encoder extends Converter<String, List<int>> {
|
| }
|
| return encoder._buffer.sublist(0, encoder._bufferIndex);
|
| }
|
| +
|
| + /**
|
| + * Starts a chunked conversion.
|
| + *
|
| + * The converter works more efficiently if the given [sink] is a
|
| + * [ByteConversionSink].
|
| + */
|
| + StringConversionSink startChunkedConversion(
|
| + ChunkedConversionSink<List<int>> sink) {
|
| + if (sink is! ByteConversionSink) {
|
| + sink = new ByteConversionSink.from(sink);
|
| + }
|
| + return new _Utf8EncoderSink(sink);
|
| + }
|
| }
|
|
|
| /**
|
| @@ -110,8 +124,11 @@ class _Utf8Encoder {
|
| * writes it to [_buffer].
|
| *
|
| * Returns true if the [nextCodeUnit] was combined with the
|
| - * [leadingSurrogate]. If it wasn't then nextCodeUnit has not been written
|
| - * yet.
|
| +   * [leadingSurrogate]. If it wasn't, then [nextCodeUnit] was not a trailing
|
| + * surrogate and has not been written yet.
|
| + *
|
| + * It is safe to pass 0 for [nextCodeUnit] in which case only the leading
|
| + * surrogate is written.
|
| */
|
| bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {
|
| if (_isTailSurrogate(nextCodeUnit)) {
|
| @@ -187,6 +204,72 @@ class _Utf8Encoder {
|
| }
|
|
|
| /**
|
| + * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit
|
| + * integers).
|
| + */
|
| +class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {
|
| +
|
| + final ByteConversionSink _sink;
|
| +
|
| + _Utf8EncoderSink(this._sink);
|
| +
|
| + void close() {
|
| + if (_carry != 0) {
|
| + // addSlice will call close again, but then the carry must be equal to 0.
|
| + addSlice("", 0, 0, true);
|
| + return;
|
| + }
|
| + _sink.close();
|
| + }
|
| +
|
| + void addSlice(String str, int start, int end, bool isLast) {
|
| + _bufferIndex = 0;
|
| +
|
| + if (start == end && !isLast) {
|
| + return;
|
| + }
|
| +
|
| + if (_carry != 0) {
|
| + int nextCodeUnit = 0;
|
| + if (start != end) {
|
| + nextCodeUnit = str.codeUnitAt(start);
|
| + } else {
|
| + assert(isLast);
|
| + }
|
| + bool wasCombined = _writeSurrogate(_carry, nextCodeUnit);
|
| + // Either we got a non-empty string, or we must not have been combined.
|
| + assert(!wasCombined || start != end );
|
| + if (wasCombined) start++;
|
| + _carry = 0;
|
| + }
|
| + do {
|
| + start = _fillBuffer(str, start, end);
|
| + bool isLastSlice = isLast && (start == end);
|
| + if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) {
|
| + if (isLast && _bufferIndex < _buffer.length - 3) {
|
| + // There is still space for the last incomplete surrogate.
|
| + // We use a non-surrogate as second argument. This way the
|
| + // function will just add the surrogate-half to the buffer.
|
| + bool hasBeenCombined = _writeSurrogate(str.codeUnitAt(start), 0);
|
| + assert(!hasBeenCombined);
|
| + } else {
|
| + // Otherwise store it in the carry. If isLast is true, then
|
| + // close will flush the last carry.
|
| + _carry = str.codeUnitAt(start);
|
| + }
|
| + start++;
|
| + }
|
| + _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice);
|
| + _bufferIndex = 0;
|
| + } while (start < end);
|
| + if (isLast) close();
|
| + }
|
| +
|
| +  // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it
|
| + // needs to deal with malformed input.
|
| +}
|
| +
|
| +/**
|
| * This class converts UTF-8 code units (lists of unsigned 8-bit integers)
|
| * to a string.
|
| */
|
| @@ -212,18 +295,35 @@ class Utf8Decoder extends Converter<List<int>, String> {
|
| */
|
| String convert(List<int> codeUnits) {
|
| StringBuffer buffer = new StringBuffer();
|
| - _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);
|
| - decoder.convert(codeUnits, 0, codeUnits.length, buffer);
|
| - decoder.close(buffer);
|
| + _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);
|
| + decoder.convert(codeUnits, 0, codeUnits.length);
|
| + decoder.close();
|
| return buffer.toString();
|
| }
|
| +
|
| + /**
|
| + * Starts a chunked conversion.
|
| + *
|
| + * The converter works more efficiently if the given [sink] is a
|
| + * [StringConversionSink].
|
| + */
|
| + ByteConversionSink startChunkedConversion(
|
| + ChunkedConversionSink<String> sink) {
|
| + StringConversionSink stringSink;
|
| + if (sink is StringConversionSink) {
|
| + stringSink = sink;
|
| + } else {
|
| + stringSink = new StringConversionSink.from(sink);
|
| + }
|
| + return stringSink.asUtf8Sink(_allowMalformed);
|
| + }
|
| }
|
|
|
| // UTF-8 constants.
|
| -const int _ONE_BYTE_LIMIT = 0x7f; // 7 bytes
|
| -const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bytes
|
| -const int _THREE_BYTE_LIMIT = 0xffff; // 16 bytes
|
| -const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bytes, truncated to Unicode max.
|
| +const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
|
| +const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
|
| +const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
|
| +const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.
|
|
|
| // UTF-16 constants.
|
| const int _SURROGATE_MASK = 0xF800;
|
| @@ -254,12 +354,13 @@ int _combineSurrogatePair(int lead, int tail) =>
|
| // TODO(floitsch): make this class public.
|
| class _Utf8Decoder {
|
| final bool _allowMalformed;
|
| + final StringSink _stringSink;
|
| bool _isFirstCharacter = true;
|
| int _value = 0;
|
| int _expectedUnits = 0;
|
| int _extraUnits = 0;
|
|
|
| - _Utf8Decoder(this._allowMalformed);
|
| + _Utf8Decoder(this._stringSink, this._allowMalformed);
|
|
|
| bool get hasPartialInput => _expectedUnits > 0;
|
|
|
| @@ -270,17 +371,29 @@ class _Utf8Decoder {
|
| _THREE_BYTE_LIMIT,
|
| _FOUR_BYTE_LIMIT ];
|
|
|
| - void close(StringSink sink) {
|
| + void close() {
|
| + flush();
|
| + }
|
| +
|
| + /**
|
| + * Flushes this decoder as if closed.
|
| + *
|
| + * This method throws if the input was partial and the decoder was
|
| + * constructed with `allowMalformed` set to `false`.
|
| + */
|
| + void flush() {
|
| if (hasPartialInput) {
|
| if (!_allowMalformed) {
|
| throw new FormatException("Unfinished UTF-8 octet sequence");
|
| }
|
| - sink.writeCharCode(_REPLACEMENT_CHARACTER);
|
| + _stringSink.writeCharCode(_REPLACEMENT_CHARACTER);
|
| + _value = 0;
|
| + _expectedUnits = 0;
|
| + _extraUnits = 0;
|
| }
|
| }
|
|
|
| - void convert(List<int> codeUnits, int startIndex, int endIndex,
|
| - StringSink sink) {
|
| + void convert(List<int> codeUnits, int startIndex, int endIndex) {
|
| int value = _value;
|
| int expectedUnits = _expectedUnits;
|
| int extraUnits = _extraUnits;
|
| @@ -303,7 +416,7 @@ class _Utf8Decoder {
|
| "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
|
| }
|
| _isFirstCharacter = false;
|
| - sink.writeCharCode(_REPLACEMENT_CHARACTER);
|
| + _stringSink.writeCharCode(_REPLACEMENT_CHARACTER);
|
| break multibyte;
|
| } else {
|
| value = (value << 6) | (unit & 0x3f);
|
| @@ -329,7 +442,7 @@ class _Utf8Decoder {
|
| value = _REPLACEMENT_CHARACTER;
|
| }
|
| if (!_isFirstCharacter || value != _BOM_CHARACTER) {
|
| - sink.writeCharCode(value);
|
| + _stringSink.writeCharCode(value);
|
| }
|
| _isFirstCharacter = false;
|
| }
|
| @@ -338,7 +451,7 @@ class _Utf8Decoder {
|
| int unit = codeUnits[i++];
|
| if (unit <= _ONE_BYTE_LIMIT) {
|
| _isFirstCharacter = false;
|
| - sink.writeCharCode(unit);
|
| + _stringSink.writeCharCode(unit);
|
| } else {
|
| if ((unit & 0xE0) == 0xC0) {
|
| value = unit & 0x1F;
|
| @@ -363,7 +476,7 @@ class _Utf8Decoder {
|
| value = _REPLACEMENT_CHARACTER;
|
| expectedUnits = extraUnits = 0;
|
| _isFirstCharacter = false;
|
| - sink.writeCharCode(value);
|
| + _stringSink.writeCharCode(value);
|
| }
|
| }
|
| break loop;
|
|
|