sdk/lib/convert/utf.dart - Issue 19883003: Add chunked conversion to converters.

Unified Diff: sdk/lib/convert/utf.dart

Issue 19883003: Add chunked conversion to converters. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Address comments. Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: sdk/lib/convert/utf.dart

diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart

index 6ce09a99e7703b9e6a78a3245434935bdcc66205..e9a4aa894537432c2a693c1b46210370d0d8b316 100644

--- a/sdk/lib/convert/utf.dart

+++ b/sdk/lib/convert/utf.dart

@@ -62,7 +62,7 @@ class Utf8Codec extends Encoding {

}

/**

- * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of

+ * This class converts strings to their UTF-8 code units (a list of

* unsigned 8-bit integers).

class Utf8Encoder extends Converter<String, List<int>> {

@@ -86,6 +86,20 @@ class Utf8Encoder extends Converter<String, List<int>> {

}

return encoder._buffer.sublist(0, encoder._bufferIndex);

}

+ /**

+ * Starts a chunked conversion.

+ *

+ * The converter works more efficiently if the given [sink] is a

+ * [ByteConversionSink].

+ */

+ StringConversionSink startChunkedConversion(

+ ChunkedConversionSink<List<int>> sink) {

+ if (sink is! ByteConversionSink) {

+ sink = new ByteConversionSink.from(sink);

+ }

+ return new _Utf8EncoderSink(sink);

+ }

}

/**

@@ -110,8 +124,11 @@ class _Utf8Encoder {

* writes it to [_buffer].

* Returns true if the [nextCodeUnit] was combined with the

- * [leadingSurrogate]. If it wasn't then nextCodeUnit has not been written

- * yet.

+ * [leadingSurrogate]. If it wasn't then nextCodeUnit was not a trailing

+ * surrogate and has not been written yet.

+ *

+ * It is safe to pass 0 for [nextCodeUnit], in which case only the leading

+ * surrogate is written.

bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {

if (_isTailSurrogate(nextCodeUnit)) {

@@ -187,6 +204,72 @@ class _Utf8Encoder {

}

/**

+ * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit

+ * integers).

+ */

+class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {

+ final ByteConversionSink _sink;

+ _Utf8EncoderSink(this._sink);

+ void close() {

+ if (_carry != 0) {

+ // addSlice will call close again, but then the carry must be equal to 0.

+ addSlice("", 0, 0, true);

+ return;

+ }

+ _sink.close();

+ }

+ void addSlice(String str, int start, int end, bool isLast) {

+ _bufferIndex = 0;

+ if (start == end && !isLast) {

+ return;

+ }

+ if (_carry != 0) {

+ int nextCodeUnit = 0;

+ if (start != end) {

+ nextCodeUnit = str.codeUnitAt(start);

+ } else {

+ assert(isLast);

+ }

+ bool wasCombined = _writeSurrogate(_carry, nextCodeUnit);

+ // Either we got a non-empty string, or we must not have been combined.

+ assert(!wasCombined || start != end );

+ if (wasCombined) start++;

+ _carry = 0;

+ }

+ do {

+ start = _fillBuffer(str, start, end);

+ bool isLastSlice = isLast && (start == end);

+ if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) {

+ if (isLast && _bufferIndex < _buffer.length - 3) {

+ // There is still space for the last incomplete surrogate.

+ // We use a non-surrogate as second argument. This way the

+ // function will just add the surrogate-half to the buffer.

+ bool hasBeenCombined = _writeSurrogate(str.codeUnitAt(start), 0);

+ assert(!hasBeenCombined);

+ } else {

+ // Otherwise store it in the carry. If isLast is true, then

+ // close will flush the last carry.

+ _carry = str.codeUnitAt(start);

+ }

+ start++;

+ }

+ _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice);

+ _bufferIndex = 0;

+ } while (start < end);

+ if (isLast) close();

+ }

+ // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it

+ // needs to deal with malformed input.

+/**

* This class converts UTF-8 code units (lists of unsigned 8-bit integers)

* to a string.

@@ -212,18 +295,35 @@ class Utf8Decoder extends Converter<List<int>, String> {

String convert(List<int> codeUnits) {

StringBuffer buffer = new StringBuffer();

- _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);

- decoder.convert(codeUnits, 0, codeUnits.length, buffer);

- decoder.close(buffer);

+ _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);

+ decoder.convert(codeUnits, 0, codeUnits.length);

+ decoder.close();

return buffer.toString();

}

+ /**

+ * Starts a chunked conversion.

+ *

+ * The converter works more efficiently if the given [sink] is a

+ * [StringConversionSink].

+ */

+ ByteConversionSink startChunkedConversion(

+ ChunkedConversionSink<String> sink) {

+ StringConversionSink stringSink;

+ if (sink is StringConversionSink) {

+ stringSink = sink;

+ } else {

+ stringSink = new StringConversionSink.from(sink);

+ }

+ return stringSink.asUtf8Sink(_allowMalformed);

+ }

}

// UTF-8 constants.

-const int _ONE_BYTE_LIMIT = 0x7f; // 7 bytes

-const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bytes

-const int _THREE_BYTE_LIMIT = 0xffff; // 16 bytes

-const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bytes, truncated to Unicode max.

+const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits

+const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits

+const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits

+const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

// UTF-16 constants.

const int _SURROGATE_MASK = 0xF800;

@@ -254,12 +354,13 @@ int _combineSurrogatePair(int lead, int tail) =>

// TODO(floitsch): make this class public.

class _Utf8Decoder {

final bool _allowMalformed;

+ final StringSink _stringSink;

bool _isFirstCharacter = true;

int _value = 0;

int _expectedUnits = 0;

int _extraUnits = 0;

- _Utf8Decoder(this._allowMalformed);

+ _Utf8Decoder(this._stringSink, this._allowMalformed);

bool get hasPartialInput => _expectedUnits > 0;

@@ -270,17 +371,29 @@ class _Utf8Decoder {

_THREE_BYTE_LIMIT,

_FOUR_BYTE_LIMIT ];

- void close(StringSink sink) {

+ void close() {

+ flush();

+ }

+ /**

+ * Flushes this decoder as if closed.

+ *

+ * This method throws if the input was partial and the decoder was

+ * constructed with `allowMalformed` set to `false`.

+ */

+ void flush() {

if (hasPartialInput) {

if (!_allowMalformed) {

throw new FormatException("Unfinished UTF-8 octet sequence");

}

- sink.writeCharCode(_REPLACEMENT_CHARACTER);

+ _stringSink.writeCharCode(_REPLACEMENT_CHARACTER);

+ _value = 0;

+ _expectedUnits = 0;

+ _extraUnits = 0;

}

- void convert(List<int> codeUnits, int startIndex, int endIndex,

- StringSink sink) {

+ void convert(List<int> codeUnits, int startIndex, int endIndex) {

int value = _value;

int expectedUnits = _expectedUnits;

int extraUnits = _extraUnits;

@@ -303,7 +416,7 @@ class _Utf8Decoder {

"Bad UTF-8 encoding 0x${unit.toRadixString(16)}");

}

_isFirstCharacter = false;

- sink.writeCharCode(_REPLACEMENT_CHARACTER);

+ _stringSink.writeCharCode(_REPLACEMENT_CHARACTER);

break multibyte;

} else {

value = (value << 6) | (unit & 0x3f);

@@ -329,7 +442,7 @@ class _Utf8Decoder {

value = _REPLACEMENT_CHARACTER;

}

if (!_isFirstCharacter || value != _BOM_CHARACTER) {

- sink.writeCharCode(value);

+ _stringSink.writeCharCode(value);

}

_isFirstCharacter = false;

}

@@ -338,7 +451,7 @@ class _Utf8Decoder {

int unit = codeUnits[i++];

if (unit <= _ONE_BYTE_LIMIT) {

_isFirstCharacter = false;

- sink.writeCharCode(unit);

+ _stringSink.writeCharCode(unit);

} else {

if ((unit & 0xE0) == 0xC0) {

value = unit & 0x1F;

@@ -363,7 +476,7 @@ class _Utf8Decoder {

value = _REPLACEMENT_CHARACTER;

expectedUnits = extraUnits = 0;

_isFirstCharacter = false;

- sink.writeCharCode(value);

+ _stringSink.writeCharCode(value);

}

break loop;

« no previous file with comments | « sdk/lib/convert/string_conversion.dart ('k') | tests/lib/convert/chunked_conversion1_test.dart » ('j') | no next file with comments »