Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(376)

Unified Diff: sdk/lib/convert/utf.dart

Issue 19883003: Add chunked conversion to converters. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Address comments. Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « sdk/lib/convert/string_conversion.dart ('k') | tests/lib/convert/chunked_conversion1_test.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: sdk/lib/convert/utf.dart
diff --git a/sdk/lib/convert/utf.dart b/sdk/lib/convert/utf.dart
index 6ce09a99e7703b9e6a78a3245434935bdcc66205..e9a4aa894537432c2a693c1b46210370d0d8b316 100644
--- a/sdk/lib/convert/utf.dart
+++ b/sdk/lib/convert/utf.dart
@@ -62,7 +62,7 @@ class Utf8Codec extends Encoding {
}
/**
- * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of
+ * This class converts strings to their UTF-8 code units (a list of
* unsigned 8-bit integers).
*/
class Utf8Encoder extends Converter<String, List<int>> {
@@ -86,6 +86,20 @@ class Utf8Encoder extends Converter<String, List<int>> {
}
return encoder._buffer.sublist(0, encoder._bufferIndex);
}
+
+ /**
+ * Starts a chunked conversion.
+ *
+ * The converter works more efficiently if the given [sink] is a
+ * [ByteConversionSink].
+ */
+ StringConversionSink startChunkedConversion(
+ ChunkedConversionSink<List<int>> sink) {
+ if (sink is! ByteConversionSink) {
+ sink = new ByteConversionSink.from(sink);
+ }
+ return new _Utf8EncoderSink(sink);
+ }
}
/**
@@ -110,8 +124,11 @@ class _Utf8Encoder {
* writes it to [_buffer].
*
* Returns true if the [nextCodeUnit] was combined with the
- * [leadingSurrogate]. If it wasn't then nextCodeUnit has not been written
- * yet.
+ * [leadingSurrogate]. If it wasn't, then [nextCodeUnit] was not a trailing
+ * surrogate and has not been written yet.
+ *
+ * It is safe to pass 0 for [nextCodeUnit], in which case only the leading
+ * surrogate is written.
*/
bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {
if (_isTailSurrogate(nextCodeUnit)) {
@@ -187,6 +204,72 @@ class _Utf8Encoder {
}
/**
+ * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit
+ * integers).
+ */
+class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {
+
+ final ByteConversionSink _sink;
+
+ _Utf8EncoderSink(this._sink);
+
+ void close() {
+ if (_carry != 0) {
+ // addSlice will call close again, but then the carry must be equal to 0.
+ addSlice("", 0, 0, true);
+ return;
+ }
+ _sink.close();
+ }
+
+ void addSlice(String str, int start, int end, bool isLast) {
+ _bufferIndex = 0;
+
+ if (start == end && !isLast) {
+ return;
+ }
+
+ if (_carry != 0) {
+ int nextCodeUnit = 0;
+ if (start != end) {
+ nextCodeUnit = str.codeUnitAt(start);
+ } else {
+ assert(isLast);
+ }
+ bool wasCombined = _writeSurrogate(_carry, nextCodeUnit);
+ // Either we got a non-empty string, or we must not have been combined.
+ assert(!wasCombined || start != end );
+ if (wasCombined) start++;
+ _carry = 0;
+ }
+ do {
+ start = _fillBuffer(str, start, end);
+ bool isLastSlice = isLast && (start == end);
+ if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) {
+ if (isLast && _bufferIndex < _buffer.length - 3) {
+ // There is still space for the last incomplete surrogate.
+ // We use a non-surrogate as second argument. This way the
+ // function will just add the surrogate-half to the buffer.
+ bool hasBeenCombined = _writeSurrogate(str.codeUnitAt(start), 0);
+ assert(!hasBeenCombined);
+ } else {
+ // Otherwise store it in the carry. If isLast is true, then
+ // close will flush the last carry.
+ _carry = str.codeUnitAt(start);
+ }
+ start++;
+ }
+ _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice);
+ _bufferIndex = 0;
+ } while (start < end);
+ if (isLast) close();
+ }
+
+ // TODO(floitsch): Implement asUtf8Sink. Slightly complicated because it
+ // needs to deal with malformed input.
+}
+
+/**
* This class converts UTF-8 code units (lists of unsigned 8-bit integers)
* to a string.
*/
@@ -212,18 +295,35 @@ class Utf8Decoder extends Converter<List<int>, String> {
*/
String convert(List<int> codeUnits) {
StringBuffer buffer = new StringBuffer();
- _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);
- decoder.convert(codeUnits, 0, codeUnits.length, buffer);
- decoder.close(buffer);
+ _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);
+ decoder.convert(codeUnits, 0, codeUnits.length);
+ decoder.close();
return buffer.toString();
}
+
+ /**
+ * Starts a chunked conversion.
+ *
+ * The converter works more efficiently if the given [sink] is a
+ * [StringConversionSink].
+ */
+ ByteConversionSink startChunkedConversion(
+ ChunkedConversionSink<String> sink) {
+ StringConversionSink stringSink;
+ if (sink is StringConversionSink) {
+ stringSink = sink;
+ } else {
+ stringSink = new StringConversionSink.from(sink);
+ }
+ return stringSink.asUtf8Sink(_allowMalformed);
+ }
}
// UTF-8 constants.
-const int _ONE_BYTE_LIMIT = 0x7f; // 7 bytes
-const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bytes
-const int _THREE_BYTE_LIMIT = 0xffff; // 16 bytes
-const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bytes, truncated to Unicode max.
+const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
+const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
+const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
+const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.
// UTF-16 constants.
const int _SURROGATE_MASK = 0xF800;
@@ -254,12 +354,13 @@ int _combineSurrogatePair(int lead, int tail) =>
// TODO(floitsch): make this class public.
class _Utf8Decoder {
final bool _allowMalformed;
+ final StringSink _stringSink;
bool _isFirstCharacter = true;
int _value = 0;
int _expectedUnits = 0;
int _extraUnits = 0;
- _Utf8Decoder(this._allowMalformed);
+ _Utf8Decoder(this._stringSink, this._allowMalformed);
bool get hasPartialInput => _expectedUnits > 0;
@@ -270,17 +371,29 @@ class _Utf8Decoder {
_THREE_BYTE_LIMIT,
_FOUR_BYTE_LIMIT ];
- void close(StringSink sink) {
+ void close() {
+ flush();
+ }
+
+ /**
+ * Flushes this decoder as if closed.
+ *
+ * This method throws if the input was partial and the decoder was
+ * constructed with `allowMalformed` set to `false`.
+ */
+ void flush() {
if (hasPartialInput) {
if (!_allowMalformed) {
throw new FormatException("Unfinished UTF-8 octet sequence");
}
- sink.writeCharCode(_REPLACEMENT_CHARACTER);
+ _stringSink.writeCharCode(_REPLACEMENT_CHARACTER);
+ _value = 0;
+ _expectedUnits = 0;
+ _extraUnits = 0;
}
}
- void convert(List<int> codeUnits, int startIndex, int endIndex,
- StringSink sink) {
+ void convert(List<int> codeUnits, int startIndex, int endIndex) {
int value = _value;
int expectedUnits = _expectedUnits;
int extraUnits = _extraUnits;
@@ -303,7 +416,7 @@ class _Utf8Decoder {
"Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
}
_isFirstCharacter = false;
- sink.writeCharCode(_REPLACEMENT_CHARACTER);
+ _stringSink.writeCharCode(_REPLACEMENT_CHARACTER);
break multibyte;
} else {
value = (value << 6) | (unit & 0x3f);
@@ -329,7 +442,7 @@ class _Utf8Decoder {
value = _REPLACEMENT_CHARACTER;
}
if (!_isFirstCharacter || value != _BOM_CHARACTER) {
- sink.writeCharCode(value);
+ _stringSink.writeCharCode(value);
}
_isFirstCharacter = false;
}
@@ -338,7 +451,7 @@ class _Utf8Decoder {
int unit = codeUnits[i++];
if (unit <= _ONE_BYTE_LIMIT) {
_isFirstCharacter = false;
- sink.writeCharCode(unit);
+ _stringSink.writeCharCode(unit);
} else {
if ((unit & 0xE0) == 0xC0) {
value = unit & 0x1F;
@@ -363,7 +476,7 @@ class _Utf8Decoder {
value = _REPLACEMENT_CHARACTER;
expectedUnits = extraUnits = 0;
_isFirstCharacter = false;
- sink.writeCharCode(value);
+ _stringSink.writeCharCode(value);
}
}
break loop;
« no previous file with comments | « sdk/lib/convert/string_conversion.dart ('k') | tests/lib/convert/chunked_conversion1_test.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698