pkg/dev_compiler/tool/input_sdk/lib/convert/utf.dart - Issue 2698353003: unfork DDC's copy of most SDK libraries

Unified Diff: pkg/dev_compiler/tool/input_sdk/lib/convert/utf.dart

Issue 2698353003: unfork DDC's copy of most SDK libraries (Closed)

Patch Set: revert core_patch Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « pkg/dev_compiler/tool/input_sdk/lib/convert/string_conversion.dart ('k') | pkg/dev_compiler/tool/input_sdk/lib/core/annotations.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: pkg/dev_compiler/tool/input_sdk/lib/convert/utf.dart

diff --git a/pkg/dev_compiler/tool/input_sdk/lib/convert/utf.dart b/pkg/dev_compiler/tool/input_sdk/lib/convert/utf.dart

deleted file mode 100644

index 486ab4ddd756cddde917a3d3f225537215c07af3..0000000000000000000000000000000000000000

--- a/pkg/dev_compiler/tool/input_sdk/lib/convert/utf.dart

+++ /dev/null

@@ -1,572 +0,0 @@

-// BSD-style license that can be found in the LICENSE file.

-part of dart.convert;

-/** The Unicode Replacement character `U+FFFD` (�). */

-const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;

-/** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */

-const int UNICODE_BOM_CHARACTER_RUNE = 0xFEFF;

-/**

- * An instance of the default implementation of the [Utf8Codec].

- *

- * This instance provides a convenient access to the most common UTF-8

- * use cases.

- *

- * Examples:

- *

- * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");

- * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,

- * 0x72, 0x67, 0x72, 0xc3, 0xb8, 0x64]);

- */

-const Utf8Codec UTF8 = const Utf8Codec();

-/**

- * A [Utf8Codec] encodes strings to utf-8 code units (bytes) and decodes

- * UTF-8 code units to strings.

- */

-class Utf8Codec extends Encoding {

- final bool _allowMalformed;

- /**

- * Instantiates a new [Utf8Codec].

- *

- * The optional [allowMalformed] argument defines how [decoder] (and [decode])

- * deal with invalid or unterminated character sequences.

- *

- * If it is `true` (and not overridden at the method invocation) [decode] and

- * the [decoder] replace invalid (or unterminated) octet

- * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise

- * they throw a [FormatException].

- */

- const Utf8Codec({ bool allowMalformed: false })

- : _allowMalformed = allowMalformed;

- String get name => "utf-8";

- /**

- * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the

- * corresponding string.

- *

- * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this

- * character is discarded.

- *

- * If [allowMalformed] is `true` the decoder replaces invalid (or

- * unterminated) character sequences with the Unicode Replacement character

- * `U+FFFD` (�). Otherwise it throws a [FormatException].

- *

- * If [allowMalformed] is not given, it defaults to the `allowMalformed` that

- * was used to instantiate `this`.

- */

- String decode(List<int> codeUnits, { bool allowMalformed }) {

- if (allowMalformed == null) allowMalformed = _allowMalformed;

- return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);

- }

- Utf8Encoder get encoder => const Utf8Encoder();

- Utf8Decoder get decoder {

- return new Utf8Decoder(allowMalformed: _allowMalformed);

- }

-/**

- * This class converts strings to their UTF-8 code units (a list of

- * unsigned 8-bit integers).

- */

-class Utf8Encoder extends Converter<String, List<int>> {

- const Utf8Encoder();

- /**

- * Converts [string] to its UTF-8 code units (a list of

- * unsigned 8-bit integers).

- *

- * If [start] and [end] are provided, only the substring

- * `string.substring(start, end)` is converted.

- */

- List<int> convert(String string, [int start = 0, int end]) {

- int stringLength = string.length;

- RangeError.checkValidRange(start, end, stringLength);

- if (end == null) end = stringLength;

- int length = end - start;

- if (length == 0) return new Uint8List(0);

- // Create a new encoder with a length that is guaranteed to be big enough.

- // A single code unit uses at most 3 bytes, a surrogate pair at most 4.

- _Utf8Encoder encoder = new _Utf8Encoder.withBufferSize(length * 3);

- int endPosition = encoder._fillBuffer(string, start, end);

- assert(endPosition >= end - 1);

- if (endPosition != end) {

- // Encoding skipped the last code unit.

- // That can only happen if the last code unit is a leadsurrogate.

- // Force encoding of the lead surrogate by itself.

- int lastCodeUnit = string.codeUnitAt(end - 1);

- assert(_isLeadSurrogate(lastCodeUnit));

- // We use a non-surrogate as `nextUnit` so that _writeSurrogate just

- // writes the lead-surrogate.

- bool wasCombined = encoder._writeSurrogate(lastCodeUnit, 0);

- assert(!wasCombined);

- }

- return encoder._buffer.sublist(0, encoder._bufferIndex);

- }

- /**

- * Starts a chunked conversion.

- *

- * The converter works more efficiently if the given [sink] is a

- * [ByteConversionSink].

- */

- StringConversionSink startChunkedConversion(Sink<List<int>> sink) {

- if (sink is! ByteConversionSink) {

- sink = new ByteConversionSink.from(sink);

- }

- return new _Utf8EncoderSink(sink);

- }

- // Override the base-classes bind, to provide a better type.

- Stream<List<int>> bind(Stream<String> stream) => super.bind(stream);

-/**

- * This class encodes Strings to UTF-8 code units (unsigned 8 bit integers).

- */

-// TODO(floitsch): make this class public.

-class _Utf8Encoder {

- int _carry = 0;

- int _bufferIndex = 0;

- final List<int> _buffer;

- static const _DEFAULT_BYTE_BUFFER_SIZE = 1024;

- _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE);

- _Utf8Encoder.withBufferSize(int bufferSize)

- : _buffer = _createBuffer(bufferSize);

- /**

- * Allow an implementation to pick the most efficient way of storing bytes.

- */

- static List<int> _createBuffer(int size) => new Uint8List(size);

- /**

- * Tries to combine the given [leadingSurrogate] with the [nextCodeUnit] and

- * writes it to [_buffer].

- *

- * Returns true if the [nextCodeUnit] was combined with the

- * [leadingSurrogate]. If it wasn't then nextCodeUnit was not a trailing

- * surrogate and has not been written yet.

- *

- * It is safe to pass 0 for [nextCodeUnit] in which case only the leading

- * surrogate is written.

- */

- bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {

- if (_isTailSurrogate(nextCodeUnit)) {

- int rune = _combineSurrogatePair(leadingSurrogate, nextCodeUnit);

- // If the rune is encoded with 2 code-units then it must be encoded

- // with 4 bytes in UTF-8.

- assert(rune > _THREE_BYTE_LIMIT);

- assert(rune <= _FOUR_BYTE_LIMIT);

- _buffer[_bufferIndex++] = 0xF0 | (rune >> 18);

- _buffer[_bufferIndex++] = 0x80 | ((rune >> 12) & 0x3f);

- _buffer[_bufferIndex++] = 0x80 | ((rune >> 6) & 0x3f);

- _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f);

- return true;

- } else {

- // TODO(floitsch): allow to throw on malformed strings.

- // Encode the half-surrogate directly into UTF-8. This yields

- // invalid UTF-8, but we started out with invalid UTF-16.

- // Surrogates are always encoded in 3 bytes in UTF-8.

- _buffer[_bufferIndex++] = 0xE0 | (leadingSurrogate >> 12);

- _buffer[_bufferIndex++] = 0x80 | ((leadingSurrogate >> 6) & 0x3f);

- _buffer[_bufferIndex++] = 0x80 | (leadingSurrogate & 0x3f);

- return false;

- }

- /**

- * Fills the [_buffer] with as many characters as possible.

- *

- * Does not encode any trailing lead-surrogate. This must be done by the

- * caller.

- *

- * Returns the position in the string. The returned index points to the

- * first code unit that hasn't been encoded.

- */

- int _fillBuffer(String str, int start, int end) {

- if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) {

- // Don't handle a trailing lead-surrogate in this loop. The caller has

- // to deal with those.

- end--;

- }

- int stringIndex;

- for (stringIndex = start; stringIndex < end; stringIndex++) {

- int codeUnit = str.codeUnitAt(stringIndex);

- // ASCII has the same representation in UTF-8 and UTF-16.

- if (codeUnit <= _ONE_BYTE_LIMIT) {

- if (_bufferIndex >= _buffer.length) break;

- _buffer[_bufferIndex++] = codeUnit;

- } else if (_isLeadSurrogate(codeUnit)) {

- if (_bufferIndex + 3 >= _buffer.length) break;

- // Note that it is safe to read the next code unit. We decremented

- // [end] above when the last valid code unit was a leading surrogate.

- int nextCodeUnit = str.codeUnitAt(stringIndex + 1);

- bool wasCombined = _writeSurrogate(codeUnit, nextCodeUnit);

- if (wasCombined) stringIndex++;

- } else {

- int rune = codeUnit;

- if (rune <= _TWO_BYTE_LIMIT) {

- if (_bufferIndex + 1 >= _buffer.length) break;

- _buffer[_bufferIndex++] = 0xC0 | (rune >> 6);

- _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f);

- } else {

- assert(rune <= _THREE_BYTE_LIMIT);

- if (_bufferIndex + 2 >= _buffer.length) break;

- _buffer[_bufferIndex++] = 0xE0 | (rune >> 12);

- _buffer[_bufferIndex++] = 0x80 | ((rune >> 6) & 0x3f);

- _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f);

- }

- return stringIndex;

- }

-/**

- * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit

- * integers).

- */

-class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {

- final ByteConversionSink _sink;

- _Utf8EncoderSink(this._sink);

- void close() {

- if (_carry != 0) {

- // addSlice will call close again, but then the carry must be equal to 0.

- addSlice("", 0, 0, true);

- return;

- }

- _sink.close();

- }

- void addSlice(String str, int start, int end, bool isLast) {

- _bufferIndex = 0;

- if (start == end && !isLast) {

- return;

- }

- if (_carry != 0) {

- int nextCodeUnit = 0;

- if (start != end) {

- nextCodeUnit = str.codeUnitAt(start);

- } else {

- assert(isLast);

- }

- bool wasCombined = _writeSurrogate(_carry, nextCodeUnit);

- // Either we got a non-empty string, or we must not have been combined.

- assert(!wasCombined || start != end );

- if (wasCombined) start++;

- _carry = 0;

- }

- do {

- start = _fillBuffer(str, start, end);

- bool isLastSlice = isLast && (start == end);

- if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) {

- if (isLast && _bufferIndex < _buffer.length - 3) {

- // There is still space for the last incomplete surrogate.

- // We use a non-surrogate as second argument. This way the

- // function will just add the surrogate-half to the buffer.

- bool hasBeenCombined = _writeSurrogate(str.codeUnitAt(start), 0);

- assert(!hasBeenCombined);

- } else {

- // Otherwise store it in the carry. If isLast is true, then

- // close will flush the last carry.

- _carry = str.codeUnitAt(start);

- }

- start++;

- }

- _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice);

- _bufferIndex = 0;

- } while (start < end);

- if (isLast) close();

- }

- // TODO(floitsch): implement asUtf8Sink. Sligthly complicated because it

- // needs to deal with malformed input.

-/**

- * This class converts UTF-8 code units (lists of unsigned 8-bit integers)

- * to a string.

- */

-class Utf8Decoder extends Converter<List<int>, String> {

- final bool _allowMalformed;

- /**

- * Instantiates a new [Utf8Decoder].

- *

- * The optional [allowMalformed] argument defines how [convert] deals

- * with invalid or unterminated character sequences.

- *

- * If it is `true` [convert] replaces invalid (or unterminated) character

- * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise

- * it throws a [FormatException].

- */

- const Utf8Decoder({ bool allowMalformed: false })

- : this._allowMalformed = allowMalformed;

- /**

- * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the

- * corresponding string.

- *

- * Uses the code units from [start] to, but no including, [end].

- * If [end] is omitted, it defaults to `codeUnits.length`.

- *

- * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this

- * character is discarded.

- */

- String convert(List<int> codeUnits, [int start = 0, int end]) {

- // Allow the implementation to intercept and specialize based on the type

- // of codeUnits.

- String result = _convertIntercepted(_allowMalformed, codeUnits, start, end);

- if (result != null) {

- return result;

- }

- int length = codeUnits.length;

- RangeError.checkValidRange(start, end, length);

- if (end == null) end = length;

- StringBuffer buffer = new StringBuffer();

- _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);

- decoder.convert(codeUnits, start, end);

- decoder.close();

- return buffer.toString();

- }

- /**

- * Starts a chunked conversion.

- *

- * The converter works more efficiently if the given [sink] is a

- * [StringConversionSink].

- */

- ByteConversionSink startChunkedConversion(Sink<String> sink) {

- StringConversionSink stringSink;

- if (sink is StringConversionSink) {

- stringSink = sink;

- } else {

- stringSink = new StringConversionSink.from(sink);

- }

- return stringSink.asUtf8Sink(_allowMalformed);

- }

- // Override the base-classes bind, to provide a better type.

- Stream<String> bind(Stream<List<int>> stream) => super.bind(stream);

- external Converter<List<int>, dynamic/*=T*/> fuse/*<T>*/(

- Converter<String, dynamic/*=T*/> next);

- external static String _convertIntercepted(

- bool allowMalformed, List<int> codeUnits, int start, int end);

-// UTF-8 constants.

-const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits

-const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits

-const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits

-const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

-// UTF-16 constants.

-const int _SURROGATE_MASK = 0xF800;

-const int _SURROGATE_TAG_MASK = 0xFC00;

-const int _SURROGATE_VALUE_MASK = 0x3FF;

-const int _LEAD_SURROGATE_MIN = 0xD800;

-const int _TAIL_SURROGATE_MIN = 0xDC00;

-bool _isLeadSurrogate(int codeUnit) =>

- (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;

-bool _isTailSurrogate(int codeUnit) =>

- (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;

-int _combineSurrogatePair(int lead, int tail) =>

- 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10)

- | (tail & _SURROGATE_VALUE_MASK);

-/**

- * Decodes UTF-8.

- *

- * The decoder handles chunked input.

- */

-// TODO(floitsch): make this class public.

-class _Utf8Decoder {

- final bool _allowMalformed;

- final StringSink _stringSink;

- bool _isFirstCharacter = true;

- int _value = 0;

- int _expectedUnits = 0;

- int _extraUnits = 0;

- _Utf8Decoder(this._stringSink, this._allowMalformed);

- bool get hasPartialInput => _expectedUnits > 0;

- // Limits of one through four byte encodings.

- static const List<int> _LIMITS = const <int>[

- _ONE_BYTE_LIMIT,

- _TWO_BYTE_LIMIT,

- _THREE_BYTE_LIMIT,

- _FOUR_BYTE_LIMIT ];

- void close() {

- flush();

- }

- /**

- * Flushes this decoder as if closed.

- *

- * This method throws if the input was partial and the decoder was

- * constructed with `allowMalformed` set to `false`.

- */

- void flush() {

- if (hasPartialInput) {

- if (!_allowMalformed) {

- throw new FormatException("Unfinished UTF-8 octet sequence");

- }

- _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);

- _value = 0;

- _expectedUnits = 0;

- _extraUnits = 0;

- }

- void convert(List<int> codeUnits, int startIndex, int endIndex) {

- int value = _value;

- int expectedUnits = _expectedUnits;

- int extraUnits = _extraUnits;

- _value = 0;

- _expectedUnits = 0;

- _extraUnits = 0;

- int scanOneByteCharacters(units, int from) {

- final to = endIndex;

- final mask = _ONE_BYTE_LIMIT;

- for (var i = from; i < to; i++) {

- final unit = units[i];

- if ((unit & mask) != unit) return i - from;

- }

- return to - from;

- }

- void addSingleBytes(int from, int to) {

- assert(from >= startIndex && from <= endIndex);

- assert(to >= startIndex && to <= endIndex);

- _stringSink.write(new String.fromCharCodes(codeUnits, from, to));

- }

- int i = startIndex;

- loop: while (true) {

- multibyte: if (expectedUnits > 0) {

- do {

- if (i == endIndex) {

- break loop;

- }

- int unit = codeUnits[i];

- if ((unit & 0xC0) != 0x80) {

- expectedUnits = 0;

- if (!_allowMalformed) {

- throw new FormatException(

- "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");

- }

- _isFirstCharacter = false;

- _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);

- break multibyte;

- } else {

- value = (value << 6) | (unit & 0x3f);

- expectedUnits--;

- i++;

- }

- } while (expectedUnits > 0);

- if (value <= _LIMITS[extraUnits - 1]) {

- // Overly long encoding. The value could be encoded with a shorter

- // encoding.

- if (!_allowMalformed) {

- throw new FormatException(

- "Overlong encoding of 0x${value.toRadixString(16)}");

- }

- expectedUnits = extraUnits = 0;

- value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

- }

- if (value > _FOUR_BYTE_LIMIT) {

- if (!_allowMalformed) {

- throw new FormatException("Character outside valid Unicode range: "

- "0x${value.toRadixString(16)}");

- }

- value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

- }

- if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {

- _stringSink.writeCharCode(value);

- }

- _isFirstCharacter = false;

- }

- while (i < endIndex) {

- int oneBytes = scanOneByteCharacters(codeUnits, i);

- if (oneBytes > 0) {

- _isFirstCharacter = false;

- addSingleBytes(i, i + oneBytes);

- i += oneBytes;

- if (i == endIndex) break;

- }

- int unit = codeUnits[i++];

- // TODO(floitsch): the way we test we could potentially allow

- // units that are too large, if they happen to have the

- // right bit-pattern. (Same is true for the multibyte loop above).

- // TODO(floitsch): optimize this loop. See:

- // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80

- if (unit < 0) {

- // TODO(floitsch): should this be unit <= 0 ?

- if (!_allowMalformed) {

- throw new FormatException(

- "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");

- }

- _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);

- } else {

- assert(unit > _ONE_BYTE_LIMIT);

- if ((unit & 0xE0) == 0xC0) {

- value = unit & 0x1F;

- expectedUnits = extraUnits = 1;

- continue loop;

- }

- if ((unit & 0xF0) == 0xE0) {

- value = unit & 0x0F;

- expectedUnits = extraUnits = 2;

- continue loop;

- }

- // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.

- if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {

- value = unit & 0x07;

- expectedUnits = extraUnits = 3;

- continue loop;

- }

- if (!_allowMalformed) {

- throw new FormatException(

- "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");

- }

- value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

- expectedUnits = extraUnits = 0;

- _isFirstCharacter = false;

- _stringSink.writeCharCode(value);

- }

- break loop;

- }

- if (expectedUnits > 0) {

- _value = value;

- _expectedUnits = expectedUnits;

- _extraUnits = extraUnits;

- }