| Index: mojo/public/dart/third_party/utf/lib/src/utf/utf8.dart
|
| diff --git a/mojo/public/dart/third_party/utf/lib/src/utf/utf8.dart b/mojo/public/dart/third_party/utf/lib/src/utf/utf8.dart
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..ff1b1ed81e6079b529d85649071ffa7c0a838541
|
| --- /dev/null
|
| +++ b/mojo/public/dart/third_party/utf/lib/src/utf/utf8.dart
|
| @@ -0,0 +1,276 @@
|
| +// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
|
| +// for details. All rights reserved. Use of this source code is governed by a
|
| +// BSD-style license that can be found in the LICENSE file.
|
| +
|
| +part of utf;
|
| +
|
| +const int _UTF8_ONE_BYTE_MAX = 0x7f; // Largest value encoded as one byte.
|
| +const int _UTF8_TWO_BYTE_MAX = 0x7ff; // Largest value encoded in two bytes.
|
| +const int _UTF8_THREE_BYTE_MAX = 0xffff; // Largest value encoded in three bytes.
|
| +
|
| +const int _UTF8_LO_SIX_BIT_MASK = 0x3f; // Low six bits: payload of a subsequent byte.
|
| +
|
| +const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; // Marker bits of the first byte of a 2-byte form.
|
| +const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; // Marker bits of the first byte of a 3-byte form.
|
| +const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; // Marker bits of the first byte of a 4-byte form.
|
| +const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; // Lets the decoder recognise a 5-byte first byte.
|
| +const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; // Lets the decoder recognise a 6-byte first byte.
|
| +
|
| +const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f; // Payload bits kept in the first byte of a 2-byte form.
|
| +const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf; // Payload bits kept in the first byte of a 3-byte form.
|
| +const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7; // Payload bits kept in the first byte of a 4-byte form.
|
| +
|
| +const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; // The decoder rejects first bytes at or above this.
|
| +const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80; // Marker bits OR-ed into every subsequent byte.
|
| +
|
| +/**
|
| + * Wraps [bytes] in an [IterableUtf8Decoder], so code points are decoded only
|
| + * as the result is iterated. [offset] and [length] are forwarded unchanged.
|
| + * Malformed input yields [replacementCodepoint]; pass null for it to have an
|
| + * [ArgumentError] thrown instead.
|
| + */
|
| +IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes,
|
| +    [int offset = 0, int length,
|
| +    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
|
| +    new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);
|
| +
|
| +/**
|
| + * Decodes UTF-8 [bytes] into a [String]. A [Utf8Decoder] is created from
|
| + * [bytes], [offset], [length] and [replacementCodepoint], drained with
|
| + * [Utf8Decoder.decodeRest], and its code points are made into the string.
|
| + * Pass null as [replacementCodepoint] to get an [ArgumentError] on bad input.
|
| + */
|
| +String decodeUtf8(List<int> bytes, [int offset = 0, int length,
|
| +    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
|
| +  Utf8Decoder decoder =
|
| +      new Utf8Decoder(bytes, offset, length, replacementCodepoint);
|
| +  List<int> codepoints = decoder.decodeRest();
|
| +  return new String.fromCharCodes(codepoints);
|
| +}
|
| +
|
| +/** Returns the UTF-8 encoding of [str] as a list of byte values. */
|
| +List<int> encodeUtf8(String str) {
|
| +  var codepoints = stringToCodepoints(str);
|
| +  return codepointsToUtf8(codepoints);
|
| +}
|
| +
|
| +// Fills the [bytes] slots after buffer[offset] with subsequent bytes, last
|
| +// slot first, and returns what is left of [value] for the first byte.
|
| +int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {
|
| +  for (int at = offset + bytes; at > offset; at--) {
|
| +    buffer[at] = _UTF8_SUBSEQUENT_BYTE_BASE | (value & _UTF8_LO_SIX_BIT_MASK);
|
| +    value >>= 6;
|
| +  }
|
| +  return value;
|
| +}
|
| +
|
| +/**
|
| + * Encodes [codepoints] as UTF-8 bytes; [offset] and [length] are handed to
|
| + * [ListRange] to select the values to encode. Any value below zero or above
|
| + * [UNICODE_VALID_RANGE_MAX] is emitted as the three bytes 0xef 0xbf 0xbd.
|
| + */
|
| +List<int> codepointsToUtf8(
|
| +    List<int> codepoints, [int offset = 0, int length]) {
|
| +  ListRange source = new ListRange(codepoints, offset, length);
|
| +
|
| +  // Pass 1: add up the encoded size so the result can be allocated once.
|
| +  int total = 0;
|
| +  for (int cp in source) {
|
| +    if (cp < 0 || cp > UNICODE_VALID_RANGE_MAX) {
|
| +      total += 3; // Out-of-range values take the 3-byte replacement.
|
| +      continue;
|
| +    }
|
| +    total += cp <= _UTF8_ONE_BYTE_MAX ? 1 :
|
| +        cp <= _UTF8_TWO_BYTE_MAX ? 2 :
|
| +        cp <= _UTF8_THREE_BYTE_MAX ? 3 : 4;
|
| +  }
|
| +
|
| +  // Pass 2: write each sequence; the subsequent bytes go in first, then the
|
| +  // first byte is built from the bits _addToEncoding hands back.
|
| +  List<int> out = new List<int>(total);
|
| +  int pos = 0;
|
| +  for (int cp in source) {
|
| +    if (cp < 0 || cp > UNICODE_VALID_RANGE_MAX) {
|
| +      out.setRange(pos, pos + 3, [0xef, 0xbf, 0xbd]);
|
| +      pos += 3;
|
| +    } else if (cp <= _UTF8_ONE_BYTE_MAX) {
|
| +      out[pos++] = cp;
|
| +    } else {
|
| +      int tail = cp <= _UTF8_TWO_BYTE_MAX ? 1 :
|
| +          cp <= _UTF8_THREE_BYTE_MAX ? 2 : 3;
|
| +      int high = _addToEncoding(pos, tail, cp, out);
|
| +      if (tail == 1) {
|
| +        out[pos] = _UTF8_FIRST_BYTE_OF_TWO_BASE |
|
| +            (_UTF8_FIRST_BYTE_OF_TWO_MASK & high);
|
| +      } else if (tail == 2) {
|
| +        out[pos] = _UTF8_FIRST_BYTE_OF_THREE_BASE |
|
| +            (_UTF8_FIRST_BYTE_OF_THREE_MASK & high);
|
| +      } else {
|
| +        out[pos] = _UTF8_FIRST_BYTE_OF_FOUR_BASE |
|
| +            (_UTF8_FIRST_BYTE_OF_FOUR_MASK & high);
|
| +      }
|
| +      pos += tail + 1;
|
| +    }
|
| +  }
|
| +  return out;
|
| +}
|
| +
|
| +// UTF-8 specifies its byte order, so no UTF-16/UTF-32 style handling of it.
|
| +List<int> utf8ToCodepoints(
|
| +    List<int> utf8EncodedBytes, [int offset = 0, int length,
|
| +    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
|
| +  Utf8Decoder decoder =
|
| +      new Utf8Decoder(utf8EncodedBytes, offset, length, replacementCodepoint);
|
| +  return decoder.decodeRest();
|
| +}
|
| +
|
| +/**
|
| + * Iterable returned by [decodeUtf8AsIterable]. Every read of [iterator] makes
|
| + * a new [Utf8Decoder]; bytes are decoded on demand and nothing is cached.
|
| + */
|
| +// TODO(floitsch): Consider removing the extend and switch to implements since
|
| +// that's cheaper to allocate.
|
| +class IterableUtf8Decoder extends IterableBase<int> {
|
| +  final List<int> bytes;
|
| +  final int offset, replacementCodepoint;
|
| +  final int length; // NOTE(review): overrides Iterable.length - confirm.
|
| +
|
| +  IterableUtf8Decoder(this.bytes,
|
| +      [this.offset = 0, this.length,
|
| +      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);
|
| +
|
| +  Utf8Decoder get iterator {
|
| +    return new Utf8Decoder(bytes, offset, length, replacementCodepoint);
|
| +  }
|
| +}
|
| +
|
| +/**
|
| + * Iterator that decodes UTF-8 encoded bytes into Unicode code points, one
|
| + * per call to [moveNext]. The optional constructor arguments give an offset
|
| + * and length into the byte list (both are forwarded to [ListRange]) and the
|
| + * code point to report for malformed input. Pass null as
|
| + * [replacementCodepoint] to have malformed input throw an [ArgumentError]
|
| + * instead of being replaced.
|
| + */
|
| +class Utf8Decoder implements Iterator<int> {
|
| +  // TODO(kevmoo): should this field be private?
|
| +  final ListRangeIterator utf8EncodedBytesIterator;
|
| +  final int replacementCodepoint;
|
| +  int _current = null;
|
| +
|
| +  Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
|
| +      this.replacementCodepoint =
|
| +      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
|
| +      utf8EncodedBytesIterator =
|
| +          (new ListRange(utf8EncodedBytes, offset, length)).iterator;
|
| +
|
| +
|
| +  Utf8Decoder._fromListRangeIterator(ListRange source, [
|
| +      this.replacementCodepoint =
|
| +      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
|
| +      utf8EncodedBytesIterator = source.iterator;
|
| +
|
| +  /** Decodes the remainder of the input and returns the code points as a
|
| +   * [List<int>].
|
| +   */
|
| +  List<int> decodeRest() {
|
| +    // Allocated for the iterator's remaining count; trimmed below if fewer.
|
| +    List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
|
| +    int i = 0;
|
| +    while (moveNext()) {
|
| +      codepoints[i++] = current;
|
| +    }
|
| +    if (i == codepoints.length) return codepoints;
|
| +    List<int> truncCodepoints = new List<int>(i);
|
| +    truncCodepoints.setRange(0, i, codepoints);
|
| +    return truncCodepoints;
|
| +  }
|
| +
|
| +  int get current => _current;
|
| +
|
| +  /**
|
| +   * Advances to the next code point, which is then available as [current].
|
| +   * Returns false once the input is exhausted. Malformed input produces
|
| +   * [replacementCodepoint], or an [ArgumentError] when that is null.
|
| +   */
|
| +  bool moveNext() {
|
| +    _current = null;
|
| +
|
| +    if (!utf8EncodedBytesIterator.moveNext()) return false;
|
| +
|
| +    int value = utf8EncodedBytesIterator.current;
|
| +    int additionalBytes = 0;
|
| +
|
| +    // The first byte gives the payload's high bits and the sequence length.
|
| +    if (value < 0) {
|
| +      return _malformed(0);
|
| +    } else if (value <= _UTF8_ONE_BYTE_MAX) {
|
| +      _current = value;
|
| +      return true;
|
| +    } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
|
| +      // A subsequent (continuation) byte cannot start a sequence.
|
| +      return _malformed(0);
|
| +    } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
|
| +      value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
|
| +      additionalBytes = 1;
|
| +    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
|
| +      value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
|
| +      additionalBytes = 2;
|
| +    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
|
| +      value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
|
| +      additionalBytes = 3;
|
| +    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
|
| +      value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
|
| +      additionalBytes = 4;
|
| +    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
|
| +      value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
|
| +      additionalBytes = 5;
|
| +    } else {
|
| +      return _malformed(0);
|
| +    }
|
| +    int j = 0;
|
| +    while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
|
| +      int nextValue = utf8EncodedBytesIterator.current;
|
| +      if (nextValue > _UTF8_ONE_BYTE_MAX &&
|
| +          nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
|
| +        value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
|
| +      } else {
|
| +        // Not a continuation byte, so the sequence is cut short. Hand the
|
| +        // byte back (unless it is negative) so the next call decodes it;
|
| +        // otherwise an ASCII byte following a truncated sequence is lost.
|
| +        if (nextValue >= 0) {
|
| +          utf8EncodedBytesIterator.backup();
|
| +        }
|
| +        break;
|
| +      }
|
| +      j++;
|
| +    }
|
| +    // Complete sequence whose value is outside the UTF-16 reserved range.
|
| +    bool validSequence = (j == additionalBytes && (
|
| +        value < UNICODE_UTF16_RESERVED_LO ||
|
| +        value > UNICODE_UTF16_RESERVED_HI));
|
| +    // Only 2-, 3- and 4-byte forms can pass, and only when the value needs
|
| +    // that many bytes; 5- and 6-byte forms are therefore always rejected.
|
| +    bool nonOverlong =
|
| +        (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
|
| +        (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
|
| +        (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
|
| +    bool inRange = value <= UNICODE_VALID_RANGE_MAX;
|
| +    if (validSequence && nonOverlong && inRange) {
|
| +      _current = value;
|
| +      return true;
|
| +    }
|
| +    return _malformed(j);
|
| +  }
|
| +
|
| +  // Handles malformed input: sets [replacementCodepoint] as current, or throws
|
| +  // if it is null, quoting the iterator position minus [back] in the message.
|
| +  bool _malformed(int back) {
|
| +    if (replacementCodepoint != null) {
|
| +      _current = replacementCodepoint;
|
| +      return true;
|
| +    }
|
| +    throw new ArgumentError(
|
| +        "Invalid UTF8 at ${utf8EncodedBytesIterator.position - back}");
|
| +  }
|
| +}
|
|
|