Index: mojo/public/dart/third_party/utf/lib/src/utf/utf8.dart |
diff --git a/mojo/public/dart/third_party/utf/lib/src/utf/utf8.dart b/mojo/public/dart/third_party/utf/lib/src/utf/utf8.dart |
new file mode 100644 |
index 0000000000000000000000000000000000000000..ff1b1ed81e6079b529d85649071ffa7c0a838541 |
--- /dev/null |
+++ b/mojo/public/dart/third_party/utf/lib/src/utf/utf8.dart |
@@ -0,0 +1,276 @@ |
+// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
+// for details. All rights reserved. Use of this source code is governed by a |
+// BSD-style license that can be found in the LICENSE file. |
+ |
+part of utf; |
+ |
+const int _UTF8_ONE_BYTE_MAX = 0x7f; |
+const int _UTF8_TWO_BYTE_MAX = 0x7ff; |
+const int _UTF8_THREE_BYTE_MAX = 0xffff; |
+ |
+const int _UTF8_LO_SIX_BIT_MASK = 0x3f; |
+ |
+const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; |
+const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; |
+const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; |
+const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; |
+const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; |
+ |
+const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f; |
+const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf; |
+const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7; |
+ |
+const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; |
+const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80; |
+ |
+/** |
+ * Decodes the UTF-8 bytes as an iterable. Thus, the consumer can only convert |
+ * as much of the input as needed. Set the replacementCharacter to null to |
+ * throw an ArgumentError rather than replace the bad value. |
+ */ |
+IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0, |
+ int length, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint); |
+} |
+ |
+/** |
+ * Produce a String from a List of UTF-8 encoded bytes. The parameters |
+ * can set an offset into a list of bytes (as int), limit the length of the |
+ * values to be decoded, and override the default Unicode replacement character. |
+ * Set the replacementCharacter to null to throw an ArgumentError |
+ * rather than replace the bad value. |
+ */ |
+String decodeUtf8(List<int> bytes, [int offset = 0, int length, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ return new String.fromCharCodes( |
+ (new Utf8Decoder(bytes, offset, length, replacementCodepoint)) |
+ .decodeRest()); |
+} |
+ |
+/** |
+ * Produce a sequence of UTF-8 encoded bytes from the provided string. |
+ */ |
+List<int> encodeUtf8(String str) => |
+ codepointsToUtf8(stringToCodepoints(str)); |
+ |
+int _addToEncoding(int offset, int bytes, int value, List<int> buffer) { |
+ while (bytes > 0) { |
+ buffer[offset + bytes] = _UTF8_SUBSEQUENT_BYTE_BASE | |
+ (value & _UTF8_LO_SIX_BIT_MASK); |
+ value = value >> 6; |
+ bytes--; |
+ } |
+ return value; |
+} |
+ |
+/** |
+ * Encode code points as UTF-8 code units. |
+ */ |
+List<int> codepointsToUtf8( |
+ List<int> codepoints, [int offset = 0, int length]) { |
+ ListRange source = new ListRange(codepoints, offset, length); |
+ |
+ int encodedLength = 0; |
+ for (int value in source) { |
+ if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { |
+ encodedLength += 3; |
+ } else if (value <= _UTF8_ONE_BYTE_MAX) { |
+ encodedLength++; |
+ } else if (value <= _UTF8_TWO_BYTE_MAX) { |
+ encodedLength += 2; |
+ } else if (value <= _UTF8_THREE_BYTE_MAX) { |
+ encodedLength += 3; |
+ } else if (value <= UNICODE_VALID_RANGE_MAX) { |
+ encodedLength += 4; |
+ } |
+ } |
+ |
+ List<int> encoded = new List<int>(encodedLength); |
+ int insertAt = 0; |
+ for (int value in source) { |
+ if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { |
+ encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]); |
+ insertAt += 3; |
+ } else if (value <= _UTF8_ONE_BYTE_MAX) { |
+ encoded[insertAt] = value; |
+ insertAt++; |
+ } else if (value <= _UTF8_TWO_BYTE_MAX) { |
+ encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | ( |
+ _UTF8_FIRST_BYTE_OF_TWO_MASK & |
+ _addToEncoding(insertAt, 1, value, encoded)); |
+ insertAt += 2; |
+ } else if (value <= _UTF8_THREE_BYTE_MAX) { |
+ encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE | ( |
+ _UTF8_FIRST_BYTE_OF_THREE_MASK & |
+ _addToEncoding(insertAt, 2, value, encoded)); |
+ insertAt += 3; |
+ } else if (value <= UNICODE_VALID_RANGE_MAX) { |
+ encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE | ( |
+ _UTF8_FIRST_BYTE_OF_FOUR_MASK & |
+ _addToEncoding(insertAt, 3, value, encoded)); |
+ insertAt += 4; |
+ } |
+ } |
+ return encoded; |
+} |
+ |
+// Because UTF-8 specifies byte order, we do not have to follow the pattern |
+// used by UTF-16 & UTF-32 regarding byte order. |
+List<int> utf8ToCodepoints( |
+ List<int> utf8EncodedBytes, [int offset = 0, int length, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ return new Utf8Decoder(utf8EncodedBytes, offset, length, |
+ replacementCodepoint).decodeRest(); |
+} |
+ |
+/** |
+ * Return type of [decodeUtf8AsIterable] and variants. The Iterable type |
+ * provides an iterator on demand and the iterator will only translate bytes |
+ * as requested by the user of the iterator. (Note: results are not cached.) |
+ */ |
+// TODO(floitsch): Consider removing the extend and switch to implements since |
+// that's cheaper to allocate. |
+class IterableUtf8Decoder extends IterableBase<int> { |
+ final List<int> bytes; |
+ final int offset; |
+ final int length; |
+ final int replacementCodepoint; |
+ |
+ IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, |
+ this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); |
+ |
+ Utf8Decoder get iterator => |
+ new Utf8Decoder(bytes, offset, length, replacementCodepoint); |
+} |
+ |
+/** |
+ * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The |
+ * parameters can set an offset into a list of bytes (as int), limit the length |
+ * of the values to be decoded, and override the default Unicode replacement |
+ * character. Set the replacementCharacter to null to throw an |
+ * ArgumentError rather than replace the bad value. The return value |
+ * from this method can be used as an Iterable (e.g. in a for-loop). |
+ */ |
+class Utf8Decoder implements Iterator<int> { |
+ // TODO(kevmoo): should this field be private? |
+ final ListRangeIterator utf8EncodedBytesIterator; |
+ final int replacementCodepoint; |
+ int _current = null; |
+ |
+ Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, |
+ this.replacementCodepoint = |
+ UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
+ utf8EncodedBytesIterator = |
+ (new ListRange(utf8EncodedBytes, offset, length)).iterator; |
+ |
+ |
+ Utf8Decoder._fromListRangeIterator(ListRange source, [ |
+ this.replacementCodepoint = |
+ UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
+ utf8EncodedBytesIterator = source.iterator; |
+ |
+ /** Decode the remaininder of the characters in this decoder |
+ * into a [List<int>]. |
+ */ |
+ List<int> decodeRest() { |
+ List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); |
+ int i = 0; |
+ while (moveNext()) { |
+ codepoints[i++] = current; |
+ } |
+ if (i == codepoints.length) { |
+ return codepoints; |
+ } else { |
+ List<int> truncCodepoints = new List<int>(i); |
+ truncCodepoints.setRange(0, i, codepoints); |
+ return truncCodepoints; |
+ } |
+ } |
+ |
+ int get current => _current; |
+ |
+ bool moveNext() { |
+ _current = null; |
+ |
+ if (!utf8EncodedBytesIterator.moveNext()) return false; |
+ |
+ int value = utf8EncodedBytesIterator.current; |
+ int additionalBytes = 0; |
+ |
+ if (value < 0) { |
+ if (replacementCodepoint != null) { |
+ _current = replacementCodepoint; |
+ return true; |
+ } else { |
+ throw new ArgumentError( |
+ "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
+ } |
+ } else if (value <= _UTF8_ONE_BYTE_MAX) { |
+ _current = value; |
+ return true; |
+ } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
+ if (replacementCodepoint != null) { |
+ _current = replacementCodepoint; |
+ return true; |
+ } else { |
+ throw new ArgumentError( |
+ "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
+ } |
+ } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { |
+ value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; |
+ additionalBytes = 1; |
+ } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { |
+ value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; |
+ additionalBytes = 2; |
+ } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { |
+ value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; |
+ additionalBytes = 3; |
+ } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { |
+ value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; |
+ additionalBytes = 4; |
+ } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { |
+ value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; |
+ additionalBytes = 5; |
+ } else if (replacementCodepoint != null) { |
+ _current = replacementCodepoint; |
+ return true; |
+ } else { |
+ throw new ArgumentError( |
+ "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
+ } |
+ int j = 0; |
+ while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) { |
+ int nextValue = utf8EncodedBytesIterator.current; |
+ if (nextValue > _UTF8_ONE_BYTE_MAX && |
+ nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
+ value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); |
+ } else { |
+ // if sequence-starting code unit, reposition cursor to start here |
+ if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
+ utf8EncodedBytesIterator.backup(); |
+ } |
+ break; |
+ } |
+ j++; |
+ } |
+ bool validSequence = (j == additionalBytes && ( |
+ value < UNICODE_UTF16_RESERVED_LO || |
+ value > UNICODE_UTF16_RESERVED_HI)); |
+ bool nonOverlong = |
+ (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || |
+ (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || |
+ (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); |
+ bool inRange = value <= UNICODE_VALID_RANGE_MAX; |
+ if (validSequence && nonOverlong && inRange) { |
+ _current = value; |
+ return true; |
+ } else if (replacementCodepoint != null) { |
+ _current = replacementCodepoint; |
+ return true; |
+ } else { |
+ throw new ArgumentError( |
+ "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); |
+ } |
+ } |
+} |