Index: mojo/public/dart/third_party/utf/lib/src/utf/utf32.dart |
diff --git a/mojo/public/dart/third_party/utf/lib/src/utf/utf32.dart b/mojo/public/dart/third_party/utf/lib/src/utf/utf32.dart |
new file mode 100644 |
index 0000000000000000000000000000000000000000..e51009d6bbac57f4afe2bf7885bcf66121918913 |
--- /dev/null |
+++ b/mojo/public/dart/third_party/utf/lib/src/utf/utf32.dart |
@@ -0,0 +1,343 @@ |
+// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
+// for details. All rights reserved. Use of this source code is governed by a |
+// BSD-style license that can be found in the LICENSE file. |
+ |
+part of utf; |
+ |
+/** |
+ * Decodes the UTF-32 bytes as an iterable. Thus, the consumer can only convert |
+ * as much of the input as needed. Determines the byte order from the BOM, |
+ * or uses big-endian as a default. This method always strips a leading BOM. |
+ * Set the replacementCharacter to null to throw an ArgumentError |
+ * rather than replace the bad value. |
+ */ |
+IterableUtf32Decoder decodeUtf32AsIterable(List<int> bytes, [ |
+ int offset = 0, int length, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ return new IterableUtf32Decoder._( |
+ () => new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint)); |
+} |
+ |
+/** |
+ * Decodes the UTF-32BE bytes as an iterable. Thus, the consumer can only convert |
+ * as much of the input as needed. This method strips a leading BOM by default, |
+ * but can be overridden by setting the optional parameter [stripBom] to false. |
+ * Set the replacementCharacter to null to throw an ArgumentError |
+ * rather than replace the bad value. |
+ */ |
+IterableUtf32Decoder decodeUtf32beAsIterable(List<int> bytes, [ |
+ int offset = 0, int length, bool stripBom = true, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ return new IterableUtf32Decoder._( |
+ () => new Utf32beBytesDecoder(bytes, offset, length, stripBom, |
+ replacementCodepoint)); |
+} |
+ |
+/** |
+ * Decodes the UTF-32LE bytes as an iterable. Thus, the consumer can only convert |
+ * as much of the input as needed. This method strips a leading BOM by default, |
+ * but can be overridden by setting the optional parameter [stripBom] to false. |
+ * Set the replacementCharacter to null to throw an ArgumentError |
+ * rather than replace the bad value. |
+ */ |
+IterableUtf32Decoder decodeUtf32leAsIterable(List<int> bytes, [ |
+ int offset = 0, int length, bool stripBom = true, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ return new IterableUtf32Decoder._( |
+ () => new Utf32leBytesDecoder(bytes, offset, length, stripBom, |
+ replacementCodepoint)); |
+} |
+ |
+/** |
+ * Produce a String from a sequence of UTF-32 encoded bytes. The parameters |
+ * allow an offset into a list of bytes (as int), limiting the length of the |
+ * values be decoded and the ability of override the default Unicode |
+ * replacement character. Set the replacementCharacter to null to throw an |
+ * ArgumentError rather than replace the bad value. |
+ */ |
+String decodeUtf32(List<int> bytes, [int offset = 0, int length, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ return new String.fromCharCodes((new Utf32BytesDecoder(bytes, offset, length, |
+ replacementCodepoint)).decodeRest()); |
+} |
+/** |
+ * Produce a String from a sequence of UTF-32BE encoded bytes. The parameters |
+ * allow an offset into a list of bytes (as int), limiting the length of the |
+ * values be decoded and the ability of override the default Unicode |
+ * replacement character. Set the replacementCharacter to null to throw an |
+ * ArgumentError rather than replace the bad value. |
+ */ |
+String decodeUtf32be( |
+ List<int> bytes, [int offset = 0, int length, bool stripBom = true, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) => |
+ new String.fromCharCodes((new Utf32beBytesDecoder(bytes, offset, length, |
+ stripBom, replacementCodepoint)).decodeRest()); |
+ |
+/** |
+ * Produce a String from a sequence of UTF-32LE encoded bytes. The parameters |
+ * allow an offset into a list of bytes (as int), limiting the length of the |
+ * values be decoded and the ability of override the default Unicode |
+ * replacement character. Set the replacementCharacter to null to throw an |
+ * ArgumentError rather than replace the bad value. |
+ */ |
+String decodeUtf32le( |
+ List<int> bytes, [int offset = 0, int length, bool stripBom = true, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) => |
+ new String.fromCharCodes((new Utf32leBytesDecoder(bytes, offset, length, |
+ stripBom, replacementCodepoint)).decodeRest()); |
+ |
+/** |
+ * Produce a list of UTF-32 encoded bytes. This method prefixes the resulting |
+ * bytes with a big-endian byte-order-marker. |
+ */ |
+List<int> encodeUtf32(String str) => |
+ encodeUtf32be(str, true); |
+ |
+/** |
+ * Produce a list of UTF-32BE encoded bytes. By default, this method produces |
+ * UTF-32BE bytes with no BOM. |
+ */ |
+List<int> encodeUtf32be(String str, [bool writeBOM = false]) { |
+ List<int> utf32CodeUnits = stringToCodepoints(str); |
+ List<int> encoding = new List<int>(4 * utf32CodeUnits.length + |
+ (writeBOM ? 4 : 0)); |
+ int i = 0; |
+ if (writeBOM) { |
+ encoding[i++] = 0; |
+ encoding[i++] = 0; |
+ encoding[i++] = UNICODE_UTF_BOM_HI; |
+ encoding[i++] = UNICODE_UTF_BOM_LO; |
+ } |
+ for (int unit in utf32CodeUnits) { |
+ encoding[i++] = (unit >> 24) & UNICODE_BYTE_ZERO_MASK; |
+ encoding[i++] = (unit >> 16) & UNICODE_BYTE_ZERO_MASK; |
+ encoding[i++] = (unit >> 8) & UNICODE_BYTE_ZERO_MASK; |
+ encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK; |
+ } |
+ return encoding; |
+} |
+ |
+/** |
+ * Produce a list of UTF-32LE encoded bytes. By default, this method produces |
+ * UTF-32BE bytes with no BOM. |
+ */ |
+List<int> encodeUtf32le(String str, [bool writeBOM = false]) { |
+ List<int> utf32CodeUnits = stringToCodepoints(str); |
+ List<int> encoding = new List<int>(4 * utf32CodeUnits.length + |
+ (writeBOM ? 4 : 0)); |
+ int i = 0; |
+ if (writeBOM) { |
+ encoding[i++] = UNICODE_UTF_BOM_LO; |
+ encoding[i++] = UNICODE_UTF_BOM_HI; |
+ encoding[i++] = 0; |
+ encoding[i++] = 0; |
+ } |
+ for (int unit in utf32CodeUnits) { |
+ encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK; |
+ encoding[i++] = (unit >> 8) & UNICODE_BYTE_ZERO_MASK; |
+ encoding[i++] = (unit >> 16) & UNICODE_BYTE_ZERO_MASK; |
+ encoding[i++] = (unit >> 24) & UNICODE_BYTE_ZERO_MASK; |
+ } |
+ return encoding; |
+} |
+ |
+/** |
+ * Identifies whether a List of bytes starts (based on offset) with a |
+ * byte-order marker (BOM). |
+ */ |
+bool hasUtf32Bom( |
+ List<int> utf32EncodedBytes, [int offset = 0, int length]) { |
+ return hasUtf32beBom(utf32EncodedBytes, offset, length) || |
+ hasUtf32leBom(utf32EncodedBytes, offset, length); |
+} |
+ |
+/** |
+ * Identifies whether a List of bytes starts (based on offset) with a |
+ * big-endian byte-order marker (BOM). |
+ */ |
+bool hasUtf32beBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) { |
+ int end = length != null ? offset + length : utf32EncodedBytes.length; |
+ return (offset + 4) <= end && |
+ utf32EncodedBytes[offset] == 0 && utf32EncodedBytes[offset + 1] == 0 && |
+ utf32EncodedBytes[offset + 2] == UNICODE_UTF_BOM_HI && |
+ utf32EncodedBytes[offset + 3] == UNICODE_UTF_BOM_LO; |
+} |
+ |
+/** |
+ * Identifies whether a List of bytes starts (based on offset) with a |
+ * little-endian byte-order marker (BOM). |
+ */ |
+bool hasUtf32leBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) { |
+ int end = length != null ? offset + length : utf32EncodedBytes.length; |
+ return (offset + 4) <= end && |
+ utf32EncodedBytes[offset] == UNICODE_UTF_BOM_LO && |
+ utf32EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI && |
+ utf32EncodedBytes[offset + 2] == 0 && utf32EncodedBytes[offset + 3] == 0; |
+} |
+ |
+typedef Utf32BytesDecoder Utf32BytesDecoderProvider(); |
+ |
+/** |
+ * Return type of [decodeUtf32AsIterable] and variants. The Iterable type |
+ * provides an iterator on demand and the iterator will only translate bytes |
+ * as requested by the user of the iterator. (Note: results are not cached.) |
+ */ |
+// TODO(floitsch): Consider removing the extend and switch to implements since |
+// that's cheaper to allocate. |
+class IterableUtf32Decoder extends IterableBase<int> { |
+ final Utf32BytesDecoderProvider codeunitsProvider; |
+ |
+ IterableUtf32Decoder._(this.codeunitsProvider); |
+ |
+ Utf32BytesDecoder get iterator => codeunitsProvider(); |
+} |
+ |
+/** |
+ * Abstrace parent class converts encoded bytes to codepoints. |
+ */ |
+abstract class Utf32BytesDecoder implements ListRangeIterator { |
+ // TODO(kevmoo): should this field be private? |
+ final ListRangeIterator utf32EncodedBytesIterator; |
+ final int replacementCodepoint; |
+ int _current = null; |
+ |
+ Utf32BytesDecoder._fromListRangeIterator( |
+ this.utf32EncodedBytesIterator, this.replacementCodepoint); |
+ |
+ factory Utf32BytesDecoder(List<int> utf32EncodedBytes, [ |
+ int offset = 0, int length, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ if (length == null) { |
+ length = utf32EncodedBytes.length - offset; |
+ } |
+ if (hasUtf32beBom(utf32EncodedBytes, offset, length)) { |
+ return new Utf32beBytesDecoder(utf32EncodedBytes, offset + 4, length - 4, |
+ false, replacementCodepoint); |
+ } else if (hasUtf32leBom(utf32EncodedBytes, offset, length)) { |
+ return new Utf32leBytesDecoder(utf32EncodedBytes, offset + 4, length - 4, |
+ false, replacementCodepoint); |
+ } else { |
+ return new Utf32beBytesDecoder(utf32EncodedBytes, offset, length, false, |
+ replacementCodepoint); |
+ } |
+ } |
+ |
+ List<int> decodeRest() { |
+ List<int> codeunits = new List<int>(remaining); |
+ int i = 0; |
+ while (moveNext()) { |
+ codeunits[i++] = current; |
+ } |
+ return codeunits; |
+ } |
+ |
+ int get current => _current; |
+ |
+ bool moveNext() { |
+ _current = null; |
+ int remaining = utf32EncodedBytesIterator.remaining; |
+ if (remaining == 0) { |
+ _current = null; |
+ return false; |
+ } |
+ if (remaining < 4) { |
+ utf32EncodedBytesIterator.skip(utf32EncodedBytesIterator.remaining); |
+ if (replacementCodepoint != null) { |
+ _current = replacementCodepoint; |
+ return true; |
+ } else { |
+ throw new ArgumentError( |
+ "Invalid UTF32 at ${utf32EncodedBytesIterator.position}"); |
+ } |
+ } |
+ int codepoint = decode(); |
+ if (_validCodepoint(codepoint)) { |
+ _current = codepoint; |
+ return true; |
+ } else if (replacementCodepoint != null) { |
+ _current = replacementCodepoint; |
+ return true; |
+ } else { |
+ throw new ArgumentError( |
+ "Invalid UTF32 at ${utf32EncodedBytesIterator.position}"); |
+ } |
+ } |
+ |
+ int get position => utf32EncodedBytesIterator.position ~/ 4; |
+ |
+ void backup([int by = 1]) { |
+ utf32EncodedBytesIterator.backup(4 * by); |
+ } |
+ |
+ int get remaining => (utf32EncodedBytesIterator.remaining + 3) ~/ 4; |
+ |
+ void skip([int count = 1]) { |
+ utf32EncodedBytesIterator.skip(4 * count); |
+ } |
+ |
+ int decode(); |
+} |
+ |
+/** |
+ * Convert UTF-32BE encoded bytes to codepoints by grouping 4 bytes |
+ * to produce the unicode codepoint. |
+ */ |
+class Utf32beBytesDecoder extends Utf32BytesDecoder { |
+ Utf32beBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0, |
+ int length, bool stripBom = true, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
+ super._fromListRangeIterator( |
+ (new ListRange(utf32EncodedBytes, offset, length)).iterator, |
+ replacementCodepoint) { |
+ if (stripBom && hasUtf32beBom(utf32EncodedBytes, offset, length)) { |
+ skip(); |
+ } |
+ } |
+ |
+ int decode() { |
+ utf32EncodedBytesIterator.moveNext(); |
+ int value = utf32EncodedBytesIterator.current; |
+ utf32EncodedBytesIterator.moveNext(); |
+ value = (value << 8) + utf32EncodedBytesIterator.current; |
+ utf32EncodedBytesIterator.moveNext(); |
+ value = (value << 8) + utf32EncodedBytesIterator.current; |
+ utf32EncodedBytesIterator.moveNext(); |
+ value = (value << 8) + utf32EncodedBytesIterator.current; |
+ return value; |
+ } |
+} |
+ |
+/** |
+ * Convert UTF-32BE encoded bytes to codepoints by grouping 4 bytes |
+ * to produce the unicode codepoint. |
+ */ |
+class Utf32leBytesDecoder extends Utf32BytesDecoder { |
+ Utf32leBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0, |
+ int length, bool stripBom = true, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
+ super._fromListRangeIterator( |
+ (new ListRange(utf32EncodedBytes, offset, length)).iterator, |
+ replacementCodepoint) { |
+ if (stripBom && hasUtf32leBom(utf32EncodedBytes, offset, length)) { |
+ skip(); |
+ } |
+ } |
+ |
+ int decode() { |
+ utf32EncodedBytesIterator.moveNext(); |
+ int value = utf32EncodedBytesIterator.current; |
+ utf32EncodedBytesIterator.moveNext(); |
+ value += (utf32EncodedBytesIterator.current << 8); |
+ utf32EncodedBytesIterator.moveNext(); |
+ value += (utf32EncodedBytesIterator.current << 16); |
+ utf32EncodedBytesIterator.moveNext(); |
+ value += (utf32EncodedBytesIterator.current << 24); |
+ return value; |
+ } |
+} |
+ |
+bool _validCodepoint(int codepoint) { |
+ return (codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO) || |
+ (codepoint > UNICODE_UTF16_RESERVED_HI && |
+ codepoint < UNICODE_VALID_RANGE_MAX); |
+} |