Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(117)

Unified Diff: pkg/utf/lib/utf16.dart

Issue 418433003: pkg/utf: fixed layout, added todos, updated docs and homepage pubspec links (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « pkg/utf/lib/utf.dart ('k') | pkg/utf/lib/utf32.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: pkg/utf/lib/utf16.dart
diff --git a/pkg/utf/lib/utf16.dart b/pkg/utf/lib/utf16.dart
deleted file mode 100644
index 438c6781a74fa78ba0944e683e5c7c3fd308c1f8..0000000000000000000000000000000000000000
--- a/pkg/utf/lib/utf16.dart
+++ /dev/null
@@ -1,502 +0,0 @@
-// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
-// for details. All rights reserved. Use of this source code is governed by a
-// BSD-style license that can be found in the LICENSE file.
-
-part of utf;
-
-// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
-/**
- * Provide a list of Unicode codepoints for a given string.
- */
-List<int> stringToCodepoints(String str) {
- // Note: str.codeUnits gives us 16-bit code units on all Dart implementations.
- // So we need to convert.
- return _utf16CodeUnitsToCodepoints(str.codeUnits);
-}
-
-/**
- * Generate a string from the provided Unicode codepoints.
- *
- * *Deprecated* Use [String.fromCharCodes] instead.
- */
-String codepointsToString(List<int> codepoints) {
- return new String.fromCharCodes(codepoints);
-}
-
-/**
- * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.
- * The parameters can override the default Unicode replacement character. Set
- * the replacementCharacter to null to throw an ArgumentError
- * rather than replace the bad value.
- */
-class Utf16CodeUnitDecoder implements Iterator<int> {
- final _ListRangeIterator utf16CodeUnitIterator;
- final int replacementCodepoint;
- int _current = null;
-
- Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
- int this.replacementCodepoint =
- UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
- utf16CodeUnitIterator =
- (new _ListRange(utf16CodeUnits, offset, length)).iterator;
-
- Utf16CodeUnitDecoder.fromListRangeIterator(
- _ListRangeIterator this.utf16CodeUnitIterator,
- int this.replacementCodepoint);
-
- Iterator<int> get iterator => this;
-
- int get current => _current;
-
- bool moveNext() {
- _current = null;
- if (!utf16CodeUnitIterator.moveNext()) return false;
-
- int value = utf16CodeUnitIterator.current;
- if (value < 0) {
- if (replacementCodepoint != null) {
- _current = replacementCodepoint;
- } else {
- throw new ArgumentError(
- "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
- }
- } else if (value < UNICODE_UTF16_RESERVED_LO ||
- (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {
- // transfer directly
- _current = value;
- } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
- utf16CodeUnitIterator.moveNext()) {
- // merge surrogate pair
- int nextValue = utf16CodeUnitIterator.current;
- if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
- nextValue <= UNICODE_UTF16_RESERVED_HI) {
- value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10;
- value += UNICODE_UTF16_OFFSET +
- (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);
- _current = value;
- } else {
- if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE &&
- nextValue < UNICODE_UTF16_SURROGATE_UNIT_1_BASE) {
- utf16CodeUnitIterator.backup();
- }
- if (replacementCodepoint != null) {
- _current = replacementCodepoint;
- } else {
- throw new ArgumentError(
- "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
- }
- }
- } else if (replacementCodepoint != null) {
- _current = replacementCodepoint;
- } else {
- throw new ArgumentError(
- "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
- }
- return true;
- }
-}
-
-/**
- * Encode code points as UTF16 code units.
- */
-List<int> _codepointsToUtf16CodeUnits(
- List<int> codepoints,
- [int offset = 0,
- int length,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
-
- _ListRange listRange = new _ListRange(codepoints, offset, length);
- int encodedLength = 0;
- for (int value in listRange) {
- if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) ||
- (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {
- encodedLength++;
- } else if (value > UNICODE_PLANE_ONE_MAX &&
- value <= UNICODE_VALID_RANGE_MAX) {
- encodedLength += 2;
- } else {
- encodedLength++;
- }
- }
-
- List<int> codeUnitsBuffer = new List<int>(encodedLength);
- int j = 0;
- for (int value in listRange) {
- if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) ||
- (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {
- codeUnitsBuffer[j++] = value;
- } else if (value > UNICODE_PLANE_ONE_MAX &&
- value <= UNICODE_VALID_RANGE_MAX) {
- int base = value - UNICODE_UTF16_OFFSET;
- codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE +
- ((base & UNICODE_UTF16_HI_MASK) >> 10);
- codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE +
- (base & UNICODE_UTF16_LO_MASK);
- } else if (replacementCodepoint != null) {
- codeUnitsBuffer[j++] = replacementCodepoint;
- } else {
- throw new ArgumentError("Invalid encoding");
- }
- }
- return codeUnitsBuffer;
-}
-
-/**
- * Decodes the utf16 codeunits to codepoints.
- */
-List<int> _utf16CodeUnitsToCodepoints(
- List<int> utf16CodeUnits, [int offset = 0, int length,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- _ListRangeIterator source =
- (new _ListRange(utf16CodeUnits, offset, length)).iterator;
- Utf16CodeUnitDecoder decoder = new Utf16CodeUnitDecoder
- .fromListRangeIterator(source, replacementCodepoint);
- List<int> codepoints = new List<int>(source.remaining);
- int i = 0;
- while (decoder.moveNext()) {
- codepoints[i++] = decoder.current;
- }
- if (i == codepoints.length) {
- return codepoints;
- } else {
- List<int> codepointTrunc = new List<int>(i);
- codepointTrunc.setRange(0, i, codepoints);
- return codepointTrunc;
- }
-}
-
-/**
- * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert
- * as much of the input as needed. Determines the byte order from the BOM,
- * or uses big-endian as a default. This method always strips a leading BOM.
- * Set the [replacementCodepoint] to null to throw an ArgumentError
- * rather than replace the bad value. The default value for
- * [replacementCodepoint] is U+FFFD.
- */
-IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0,
- int length, int replacementCodepoint =
- UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- return new IterableUtf16Decoder._(
- () => new Utf16BytesToCodeUnitsDecoder(bytes, offset, length,
- replacementCodepoint), replacementCodepoint);
-}
-
-/**
- * Decodes the UTF-16BE bytes as an iterable. Thus, the consumer can only
- * convert as much of the input as needed. This method strips a leading BOM by
- * default, but can be overridden by setting the optional parameter [stripBom]
- * to false. Set the [replacementCodepoint] to null to throw an
- * ArgumentError rather than replace the bad value. The default
- * value for the [replacementCodepoint] is U+FFFD.
- */
-IterableUtf16Decoder decodeUtf16beAsIterable(List<int> bytes, [int offset = 0,
- int length, bool stripBom = true, int replacementCodepoint =
- UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- return new IterableUtf16Decoder._(
- () => new Utf16beBytesToCodeUnitsDecoder(bytes, offset, length, stripBom,
- replacementCodepoint), replacementCodepoint);
-}
-
-/**
- * Decodes the UTF-16LE bytes as an iterable. Thus, the consumer can only
- * convert as much of the input as needed. This method strips a leading BOM by
- * default, but can be overridden by setting the optional parameter [stripBom]
- * to false. Set the [replacementCodepoint] to null to throw an
- * ArgumentError rather than replace the bad value. The default
- * value for the [replacementCodepoint] is U+FFFD.
- */
-IterableUtf16Decoder decodeUtf16leAsIterable(List<int> bytes, [int offset = 0,
- int length, bool stripBom = true, int replacementCodepoint =
- UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- return new IterableUtf16Decoder._(
- () => new Utf16leBytesToCodeUnitsDecoder(bytes, offset, length, stripBom,
- replacementCodepoint), replacementCodepoint);
-}
-
-/**
- * Produce a String from a sequence of UTF-16 encoded bytes. This method always
- * strips a leading BOM. Set the [replacementCodepoint] to null to throw an
- * ArgumentError rather than replace the bad value. The default
- * value for the [replacementCodepoint] is U+FFFD.
- */
-String decodeUtf16(List<int> bytes, [int offset = 0, int length,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- Utf16BytesToCodeUnitsDecoder decoder = new Utf16BytesToCodeUnitsDecoder(bytes,
- offset, length, replacementCodepoint);
- List<int> codeunits = decoder.decodeRest();
- return new String.fromCharCodes(
- _utf16CodeUnitsToCodepoints(codeunits, 0, null, replacementCodepoint));
-}
-
-/**
- * Produce a String from a sequence of UTF-16BE encoded bytes. This method
- * strips a leading BOM by default, but can be overridden by setting the
- * optional parameter [stripBom] to false. Set the [replacementCodepoint] to
- * null to throw an ArgumentError rather than replace the bad value.
- * The default value for the [replacementCodepoint] is U+FFFD.
- */
-String decodeUtf16be(List<int> bytes, [int offset = 0, int length,
- bool stripBom = true,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- List<int> codeunits = (new Utf16beBytesToCodeUnitsDecoder(bytes, offset,
- length, stripBom, replacementCodepoint)).decodeRest();
- return new String.fromCharCodes(
- _utf16CodeUnitsToCodepoints(codeunits, 0, null, replacementCodepoint));
-}
-
-/**
- * Produce a String from a sequence of UTF-16LE encoded bytes. This method
- * strips a leading BOM by default, but can be overridden by setting the
- * optional parameter [stripBom] to false. Set the [replacementCodepoint] to
- * null to throw an ArgumentError rather than replace the bad value.
- * The default value for the [replacementCodepoint] is U+FFFD.
- */
-String decodeUtf16le(List<int> bytes, [int offset = 0, int length,
- bool stripBom = true,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- List<int> codeunits = (new Utf16leBytesToCodeUnitsDecoder(bytes, offset,
- length, stripBom, replacementCodepoint)).decodeRest();
- return new String.fromCharCodes(
- _utf16CodeUnitsToCodepoints(codeunits, 0, null, replacementCodepoint));
-}
-
-/**
- * Produce a list of UTF-16 encoded bytes. This method prefixes the resulting
- * bytes with a big-endian byte-order-marker.
- */
-List<int> encodeUtf16(String str) =>
- encodeUtf16be(str, true);
-
-/**
- * Produce a list of UTF-16BE encoded bytes. By default, this method produces
- * UTF-16BE bytes with no BOM.
- */
-List<int> encodeUtf16be(String str, [bool writeBOM = false]) {
- List<int> utf16CodeUnits = _stringToUtf16CodeUnits(str);
- List<int> encoding =
- new List<int>(2 * utf16CodeUnits.length + (writeBOM ? 2 : 0));
- int i = 0;
- if (writeBOM) {
- encoding[i++] = UNICODE_UTF_BOM_HI;
- encoding[i++] = UNICODE_UTF_BOM_LO;
- }
- for (int unit in utf16CodeUnits) {
- encoding[i++] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
- encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK;
- }
- return encoding;
-}
-
-/**
- * Produce a list of UTF-16LE encoded bytes. By default, this method produces
- * UTF-16LE bytes with no BOM.
- */
-List<int> encodeUtf16le(String str, [bool writeBOM = false]) {
- List<int> utf16CodeUnits = _stringToUtf16CodeUnits(str);
- List<int> encoding =
- new List<int>(2 * utf16CodeUnits.length + (writeBOM ? 2 : 0));
- int i = 0;
- if (writeBOM) {
- encoding[i++] = UNICODE_UTF_BOM_LO;
- encoding[i++] = UNICODE_UTF_BOM_HI;
- }
- for (int unit in utf16CodeUnits) {
- encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK;
- encoding[i++] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
- }
- return encoding;
-}
-
-/**
- * Identifies whether a List of bytes starts (based on offset) with a
- * byte-order marker (BOM).
- */
-bool hasUtf16Bom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
- return hasUtf16beBom(utf32EncodedBytes, offset, length) ||
- hasUtf16leBom(utf32EncodedBytes, offset, length);
-}
-
-/**
- * Identifies whether a List of bytes starts (based on offset) with a
- * big-endian byte-order marker (BOM).
- */
-bool hasUtf16beBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
- int end = length != null ? offset + length : utf16EncodedBytes.length;
- return (offset + 2) <= end &&
- utf16EncodedBytes[offset] == UNICODE_UTF_BOM_HI &&
- utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_LO;
-}
-
-/**
- * Identifies whether a List of bytes starts (based on offset) with a
- * little-endian byte-order marker (BOM).
- */
-bool hasUtf16leBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
- int end = length != null ? offset + length : utf16EncodedBytes.length;
- return (offset + 2) <= end &&
- utf16EncodedBytes[offset] == UNICODE_UTF_BOM_LO &&
- utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI;
-}
-
-List<int> _stringToUtf16CodeUnits(String str) {
- return _codepointsToUtf16CodeUnits(str.codeUnits);
-}
-
-typedef _ListRangeIterator _CodeUnitsProvider();
-
-/**
- * Return type of [decodeUtf16AsIterable] and variants. The Iterable type
- * provides an iterator on demand and the iterator will only translate bytes
- * as requested by the user of the iterator. (Note: results are not cached.)
- */
-// TODO(floitsch): Consider removing the extend and switch to implements since
-// that's cheaper to allocate.
-class IterableUtf16Decoder extends IterableBase<int> {
- final _CodeUnitsProvider codeunitsProvider;
- final int replacementCodepoint;
-
- IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint);
-
- Utf16CodeUnitDecoder get iterator =>
- new Utf16CodeUnitDecoder.fromListRangeIterator(codeunitsProvider(),
- replacementCodepoint);
-}
-
-/**
- * Convert UTF-16 encoded bytes to UTF-16 code units by grouping 1-2 bytes
- * to produce the code unit (0-(2^16)-1). Relies on BOM to determine
- * endian-ness, and defaults to BE.
- */
-abstract class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator {
- final _ListRangeIterator utf16EncodedBytesIterator;
- final int replacementCodepoint;
- int _current = null;
-
- Utf16BytesToCodeUnitsDecoder._fromListRangeIterator(
- this.utf16EncodedBytesIterator, this.replacementCodepoint);
-
- factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
- int offset = 0, int length,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- if (length == null) {
- length = utf16EncodedBytes.length - offset;
- }
- if (hasUtf16beBom(utf16EncodedBytes, offset, length)) {
- return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
- length - 2, false, replacementCodepoint);
- } else if (hasUtf16leBom(utf16EncodedBytes, offset, length)) {
- return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
- length - 2, false, replacementCodepoint);
- } else {
- return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset,
- length, false, replacementCodepoint);
- }
- }
-
- /**
- * Provides a fast way to decode the rest of the source bytes in a single
- * call. This method trades memory for improved speed in that it potentially
- * over-allocates the List containing results.
- */
- List<int> decodeRest() {
- List<int> codeunits = new List<int>(remaining);
- int i = 0;
- while (moveNext()) {
- codeunits[i++] = current;
- }
- if (i == codeunits.length) {
- return codeunits;
- } else {
- List<int> truncCodeunits = new List<int>(i);
- truncCodeunits.setRange(0, i, codeunits);
- return truncCodeunits;
- }
- }
-
- int get current => _current;
-
- bool moveNext() {
- _current = null;
- int remaining = utf16EncodedBytesIterator.remaining;
- if (remaining == 0) {
- _current = null;
- return false;
- }
- if (remaining == 1) {
- utf16EncodedBytesIterator.moveNext();
- if (replacementCodepoint != null) {
- _current = replacementCodepoint;
- return true;
- } else {
- throw new ArgumentError(
- "Invalid UTF16 at ${utf16EncodedBytesIterator.position}");
- }
- }
- _current = decode();
- return true;
- }
-
- int get position => utf16EncodedBytesIterator.position ~/ 2;
-
- void backup([int by = 1]) {
- utf16EncodedBytesIterator.backup(2 * by);
- }
-
- int get remaining => (utf16EncodedBytesIterator.remaining + 1) ~/ 2;
-
- void skip([int count = 1]) {
- utf16EncodedBytesIterator.skip(2 * count);
- }
-
- int decode();
-}
-
-/**
- * Convert UTF-16BE encoded bytes to utf16 code units by grouping 1-2 bytes
- * to produce the code unit (0-(2^16)-1).
- */
-class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
- Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
- int offset = 0, int length, bool stripBom = true,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
- super._fromListRangeIterator(
- (new _ListRange(utf16EncodedBytes, offset, length)).iterator,
- replacementCodepoint) {
- if (stripBom && hasUtf16beBom(utf16EncodedBytes, offset, length)) {
- skip();
- }
- }
-
- int decode() {
- utf16EncodedBytesIterator.moveNext();
- int hi = utf16EncodedBytesIterator.current;
- utf16EncodedBytesIterator.moveNext();
- int lo = utf16EncodedBytesIterator.current;
- return (hi << 8) + lo;
- }
-}
-
-/**
- * Convert UTF-16LE encoded bytes to utf16 code units by grouping 1-2 bytes
- * to produce the code unit (0-(2^16)-1).
- */
-class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
- Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
- int offset = 0, int length, bool stripBom = true,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
- super._fromListRangeIterator(
- (new _ListRange(utf16EncodedBytes, offset, length)).iterator,
- replacementCodepoint) {
- if (stripBom && hasUtf16leBom(utf16EncodedBytes, offset, length)) {
- skip();
- }
- }
-
- int decode() {
- utf16EncodedBytesIterator.moveNext();
- int lo = utf16EncodedBytesIterator.current;
- utf16EncodedBytesIterator.moveNext();
- int hi = utf16EncodedBytesIterator.current;
- return (hi << 8) + lo;
- }
-}
« no previous file with comments | « pkg/utf/lib/utf.dart ('k') | pkg/utf/lib/utf32.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698