pkg/utf/lib/utf16.dart - Issue 68563004: Move unicode tests to utf package.

Unified Diff: pkg/utf/lib/utf16.dart

Issue 68563004: Move unicode tests to utf package. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Simplify test. Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: pkg/utf/lib/utf16.dart

diff --git a/pkg/utf/lib/utf16.dart b/pkg/utf/lib/utf16.dart

index 7de9e616581296979d4f7e5f8e094a841012a4bd..438c6781a74fa78ba0944e683e5c7c3fd308c1f8 100644

--- a/pkg/utf/lib/utf16.dart

+++ b/pkg/utf/lib/utf16.dart

@@ -4,6 +4,167 @@

part of utf;

+// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).

floitsch 2013/11/18 17:08:17 Whole section copied verbatim.

Lasse Reichstein Nielsen 2013/11/19 07:42:38 I don't think I want to add anything new to the ut

floitsch 2013/11/19 10:40:32 I don't agree. The utf-package contains much more

Lasse Reichstein Nielsen 2013/11/19 12:25:43 From the same package - in that case, LGTM.

+/**

+ * Provide a list of Unicode codepoints for a given string.

+ */

+List<int> stringToCodepoints(String str) {

+ // Note: str.codeUnits gives us 16-bit code units on all Dart implementations.

+ // So we need to convert.

+ return _utf16CodeUnitsToCodepoints(str.codeUnits);

Lasse Reichstein Nielsen 2013/11/19 07:42:38 If we keep it (and I don't think we should - if it

floitsch 2013/11/19 10:40:32 This was code that already existed in the package.

+/**

+ * Generate a string from the provided Unicode codepoints.

+ *

+ * *Deprecated* Use [String.fromCharCodes] instead.

Lasse Reichstein Nielsen 2013/11/19 07:42:38 Ditto - remove this. Definitely remove the "Deprec

floitsch 2013/11/19 10:40:32 Not in this CL.

+ */

+String codepointsToString(List<int> codepoints) {

+ return new String.fromCharCodes(codepoints);

+/**

+ * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.

+ * The parameters can override the default Unicode replacement character. Set

+ * the replacementCodepoint to null to throw an ArgumentError

+ * rather than replace the bad value.

+ */

+class Utf16CodeUnitDecoder implements Iterator<int> {

Lasse Reichstein Nielsen 2013/11/19 07:42:38 Do we have a way to use a Converter to go from inp

floitsch 2013/11/19 10:40:32 Again, this is code that already existed. Not chan

+ final _ListRangeIterator utf16CodeUnitIterator;

+ final int replacementCodepoint;

+ int _current = null;

+ Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,

+ int this.replacementCodepoint =

+ UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :

+ utf16CodeUnitIterator =

+ (new _ListRange(utf16CodeUnits, offset, length)).iterator;

+ Utf16CodeUnitDecoder.fromListRangeIterator(

+ _ListRangeIterator this.utf16CodeUnitIterator,

+ int this.replacementCodepoint);

+ Iterator<int> get iterator => this;

+ int get current => _current;

+ bool moveNext() {

+ _current = null;

+ if (!utf16CodeUnitIterator.moveNext()) return false;

+ int value = utf16CodeUnitIterator.current;

+ if (value < 0) {

+ if (replacementCodepoint != null) {

+ _current = replacementCodepoint;

+ } else {

+ throw new ArgumentError(

+ "Invalid UTF16 at ${utf16CodeUnitIterator.position}");

+ }

+ } else if (value < UNICODE_UTF16_RESERVED_LO ||

+ (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {

+ // transfer directly

+ _current = value;

+ } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&

+ utf16CodeUnitIterator.moveNext()) {

+ // merge surrogate pair

+ int nextValue = utf16CodeUnitIterator.current;

+ if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&

+ nextValue <= UNICODE_UTF16_RESERVED_HI) {

+ value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10;

+ value += UNICODE_UTF16_OFFSET +

+ (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);

+ _current = value;

+ } else {

+ if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE &&

+ nextValue < UNICODE_UTF16_SURROGATE_UNIT_1_BASE) {

+ utf16CodeUnitIterator.backup();

+ }

+ if (replacementCodepoint != null) {

+ _current = replacementCodepoint;

+ } else {

+ throw new ArgumentError(

+ "Invalid UTF16 at ${utf16CodeUnitIterator.position}");

+ }

+ } else if (replacementCodepoint != null) {

+ _current = replacementCodepoint;

+ } else {

+ throw new ArgumentError(

+ "Invalid UTF16 at ${utf16CodeUnitIterator.position}");

+ }

+ return true;

+ }

+/**

+ * Encodes code points as UTF-16 code units.

+ */

+List<int> _codepointsToUtf16CodeUnits(

Lasse Reichstein Nielsen 2013/11/19 07:42:38 If this isn't used, remove it. If it is, consider

floitsch 2013/11/19 10:40:32 ditto.

+ List<int> codepoints,

+ [int offset = 0,

+ int length,

+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

+ _ListRange listRange = new _ListRange(codepoints, offset, length);

+ int encodedLength = 0;

+ for (int value in listRange) {

+ if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) ||

+ (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {

+ encodedLength++;

+ } else if (value > UNICODE_PLANE_ONE_MAX &&

+ value <= UNICODE_VALID_RANGE_MAX) {

+ encodedLength += 2;

+ } else {

+ encodedLength++;

+ }

+ List<int> codeUnitsBuffer = new List<int>(encodedLength);

+ int j = 0;

+ for (int value in listRange) {

+ if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) ||

+ (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {

+ codeUnitsBuffer[j++] = value;

+ } else if (value > UNICODE_PLANE_ONE_MAX &&

+ value <= UNICODE_VALID_RANGE_MAX) {

+ int base = value - UNICODE_UTF16_OFFSET;

+ codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE +

+ ((base & UNICODE_UTF16_HI_MASK) >> 10);

+ codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE +

+ (base & UNICODE_UTF16_LO_MASK);

+ } else if (replacementCodepoint != null) {

+ codeUnitsBuffer[j++] = replacementCodepoint;

+ } else {

+ throw new ArgumentError("Invalid encoding");

+ }

+ return codeUnitsBuffer;

+/**

+ * Decodes the UTF-16 code units to codepoints.

+ */

+List<int> _utf16CodeUnitsToCodepoints(

Lasse Reichstein Nielsen 2013/11/19 07:42:38 If not used, remove. If used, consider rewriting a

floitsch 2013/11/19 10:40:32 ditto.

+ List<int> utf16CodeUnits, [int offset = 0, int length,

+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

+ _ListRangeIterator source =

+ (new _ListRange(utf16CodeUnits, offset, length)).iterator;

+ Utf16CodeUnitDecoder decoder = new Utf16CodeUnitDecoder

+ .fromListRangeIterator(source, replacementCodepoint);

+ List<int> codepoints = new List<int>(source.remaining);

+ int i = 0;

+ while (decoder.moveNext()) {

+ codepoints[i++] = decoder.current;

+ }

+ if (i == codepoints.length) {

+ return codepoints;

+ } else {

+ List<int> codepointTrunc = new List<int>(i);

+ codepointTrunc.setRange(0, i, codepoints);

+ return codepointTrunc;

+ }

/**

* Decodes the UTF-16 bytes as an iterable. Thus, the consumer can convert only

* as much of the input as needed. Determines the byte order from the BOM,

@@ -256,7 +417,12 @@ abstract class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator {

bool moveNext() {

_current = null;

- if (utf16EncodedBytesIterator.remaining < 2) {

+ int remaining = utf16EncodedBytesIterator.remaining;

+ if (remaining == 0) {

+ _current = null;

+ return false;

+ }

+ if (remaining == 1) {

utf16EncodedBytesIterator.moveNext();

if (replacementCodepoint != null) {

_current = replacementCodepoint;

@@ -265,10 +431,9 @@ abstract class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator {

throw new ArgumentError(

"Invalid UTF16 at ${utf16EncodedBytesIterator.position}");

}

- } else {

- _current = decode();

- return true;

}

+ _current = decode();

+ return true;

}

int get position => utf16EncodedBytesIterator.position ~/ 2;

« no previous file with comments | « pkg/utf/lib/utf.dart ('k') | pkg/utf/lib/utf32.dart » ('j') | pkg/utf/test/utf16_test.dart » ('J')