Chromium Code Reviews| Index: pkg/utf/lib/utf16.dart |
| diff --git a/pkg/utf/lib/utf16.dart b/pkg/utf/lib/utf16.dart |
| index 7de9e616581296979d4f7e5f8e094a841012a4bd..438c6781a74fa78ba0944e683e5c7c3fd308c1f8 100644 |
| --- a/pkg/utf/lib/utf16.dart |
| +++ b/pkg/utf/lib/utf16.dart |
| @@ -4,6 +4,167 @@ |
| part of utf; |
| +// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). |
|
floitsch
2013/11/18 17:08:17
Whole section copied verbatim.
Lasse Reichstein Nielsen
2013/11/19 07:42:38
I don't think I want to add anything new to the ut
floitsch
2013/11/19 10:40:32
I don't agree.
The utf-package contains much more
Lasse Reichstein Nielsen
2013/11/19 12:25:43
From the same package - in that case, LGTM.
|
| +/** |
| + * Provide a list of Unicode codepoints for a given string. |
| + * |
| + * Reads the 16-bit code units of [str] through `str.codeUnits` and returns |
| + * the result of decoding them with [_utf16CodeUnitsToCodepoints], using that |
| + * function's default offset, length and replacement codepoint. |
| + */ |
| +List<int> stringToCodepoints(String str) { |
| + // Note: str.codeUnits gives us 16-bit code units on all Dart implementations. |
| + // So we need to convert. |
| + return _utf16CodeUnitsToCodepoints(str.codeUnits); |

Lasse Reichstein Nielsen
2013/11/19 07:42:38
If we keep it (and I don't think we should - if it
floitsch
2013/11/19 10:40:32
This was code that already existed in the package.
|
| +} |
| + |
| +/** |
| + * Generate a string from the provided Unicode codepoints. |
| + * |
| + * Forwards [codepoints] unchanged to [String.fromCharCodes] and returns the |
| + * resulting string. |
| + * |
| + * *Deprecated* Use [String.fromCharCodes] instead. |

Lasse Reichstein Nielsen
2013/11/19 07:42:38
Ditto - remove this. Definitely remove the "Deprec
floitsch
2013/11/19 10:40:32
Not in this CL.
|
| + */ |
| +String codepointsToString(List<int> codepoints) { |
| + return new String.fromCharCodes(codepoints); |
| +} |
| + |
| +/** |
| + * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units. |
| + * |
| + * The optional [replacementCodepoint] overrides the default Unicode |
| + * replacement character (UNICODE_REPLACEMENT_CHARACTER_CODEPOINT) that is |
| + * produced for invalid input. Set [replacementCodepoint] to null to throw an |
| + * [ArgumentError] rather than replace the bad value. |
| + * |
| + * [moveNext] resets [current] to null, returns false once the underlying |
| + * code-unit iterator is exhausted, and otherwise produces one codepoint: |
| + * |
| + * * a negative value is invalid; |
| + * * a value below UNICODE_UTF16_RESERVED_LO, or above |
| + * UNICODE_UTF16_RESERVED_HI and at most UNICODE_PLANE_ONE_MAX, is passed |
| + * through unchanged; |
| + * * any other value below UNICODE_UTF16_SURROGATE_UNIT_1_BASE is combined |
| + * with the next code unit when that unit lies between |
| + * UNICODE_UTF16_SURROGATE_UNIT_1_BASE and UNICODE_UTF16_RESERVED_HI |
| + * (inclusive); if there is no next unit, or it is outside that range, the |
| + * input is invalid; |
| + * * everything else is invalid. |
| + * |
| + * Invalid input yields [replacementCodepoint], or throws if it is null. |
| + * |
| + * NOTE(review): after a failed pairing the underlying iterator is only backed |
| + * up when the second unit is itself at least |
| + * UNICODE_UTF16_SURROGATE_UNIT_0_BASE and below |
| + * UNICODE_UTF16_SURROGATE_UNIT_1_BASE. Any other second unit has already been |
| + * consumed and looks like it is dropped from the output - confirm that this |
| + * is intended. |
| + */ |
| +class Utf16CodeUnitDecoder implements Iterator<int> { |

Lasse Reichstein Nielsen
2013/11/19 07:42:38
Do we have a way to use a Converter to go from inp
floitsch
2013/11/19 10:40:32
Again. this is code that already existed. Not chan
|
| + final _ListRangeIterator utf16CodeUnitIterator; |
| + final int replacementCodepoint; |
| + int _current = null; |
| + |
| + Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length, |
| + int this.replacementCodepoint = |
| + UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
| + utf16CodeUnitIterator = |
| + (new _ListRange(utf16CodeUnits, offset, length)).iterator; |
| + |
| + Utf16CodeUnitDecoder.fromListRangeIterator( |
| + _ListRangeIterator this.utf16CodeUnitIterator, |
| + int this.replacementCodepoint); |
| + |
| + Iterator<int> get iterator => this; |
| + |
| + int get current => _current; |
| + |
| + bool moveNext() { |
| + _current = null; |
| + if (!utf16CodeUnitIterator.moveNext()) return false; |
| + |
| + int value = utf16CodeUnitIterator.current; |
| + if (value < 0) { |
| + if (replacementCodepoint != null) { |
| + _current = replacementCodepoint; |
| + } else { |
| + throw new ArgumentError( |
| + "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); |
| + } |
| + } else if (value < UNICODE_UTF16_RESERVED_LO || |
| + (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { |
| + // transfer directly |
| + _current = value; |
| + } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE && |
| + utf16CodeUnitIterator.moveNext()) { |
| + // merge surrogate pair |
| + int nextValue = utf16CodeUnitIterator.current; |
| + if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE && |
| + nextValue <= UNICODE_UTF16_RESERVED_HI) { |
| + value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10; |
| + value += UNICODE_UTF16_OFFSET + |
| + (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE); |
| + _current = value; |
| + } else { |
| + if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE && |
| + nextValue < UNICODE_UTF16_SURROGATE_UNIT_1_BASE) { |
| + utf16CodeUnitIterator.backup(); |
| + } |
| + if (replacementCodepoint != null) { |
| + _current = replacementCodepoint; |
| + } else { |
| + throw new ArgumentError( |
| + "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); |
| + } |
| + } |
| + } else if (replacementCodepoint != null) { |
| + _current = replacementCodepoint; |
| + } else { |
| + throw new ArgumentError( |
| + "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); |
| + } |
| + return true; |
| + } |
| +} |
| + |
| +/** |
| + * Encode code points as UTF16 code units. |
| + * |
| + * Walks the [offset]/[length] range of [codepoints] twice: the first pass |
| + * counts how many code units are needed, the second pass fills a list of |
| + * exactly that size, which is then returned. |
| + * |
| + * * A value that is at least 0 and below UNICODE_UTF16_RESERVED_LO, or above |
| + * UNICODE_UTF16_RESERVED_HI and at most UNICODE_PLANE_ONE_MAX, is stored |
| + * as a single code unit. |
| + * * A value above UNICODE_PLANE_ONE_MAX and at most UNICODE_VALID_RANGE_MAX |
| + * is stored as two code units (a surrogate pair). |
| + * * Any other value is replaced by [replacementCodepoint]; if |
| + * [replacementCodepoint] is null an [ArgumentError] is thrown instead. |
| + * |
| + * NOTE(review): the counting pass reserves one code unit for an invalid |
| + * value and [replacementCodepoint] is stored as-is, without being encoded. |
| + * Presumably it is expected to fit in a single code unit - confirm. |
| + */ |
| +List<int> _codepointsToUtf16CodeUnits( |

Lasse Reichstein Nielsen
2013/11/19 07:42:38
If this isn't used, remove it. If it is, consider
floitsch
2013/11/19 10:40:32
ditto.
|
| + List<int> codepoints, |
| + [int offset = 0, |
| + int length, |
| + int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| + |
| + _ListRange listRange = new _ListRange(codepoints, offset, length); |
| + int encodedLength = 0; |
| + for (int value in listRange) { |
| + if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || |
| + (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { |
| + encodedLength++; |
| + } else if (value > UNICODE_PLANE_ONE_MAX && |
| + value <= UNICODE_VALID_RANGE_MAX) { |
| + encodedLength += 2; |
| + } else { |
| + encodedLength++; |
| + } |
| + } |
| + |
| + List<int> codeUnitsBuffer = new List<int>(encodedLength); |
| + int j = 0; |
| + for (int value in listRange) { |
| + if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || |
| + (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { |
| + codeUnitsBuffer[j++] = value; |
| + } else if (value > UNICODE_PLANE_ONE_MAX && |
| + value <= UNICODE_VALID_RANGE_MAX) { |
| + int base = value - UNICODE_UTF16_OFFSET; |
| + codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE + |
| + ((base & UNICODE_UTF16_HI_MASK) >> 10); |
| + codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE + |
| + (base & UNICODE_UTF16_LO_MASK); |
| + } else if (replacementCodepoint != null) { |
| + codeUnitsBuffer[j++] = replacementCodepoint; |
| + } else { |
| + throw new ArgumentError("Invalid encoding"); |
| + } |
| + } |
| + return codeUnitsBuffer; |
| +} |
| + |
| +/** |
| + * Decodes the utf16 codeunits to codepoints. |
| + * |
| + * Builds a [_ListRange] iterator over the [offset]/[length] range of |
| + * [utf16CodeUnits] and decodes it with a [Utf16CodeUnitDecoder] created with |
| + * [replacementCodepoint]. |
| + * |
| + * The result list is first allocated with `source.remaining` entries |
| + * (presumably the number of code units still to be read, which would make it |
| + * an upper bound on the number of codepoints - confirm against |
| + * _ListRangeIterator). If decoding produced fewer codepoints than that, the |
| + * filled prefix is copied into a new list of exactly that length, which is |
| + * returned instead. |
| + */ |
| +List<int> _utf16CodeUnitsToCodepoints( |

Lasse Reichstein Nielsen
2013/11/19 07:42:38
If not used, remove.
If used, consider rewriting a
floitsch
2013/11/19 10:40:32
ditto.
|
| + List<int> utf16CodeUnits, [int offset = 0, int length, |
| + int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| + _ListRangeIterator source = |
| + (new _ListRange(utf16CodeUnits, offset, length)).iterator; |
| + Utf16CodeUnitDecoder decoder = new Utf16CodeUnitDecoder |
| + .fromListRangeIterator(source, replacementCodepoint); |
| + List<int> codepoints = new List<int>(source.remaining); |
| + int i = 0; |
| + while (decoder.moveNext()) { |
| + codepoints[i++] = decoder.current; |
| + } |
| + if (i == codepoints.length) { |
| + return codepoints; |
| + } else { |
| + List<int> codepointTrunc = new List<int>(i); |
| + codepointTrunc.setRange(0, i, codepoints); |
| + return codepointTrunc; |
| + } |
| +} |
| + |
| /** |
| * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert |
| * as much of the input as needed. Determines the byte order from the BOM, |
| @@ -256,7 +417,12 @@ abstract class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator { |
| bool moveNext() { |
| _current = null; |
| - if (utf16EncodedBytesIterator.remaining < 2) { |
| + int remaining = utf16EncodedBytesIterator.remaining; |
| + if (remaining == 0) { |
| + _current = null; |
| + return false; |
| + } |
| + if (remaining == 1) { |
| utf16EncodedBytesIterator.moveNext(); |
| if (replacementCodepoint != null) { |
| _current = replacementCodepoint; |
| @@ -265,10 +431,9 @@ abstract class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator { |
| throw new ArgumentError( |
| "Invalid UTF16 at ${utf16EncodedBytesIterator.position}"); |
| } |
| - } else { |
| - _current = decode(); |
| - return true; |
| } |
| + _current = decode(); |
| + return true; |
| } |
| int get position => utf16EncodedBytesIterator.position ~/ 2; |