Index: pkg/utf/lib/utf16.dart |
diff --git a/pkg/utf/lib/utf16.dart b/pkg/utf/lib/utf16.dart |
index 7de9e616581296979d4f7e5f8e094a841012a4bd..438c6781a74fa78ba0944e683e5c7c3fd308c1f8 100644 |
--- a/pkg/utf/lib/utf16.dart |
+++ b/pkg/utf/lib/utf16.dart |
@@ -4,6 +4,167 @@ |
part of utf; |
+// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). |
floitsch
2013/11/18 17:08:17
Whole section copied verbatim.
Lasse Reichstein Nielsen
2013/11/19 07:42:38
I don't think I want to add anything new to the ut
floitsch
2013/11/19 10:40:32
I don't agree.
The utf-package contains much more
Lasse Reichstein Nielsen
2013/11/19 12:25:43
From the same package - in that case, LGTM.
|
+/** |
+ * Provide a list of Unicode codepoints for a given string. |
+ * |
+ * The UTF-16 code units of [str] are handed to _utf16CodeUnitsToCodepoints |
+ * without an explicit offset, length or replacement codepoint, so that |
+ * function's defaults apply. |
+ */ |
+List<int> stringToCodepoints(String str) { |
+ // Note: str.codeUnits gives us 16-bit code units on all Dart implementations. |
+ // So we need to convert. |
+ return _utf16CodeUnitsToCodepoints(str.codeUnits); |
Lasse Reichstein Nielsen
2013/11/19 07:42:38
If we keep it (and I don't think we should - if it
floitsch
2013/11/19 10:40:32
This was code that already existed in the package.
|
+} |
+ |
+/** |
+ * Generate a string from the provided Unicode codepoints. |
+ * |
+ * *Deprecated* Use [String.fromCharCodes] instead. |
Lasse Reichstein Nielsen
2013/11/19 07:42:38
Ditto - remove this. Definitely remove the "Deprec
floitsch
2013/11/19 10:40:32
Not in this CL.
|
+ */ |
+// Thin wrapper: builds the string directly from the given codepoints. |
+String codepointsToString(List<int> codepoints) => |
+ new String.fromCharCodes(codepoints); |
+ |
+/** |
+ * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units. |
+ * |
+ * Each successful call to [moveNext] consumes one or two code units from the |
+ * underlying iterator and exposes the resulting codepoint, or the |
+ * replacement codepoint for malformed input, through [current]. |
+ * |
+ * The parameters can override the default Unicode replacement character. Set |
+ * [replacementCodepoint] to null to throw an ArgumentError |
+ * rather than replace the bad value. |
+ */ |
+class Utf16CodeUnitDecoder implements Iterator<int> { |
Lasse Reichstein Nielsen
2013/11/19 07:42:38
Do we have a way to use a Converter to go from inp
floitsch
2013/11/19 10:40:32
Again. this is code that already existed. Not chan
|
+ // Source of the UTF-16 code units that are decoded. |
+ final _ListRangeIterator utf16CodeUnitIterator; |
+ // Codepoint reported for malformed input; null makes moveNext throw instead. |
+ final int replacementCodepoint; |
+ // Most recently decoded codepoint; reset to null at the start of moveNext. |
+ int _current = null; |
+ |
+ // Wraps utf16CodeUnits, offset and length in a _ListRange and decodes from |
+ // that range's iterator. |
+ Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length, |
+ int this.replacementCodepoint = |
+ UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
+ utf16CodeUnitIterator = |
+ (new _ListRange(utf16CodeUnits, offset, length)).iterator; |
+ |
+ // Decodes from an already constructed iterator. Unlike the default |
+ // constructor, replacementCodepoint has no default here and must be passed. |
+ Utf16CodeUnitDecoder.fromListRangeIterator( |
+ _ListRangeIterator this.utf16CodeUnitIterator, |
+ int this.replacementCodepoint); |
+ |
+ // Returns this decoder itself; no new iterator is created. |
+ Iterator<int> get iterator => this; |
+ |
+ // The codepoint produced by the last moveNext call, or null if moveNext |
+ // has not been called yet or returned false. |
+ int get current => _current; |
+ |
+ // Advances to the next codepoint. Returns false once the source iterator is |
+ // exhausted and true otherwise. Throws an ArgumentError on malformed input |
+ // when replacementCodepoint is null. |
+ bool moveNext() { |
+ _current = null; |
+ if (!utf16CodeUnitIterator.moveNext()) return false; |
+ |
+ int value = utf16CodeUnitIterator.current; |
+ if (value < 0) { |
+ // Negative values are rejected as invalid code units. |
+ if (replacementCodepoint != null) { |
+ _current = replacementCodepoint; |
+ } else { |
+ throw new ArgumentError( |
+ "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); |
+ } |
+ } else if (value < UNICODE_UTF16_RESERVED_LO || |
+ (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { |
+ // Outside the reserved range and not above UNICODE_PLANE_ONE_MAX: the |
+ // code unit is the codepoint, so transfer it directly. |
+ _current = value; |
+ } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE && |
+ utf16CodeUnitIterator.moveNext()) { |
+ // Reserved value below the second-surrogate base with a following unit |
+ // available (now consumed): try to merge a surrogate pair. |
+ int nextValue = utf16CodeUnitIterator.current; |
+ if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE && |
+ nextValue <= UNICODE_UTF16_RESERVED_HI) { |
+ // Valid pair: the first unit supplies the bits shifted left by 10, the |
+ // second unit the low bits, and UNICODE_UTF16_OFFSET is added on top. |
+ value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10; |
+ value += UNICODE_UTF16_OFFSET + |
+ (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE); |
+ _current = value; |
+ } else { |
+ // The following unit is not a valid second surrogate. If it falls in |
+ // the first-surrogate range, back the iterator up - presumably so the |
+ // next moveNext call decodes it (backup() is defined elsewhere). |
+ if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE && |
+ nextValue < UNICODE_UTF16_SURROGATE_UNIT_1_BASE) { |
+ utf16CodeUnitIterator.backup(); |
+ } |
+ if (replacementCodepoint != null) { |
+ _current = replacementCodepoint; |
+ } else { |
+ throw new ArgumentError( |
+ "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); |
+ } |
+ } |
+ } else if (replacementCodepoint != null) { |
+ // Everything else is malformed: a reserved value that cannot start a |
+ // pair or has no following unit, or a value above UNICODE_PLANE_ONE_MAX. |
+ _current = replacementCodepoint; |
+ } else { |
+ throw new ArgumentError( |
+ "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); |
+ } |
+ return true; |
+ } |
+} |
+ |
+/** |
+ * Encode code points as UTF-16 code units. |
+ * |
+ * The codepoints are taken from a _ListRange built from [codepoints], |
+ * [offset] and [length]. Values that fit in a single code unit are copied, |
+ * values above UNICODE_PLANE_ONE_MAX up to UNICODE_VALID_RANGE_MAX become two |
+ * code units, and any other value is replaced by [replacementCodepoint]. |
+ * Throws an ArgumentError for such a value if [replacementCodepoint] is null. |
+ */ |
+List<int> _codepointsToUtf16CodeUnits( |
Lasse Reichstein Nielsen
2013/11/19 07:42:38
If this isn't used, remove it. If it is, consider
floitsch
2013/11/19 10:40:32
ditto.
|
+ List<int> codepoints, |
+ [int offset = 0, |
+ int length, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ |
+ _ListRange listRange = new _ListRange(codepoints, offset, length); |
+ // First pass: count the code units needed so the buffer can be allocated |
+ // at its final size. |
+ int encodedLength = 0; |
+ for (int value in listRange) { |
+ if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || |
+ (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { |
+ encodedLength++; |
+ } else if (value > UNICODE_PLANE_ONE_MAX && |
+ value <= UNICODE_VALID_RANGE_MAX) { |
+ encodedLength += 2; |
+ } else { |
+ // Invalid value: one slot is reserved for the replacement codepoint. |
+ encodedLength++; |
+ } |
+ } |
+ |
+ // Second pass: fill the buffer, using the same classification as above. |
+ List<int> codeUnitsBuffer = new List<int>(encodedLength); |
+ int j = 0; |
+ for (int value in listRange) { |
+ if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || |
+ (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { |
+ codeUnitsBuffer[j++] = value; |
+ } else if (value > UNICODE_PLANE_ONE_MAX && |
+ value <= UNICODE_VALID_RANGE_MAX) { |
+ // Split into two code units: the masked high bits, shifted right by 10, |
+ // go on the first base and the masked low bits on the second base. |
+ int base = value - UNICODE_UTF16_OFFSET; |
+ codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE + |
+ ((base & UNICODE_UTF16_HI_MASK) >> 10); |
+ codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE + |
+ (base & UNICODE_UTF16_LO_MASK); |
+ } else if (replacementCodepoint != null) { |
+ // NOTE(review): the replacement is stored as a single entry without |
+ // being encoded itself - confirm callers only pass values that fit in |
+ // one code unit. |
+ codeUnitsBuffer[j++] = replacementCodepoint; |
+ } else { |
+ throw new ArgumentError("Invalid encoding"); |
+ } |
+ } |
+ return codeUnitsBuffer; |
+} |
+ |
+/** |
+ * Decodes the UTF-16 code units to codepoints. |
+ * |
+ * The code units are read from a _ListRange built from [utf16CodeUnits], |
+ * [offset] and [length], and decoded by a [Utf16CodeUnitDecoder] that is |
+ * given [replacementCodepoint]. Returns a list holding exactly the decoded |
+ * codepoints. |
+ */ |
+List<int> _utf16CodeUnitsToCodepoints( |
Lasse Reichstein Nielsen
2013/11/19 07:42:38
If not used, remove.
If used, consider rewriting a
floitsch
2013/11/19 10:40:32
ditto.
|
+ List<int> utf16CodeUnits, [int offset = 0, int length, |
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+ _ListRangeIterator source = |
+ (new _ListRange(utf16CodeUnits, offset, length)).iterator; |
+ Utf16CodeUnitDecoder decoder = new Utf16CodeUnitDecoder |
+ .fromListRangeIterator(source, replacementCodepoint); |
+ // Size the buffer from source.remaining - presumably the number of code |
+ // units still to be read (defined elsewhere), which would be an upper bound |
+ // because a surrogate pair turns two code units into one codepoint. |
+ List<int> codepoints = new List<int>(source.remaining); |
+ int i = 0; |
+ while (decoder.moveNext()) { |
+ codepoints[i++] = decoder.current; |
+ } |
+ if (i == codepoints.length) { |
+ return codepoints; |
+ } else { |
+ // Fewer codepoints than buffer slots: copy the first i entries into a |
+ // list of the exact size. |
+ List<int> codepointTrunc = new List<int>(i); |
+ codepointTrunc.setRange(0, i, codepoints); |
+ return codepointTrunc; |
+ } |
+} |
+ |
/** |
* Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert |
* as much of the input as needed. Determines the byte order from the BOM, |
@@ -256,7 +417,12 @@ abstract class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator { |
bool moveNext() { |
_current = null; |
- if (utf16EncodedBytesIterator.remaining < 2) { |
+ int remaining = utf16EncodedBytesIterator.remaining; |
+ if (remaining == 0) { |
+ _current = null; |
+ return false; |
+ } |
+ if (remaining == 1) { |
utf16EncodedBytesIterator.moveNext(); |
if (replacementCodepoint != null) { |
_current = replacementCodepoint; |
@@ -265,10 +431,9 @@ abstract class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator { |
throw new ArgumentError( |
"Invalid UTF16 at ${utf16EncodedBytesIterator.position}"); |
} |
- } else { |
- _current = decode(); |
- return true; |
} |
+ _current = decode(); |
+ return true; |
} |
int get position => utf16EncodedBytesIterator.position ~/ 2; |