Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 part of utf; | 5 part of utf; |
| 6 | 6 |
| 7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). | |
|
floitsch
2013/11/18 17:08:17
Whole section copied verbatim.
Lasse Reichstein Nielsen
2013/11/19 07:42:38
I don't think I want to add anything new to the ut
floitsch
2013/11/19 10:40:32
I don't agree.
The utf-package contains much more
Lasse Reichstein Nielsen
2013/11/19 12:25:43
From the same package - in that case, LGTM.
| |
| 8 /** | |
| 9 * Provide a list of Unicode codepoints for a given string. | |
| 10 */ | |
| 11 List<int> stringToCodepoints(String str) { | |
| 12 // Note: str.codeUnits gives us 16-bit code units on all Dart implementations. | |
| 13 // So we need to convert. | |
| 14 return _utf16CodeUnitsToCodepoints(str.codeUnits); | |
|
Lasse Reichstein Nielsen
2013/11/19 07:42:38
If we keep it (and I don't think we should - if it
floitsch
2013/11/19 10:40:32
This was code that already existed in the package.
| |
| 15 } | |
| 16 | |
| 17 /** | |
| 18 * Generate a string from the provided Unicode codepoints. | |
| 19 * | |
| 20 * *Deprecated* Use [String.fromCharCodes] instead. | |
|
Lasse Reichstein Nielsen
2013/11/19 07:42:38
Ditto - remove this. Definitely remove the "Deprec
floitsch
2013/11/19 10:40:32
Not in this CL.
| |
| 21 */ | |
| 22 String codepointsToString(List<int> codepoints) { | |
| 23 return new String.fromCharCodes(codepoints); | |
| 24 } | |
| 25 | |
| 26 /** | |
| 27 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units. | |
| 28 * The parameters can override the default Unicode replacement character. Set | |
| 29 * the [replacementCodepoint] to null to throw an ArgumentError | |
| 30 * rather than replace the bad value. | |
| 31 */ | |
| 32 class Utf16CodeUnitDecoder implements Iterator<int> { | |
|
Lasse Reichstein Nielsen
2013/11/19 07:42:38
Do we have a way to use a Converter to go from inp
floitsch
2013/11/19 10:40:32
Again, this is code that already existed. Not chan
| |
| 33 final _ListRangeIterator utf16CodeUnitIterator; | |
| 34 final int replacementCodepoint; | |
| 35 int _current = null; | |
| 36 | |
| 37 Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length, | |
| 38 int this.replacementCodepoint = | |
| 39 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | |
| 40 utf16CodeUnitIterator = | |
| 41 (new _ListRange(utf16CodeUnits, offset, length)).iterator; | |
| 42 | |
| 43 Utf16CodeUnitDecoder.fromListRangeIterator( | |
| 44 _ListRangeIterator this.utf16CodeUnitIterator, | |
| 45 int this.replacementCodepoint); | |
| 46 | |
| 47 Iterator<int> get iterator => this; | |
| 48 | |
| 49 int get current => _current; | |
| 50 | |
| 51 bool moveNext() { | |
| 52 _current = null; | |
| 53 if (!utf16CodeUnitIterator.moveNext()) return false; | |
| 54 | |
| 55 int value = utf16CodeUnitIterator.current; | |
| 56 if (value < 0) { | |
| 57 if (replacementCodepoint != null) { | |
| 58 _current = replacementCodepoint; | |
| 59 } else { | |
| 60 throw new ArgumentError( | |
| 61 "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); | |
| 62 } | |
| 63 } else if (value < UNICODE_UTF16_RESERVED_LO || | |
| 64 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { | |
| 65 // transfer directly | |
| 66 _current = value; | |
| 67 } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE && | |
| 68 utf16CodeUnitIterator.moveNext()) { | |
| 69 // merge surrogate pair | |
| 70 int nextValue = utf16CodeUnitIterator.current; | |
| 71 if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE && | |
| 72 nextValue <= UNICODE_UTF16_RESERVED_HI) { | |
| 73 value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10; | |
| 74 value += UNICODE_UTF16_OFFSET + | |
| 75 (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE); | |
| 76 _current = value; | |
| 77 } else { | |
| 78 if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE && | |
| 79 nextValue < UNICODE_UTF16_SURROGATE_UNIT_1_BASE) { | |
| 80 utf16CodeUnitIterator.backup(); | |
| 81 } | |
| 82 if (replacementCodepoint != null) { | |
| 83 _current = replacementCodepoint; | |
| 84 } else { | |
| 85 throw new ArgumentError( | |
| 86 "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); | |
| 87 } | |
| 88 } | |
| 89 } else if (replacementCodepoint != null) { | |
| 90 _current = replacementCodepoint; | |
| 91 } else { | |
| 92 throw new ArgumentError( | |
| 93 "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); | |
| 94 } | |
| 95 return true; | |
| 96 } | |
| 97 } | |
| 98 | |
| 99 /** | |
| 100 * Encode code points as UTF16 code units. | |
| 101 */ | |
| 102 List<int> _codepointsToUtf16CodeUnits( | |
|
Lasse Reichstein Nielsen
2013/11/19 07:42:38
If this isn't used, remove it. If it is, consider
floitsch
2013/11/19 10:40:32
ditto.
| |
| 103 List<int> codepoints, | |
| 104 [int offset = 0, | |
| 105 int length, | |
| 106 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | |
| 107 | |
| 108 _ListRange listRange = new _ListRange(codepoints, offset, length); | |
| 109 int encodedLength = 0; | |
| 110 for (int value in listRange) { | |
| 111 if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || | |
| 112 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { | |
| 113 encodedLength++; | |
| 114 } else if (value > UNICODE_PLANE_ONE_MAX && | |
| 115 value <= UNICODE_VALID_RANGE_MAX) { | |
| 116 encodedLength += 2; | |
| 117 } else { | |
| 118 encodedLength++; | |
| 119 } | |
| 120 } | |
| 121 | |
| 122 List<int> codeUnitsBuffer = new List<int>(encodedLength); | |
| 123 int j = 0; | |
| 124 for (int value in listRange) { | |
| 125 if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || | |
| 126 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { | |
| 127 codeUnitsBuffer[j++] = value; | |
| 128 } else if (value > UNICODE_PLANE_ONE_MAX && | |
| 129 value <= UNICODE_VALID_RANGE_MAX) { | |
| 130 int base = value - UNICODE_UTF16_OFFSET; | |
| 131 codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE + | |
| 132 ((base & UNICODE_UTF16_HI_MASK) >> 10); | |
| 133 codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE + | |
| 134 (base & UNICODE_UTF16_LO_MASK); | |
| 135 } else if (replacementCodepoint != null) { | |
| 136 codeUnitsBuffer[j++] = replacementCodepoint; | |
| 137 } else { | |
| 138 throw new ArgumentError("Invalid encoding"); | |
| 139 } | |
| 140 } | |
| 141 return codeUnitsBuffer; | |
| 142 } | |
| 143 | |
| 144 /** | |
| 145 * Decodes the utf16 codeunits to codepoints. | |
| 146 */ | |
| 147 List<int> _utf16CodeUnitsToCodepoints( | |
|
Lasse Reichstein Nielsen
2013/11/19 07:42:38
If not used, remove.
If used, consider rewriting a
floitsch
2013/11/19 10:40:32
ditto.
| |
| 148 List<int> utf16CodeUnits, [int offset = 0, int length, | |
| 149 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | |
| 150 _ListRangeIterator source = | |
| 151 (new _ListRange(utf16CodeUnits, offset, length)).iterator; | |
| 152 Utf16CodeUnitDecoder decoder = new Utf16CodeUnitDecoder | |
| 153 .fromListRangeIterator(source, replacementCodepoint); | |
| 154 List<int> codepoints = new List<int>(source.remaining); | |
| 155 int i = 0; | |
| 156 while (decoder.moveNext()) { | |
| 157 codepoints[i++] = decoder.current; | |
| 158 } | |
| 159 if (i == codepoints.length) { | |
| 160 return codepoints; | |
| 161 } else { | |
| 162 List<int> codepointTrunc = new List<int>(i); | |
| 163 codepointTrunc.setRange(0, i, codepoints); | |
| 164 return codepointTrunc; | |
| 165 } | |
| 166 } | |
| 167 | |
| 7 /** | 168 /** |
| 8 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can convert only | 169 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can convert only |
| 9 * as much of the input as needed. Determines the byte order from the BOM, | 170 * as much of the input as needed. Determines the byte order from the BOM, |
| 10 * or uses big-endian as a default. This method always strips a leading BOM. | 171 * or uses big-endian as a default. This method always strips a leading BOM. |
| 11 * Set the [replacementCodepoint] to null to throw an ArgumentError | 172 * Set the [replacementCodepoint] to null to throw an ArgumentError |
| 12 * rather than replace the bad value. The default value for | 173 * rather than replace the bad value. The default value for |
| 13 * [replacementCodepoint] is U+FFFD. | 174 * [replacementCodepoint] is U+FFFD. |
| 14 */ | 175 */ |
| 15 IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0, | 176 IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0, |
| 16 int length, int replacementCodepoint = | 177 int length, int replacementCodepoint = |
| (...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 249 List<int> truncCodeunits = new List<int>(i); | 410 List<int> truncCodeunits = new List<int>(i); |
| 250 truncCodeunits.setRange(0, i, codeunits); | 411 truncCodeunits.setRange(0, i, codeunits); |
| 251 return truncCodeunits; | 412 return truncCodeunits; |
| 252 } | 413 } |
| 253 } | 414 } |
| 254 | 415 |
| 255 int get current => _current; | 416 int get current => _current; |
| 256 | 417 |
| 257 bool moveNext() { | 418 bool moveNext() { |
| 258 _current = null; | 419 _current = null; |
| 259 if (utf16EncodedBytesIterator.remaining < 2) { | 420 int remaining = utf16EncodedBytesIterator.remaining; |
| 421 if (remaining == 0) { | |
| 422 _current = null; | |
| 423 return false; | |
| 424 } | |
| 425 if (remaining == 1) { | |
| 260 utf16EncodedBytesIterator.moveNext(); | 426 utf16EncodedBytesIterator.moveNext(); |
| 261 if (replacementCodepoint != null) { | 427 if (replacementCodepoint != null) { |
| 262 _current = replacementCodepoint; | 428 _current = replacementCodepoint; |
| 263 return true; | 429 return true; |
| 264 } else { | 430 } else { |
| 265 throw new ArgumentError( | 431 throw new ArgumentError( |
| 266 "Invalid UTF16 at ${utf16EncodedBytesIterator.position}"); | 432 "Invalid UTF16 at ${utf16EncodedBytesIterator.position}"); |
| 267 } | 433 } |
| 268 } else { | |
| 269 _current = decode(); | |
| 270 return true; | |
| 271 } | 434 } |
| 435 _current = decode(); | |
| 436 return true; | |
| 272 } | 437 } |
| 273 | 438 |
| 274 int get position => utf16EncodedBytesIterator.position ~/ 2; | 439 int get position => utf16EncodedBytesIterator.position ~/ 2; |
| 275 | 440 |
| 276 void backup([int by = 1]) { | 441 void backup([int by = 1]) { |
| 277 utf16EncodedBytesIterator.backup(2 * by); | 442 utf16EncodedBytesIterator.backup(2 * by); |
| 278 } | 443 } |
| 279 | 444 |
| 280 int get remaining => (utf16EncodedBytesIterator.remaining + 1) ~/ 2; | 445 int get remaining => (utf16EncodedBytesIterator.remaining + 1) ~/ 2; |
| 281 | 446 |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 328 } | 493 } |
| 329 | 494 |
| 330 int decode() { | 495 int decode() { |
| 331 utf16EncodedBytesIterator.moveNext(); | 496 utf16EncodedBytesIterator.moveNext(); |
| 332 int lo = utf16EncodedBytesIterator.current; | 497 int lo = utf16EncodedBytesIterator.current; |
| 333 utf16EncodedBytesIterator.moveNext(); | 498 utf16EncodedBytesIterator.moveNext(); |
| 334 int hi = utf16EncodedBytesIterator.current; | 499 int hi = utf16EncodedBytesIterator.current; |
| 335 return (hi << 8) + lo; | 500 return (hi << 8) + lo; |
| 336 } | 501 } |
| 337 } | 502 } |
| OLD | NEW |