OLD | NEW |
---|---|
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of utf; | 5 part of utf; |
6 | 6 |
7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). | |
floitsch
2013/11/18 17:08:17
Whole section copied verbatim.
Lasse Reichstein Nielsen
2013/11/19 07:42:38
I don't think I want to add anything new to the ut
floitsch
2013/11/19 10:40:32
I don't agree.
The utf-package contains much more
Lasse Reichstein Nielsen
2013/11/19 12:25:43
From the same package - in that case, LGTM.
| |
/**
 * Returns the Unicode codepoints of [str] as a list.
 *
 * The string is read through [String.codeUnits] and the resulting UTF-16 code
 * units are decoded by [_utf16CodeUnitsToCodepoints] using that function's
 * default offset, length and replacement codepoint.
 */
List<int> stringToCodepoints(String str) {
  // str.codeUnits yields 16-bit code units on every Dart implementation, so
  // the units still have to be decoded into codepoints.
  List<int> utf16Units = str.codeUnits;
  List<int> decoded = _utf16CodeUnitsToCodepoints(utf16Units);
  return decoded;
}
16 | |
/**
 * Builds a string from the given Unicode [codepoints].
 *
 * *Deprecated* Use [String.fromCharCodes] instead; this function only
 * forwards to it.
 */
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);
25 | |
/**
 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.
 *
 * A well-formed surrogate pair is merged into a single codepoint. A code unit
 * that cannot be decoded (a negative value, a value outside the ranges
 * accepted for direct transfer, or an unpaired surrogate) is reported as
 * [replacementCodepoint]. Set [replacementCodepoint] to null to throw an
 * [ArgumentError] rather than replace the bad value.
 *
 * An unpaired lead surrogate never swallows the code unit that follows it:
 * that unit is decoded on the next call to [moveNext].
 */
class Utf16CodeUnitDecoder implements Iterator<int> {
  /** Source of the UTF-16 code units being decoded. */
  final _ListRangeIterator utf16CodeUnitIterator;

  /** Codepoint emitted for undecodable input, or null to throw instead. */
  final int replacementCodepoint;

  int _current = null;

  /**
   * Decodes the range of [utf16CodeUnits] selected by [offset] and [length].
   */
  Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
      int this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf16CodeUnitIterator =
          (new _ListRange(utf16CodeUnits, offset, length)).iterator;

  /**
   * Decodes the code units produced by an existing [_ListRangeIterator].
   */
  Utf16CodeUnitDecoder.fromListRangeIterator(
      _ListRangeIterator this.utf16CodeUnitIterator,
      int this.replacementCodepoint);

  Iterator<int> get iterator => this;

  /** The codepoint produced by the last successful [moveNext], or null. */
  int get current => _current;

  /**
   * Advances to the next codepoint.
   *
   * Returns false once the underlying code units are exhausted. Throws an
   * [ArgumentError] on undecodable input when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    if (!utf16CodeUnitIterator.moveNext()) return false;

    int value = utf16CodeUnitIterator.current;
    if (value < 0) {
      _replaceOrThrow();
    } else if (value < UNICODE_UTF16_RESERVED_LO ||
        (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {
      // transfer directly
      _current = value;
    } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
        utf16CodeUnitIterator.moveNext()) {
      // [value] is a lead surrogate and a following code unit exists.
      int nextValue = utf16CodeUnitIterator.current;
      if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
          nextValue <= UNICODE_UTF16_RESERVED_HI) {
        // merge surrogate pair
        value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10;
        value += UNICODE_UTF16_OFFSET +
            (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);
        _current = value;
      } else {
        // The lead surrogate is unpaired. Whatever follows it (another lead
        // surrogate, an ordinary code unit, or an invalid value) has not been
        // decoded yet, so step back and let the next moveNext() handle it.
        // Consuming it here would silently drop a valid character.
        utf16CodeUnitIterator.backup();
        _replaceOrThrow();
      }
    } else {
      // Lone trail surrogate, lead surrogate at end of input, or a value
      // above UNICODE_PLANE_ONE_MAX.
      _replaceOrThrow();
    }
    return true;
  }

  /**
   * Sets the current value to [replacementCodepoint], or throws an
   * [ArgumentError] naming the iterator position when no replacement is
   * configured.
   */
  void _replaceOrThrow() {
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
    }
    _current = replacementCodepoint;
  }
}
98 | |
/**
 * Encodes a range of Unicode codepoints as UTF-16 code units.
 *
 * The range is taken from [codepoints] using [offset] and [length]. A
 * codepoint that fits in one code unit is copied unchanged; a codepoint above
 * [UNICODE_PLANE_ONE_MAX] and no greater than [UNICODE_VALID_RANGE_MAX] is
 * written as a surrogate pair. Any other value is written as
 * [replacementCodepoint], or causes an [ArgumentError] when
 * [replacementCodepoint] is null.
 */
List<int> _codepointsToUtf16CodeUnits(
    List<int> codepoints,
    [int offset = 0,
     int length,
     int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

  // True when [cp] is emitted as a single, unchanged code unit.
  bool fitsInOneUnit(int cp) =>
      (cp >= 0 && cp < UNICODE_UTF16_RESERVED_LO) ||
      (cp > UNICODE_UTF16_RESERVED_HI && cp <= UNICODE_PLANE_ONE_MAX);

  // True when [cp] has to be split into a surrogate pair.
  bool needsSurrogatePair(int cp) =>
      cp > UNICODE_PLANE_ONE_MAX && cp <= UNICODE_VALID_RANGE_MAX;

  _ListRange source = new _ListRange(codepoints, offset, length);

  // First pass: size the output. Only surrogate pairs take two slots; invalid
  // values take one slot because they are replaced by a single value.
  int unitCount = 0;
  for (int cp in source) {
    if (!fitsInOneUnit(cp) && needsSurrogatePair(cp)) {
      unitCount += 2;
    } else {
      unitCount += 1;
    }
  }

  // Second pass: fill the fixed-length output.
  List<int> units = new List<int>(unitCount);
  int out = 0;
  for (int cp in source) {
    if (fitsInOneUnit(cp)) {
      units[out++] = cp;
      continue;
    }
    if (needsSurrogatePair(cp)) {
      int shifted = cp - UNICODE_UTF16_OFFSET;
      units[out++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE +
          ((shifted & UNICODE_UTF16_HI_MASK) >> 10);
      units[out++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE +
          (shifted & UNICODE_UTF16_LO_MASK);
      continue;
    }
    if (replacementCodepoint == null) {
      throw new ArgumentError("Invalid encoding");
    }
    units[out++] = replacementCodepoint;
  }
  return units;
}
143 | |
/**
 * Decodes a range of UTF-16 code units into Unicode codepoints.
 *
 * The range is taken from [utf16CodeUnits] using [offset] and [length], and
 * decoding is delegated to [Utf16CodeUnitDecoder] with the given
 * [replacementCodepoint]. The returned list holds exactly the decoded
 * codepoints.
 */
List<int> _utf16CodeUnitsToCodepoints(
    List<int> utf16CodeUnits, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRange range = new _ListRange(utf16CodeUnits, offset, length);
  _ListRangeIterator units = range.iterator;
  Utf16CodeUnitDecoder decoder =
      new Utf16CodeUnitDecoder.fromListRangeIterator(
          units, replacementCodepoint);

  // Allocate one slot per remaining code unit; the decoder never yields more
  // codepoints than that.
  List<int> buffer = new List<int>(units.remaining);
  int count = 0;
  while (decoder.moveNext()) {
    buffer[count] = decoder.current;
    count++;
  }

  // Every slot was used, so the buffer can be returned as it is.
  if (count == buffer.length) return buffer;

  // Surrogate pairs collapsed two units into one codepoint; copy the filled
  // prefix into a list of the exact size.
  List<int> exact = new List<int>(count);
  exact.setRange(0, count, buffer);
  return exact;
}
167 | |
7 /** | 168 /** |
8 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert | 169 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert |
9 * as much of the input as needed. Determines the byte order from the BOM, | 170 * as much of the input as needed. Determines the byte order from the BOM, |
10 * or uses big-endian as a default. This method always strips a leading BOM. | 171 * or uses big-endian as a default. This method always strips a leading BOM. |
11 * Set the [replacementCodepoint] to null to throw an ArgumentError | 172 * Set the [replacementCodepoint] to null to throw an ArgumentError |
12 * rather than replace the bad value. The default value for | 173 * rather than replace the bad value. The default value for |
13 * [replacementCodepoint] is U+FFFD. | 174 * [replacementCodepoint] is U+FFFD. |
14 */ | 175 */ |
15 IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0, | 176 IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0, |
16 int length, int replacementCodepoint = | 177 int length, int replacementCodepoint = |
(...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
249 List<int> truncCodeunits = new List<int>(i); | 410 List<int> truncCodeunits = new List<int>(i); |
250 truncCodeunits.setRange(0, i, codeunits); | 411 truncCodeunits.setRange(0, i, codeunits); |
251 return truncCodeunits; | 412 return truncCodeunits; |
252 } | 413 } |
253 } | 414 } |
254 | 415 |
/** The value stored by the most recent call to [moveNext]. */
int get current {
  return _current;
}
256 | 417 |
/**
 * Advances to the next UTF-16 code unit in the byte stream.
 *
 * Returns false when no bytes are left. A single dangling byte is consumed
 * and reported as [replacementCodepoint], or as an [ArgumentError] when
 * [replacementCodepoint] is null. Otherwise the next unit comes from
 * [decode].
 */
bool moveNext() {
  _current = null;
  final int bytesLeft = utf16EncodedBytesIterator.remaining;

  // End of input.
  if (bytesLeft == 0) return false;

  if (bytesLeft != 1) {
    _current = decode();
    return true;
  }

  // Exactly one byte is left, which cannot form a code unit. Consume it
  // before reporting so the position in the error message includes it.
  utf16EncodedBytesIterator.moveNext();
  if (replacementCodepoint == null) {
    throw new ArgumentError(
        "Invalid UTF16 at ${utf16EncodedBytesIterator.position}");
  }
  _current = replacementCodepoint;
  return true;
}
273 | 438 |
/**
 * The position of the underlying byte iterator, halved with integer
 * division.
 */
int get position {
  final int bytePosition = utf16EncodedBytesIterator.position;
  return bytePosition ~/ 2;
}
275 | 440 |
/**
 * Backs the underlying byte iterator up by twice [by], which defaults to 1.
 */
void backup([int by = 1]) {
  final int byteCount = 2 * by;
  utf16EncodedBytesIterator.backup(byteCount);
}
279 | 444 |
/**
 * Half the number of bytes remaining in the underlying iterator, rounded up,
 * so a single dangling byte still counts as one.
 */
int get remaining {
  final int bytesLeft = utf16EncodedBytesIterator.remaining;
  return (bytesLeft + 1) ~/ 2;
}
281 | 446 |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
328 } | 493 } |
329 | 494 |
/**
 * Reads the next two bytes and combines them into one value, treating the
 * first byte as the low 8 bits and the second as the high 8 bits.
 */
int decode() {
  // The reads must stay in this order: low byte first, then high byte.
  utf16EncodedBytesIterator.moveNext();
  final int lowByte = utf16EncodedBytesIterator.current;
  utf16EncodedBytesIterator.moveNext();
  final int highByte = utf16EncodedBytesIterator.current;
  final int shiftedHigh = highByte << 8;
  return shiftedHigh + lowByte;
}
337 } | 502 } |
OLD | NEW |