OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.utf; | 5 part of dart.utf; |
6 | 6 |
7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). | 7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). |
8 /** | 8 /** |
9 * Provide a list of Unicode codepoints for a given string. | 9 * Provide a list of Unicode codepoints for a given string. |
10 */ | 10 */ |
(...skipping 27 matching lines...) Expand all Loading... |
38 const int UNICODE_UTF16_OFFSET = 0x10000; | 38 const int UNICODE_UTF16_OFFSET = 0x10000; |
39 const int UNICODE_UTF16_SURROGATE_UNIT_0_BASE = 0xd800; | 39 const int UNICODE_UTF16_SURROGATE_UNIT_0_BASE = 0xd800; |
40 const int UNICODE_UTF16_SURROGATE_UNIT_1_BASE = 0xdc00; | 40 const int UNICODE_UTF16_SURROGATE_UNIT_1_BASE = 0xdc00; |
41 const int UNICODE_UTF16_HI_MASK = 0xffc00; | 41 const int UNICODE_UTF16_HI_MASK = 0xffc00; |
42 const int UNICODE_UTF16_LO_MASK = 0x3ff; | 42 const int UNICODE_UTF16_LO_MASK = 0x3ff; |
43 | 43 |
44 /** | 44 /** |
45 * Encode code points as UTF16 code units. | 45 * Encode code points as UTF16 code units. |
46 */ | 46 */ |
47 List<int> _codepointsToUtf16CodeUnits( | 47 List<int> _codepointsToUtf16CodeUnits( |
48 List<int> codepoints, [int offset = 0, int length, | 48 List<int> codepoints, |
49 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 49 [int offset = 0, |
| 50 int length, |
| 51 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
50 | 52 |
51 _ListRange listRange = new _ListRange(codepoints, offset, length); | 53 _ListRange listRange = new _ListRange(codepoints, offset, length); |
52 int encodedLength = 0; | 54 int encodedLength = 0; |
53 for (int value in listRange) { | 55 for (int value in listRange) { |
54 if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || | 56 if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || |
55 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { | 57 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { |
56 encodedLength++; | 58 encodedLength++; |
57 } else if (value > UNICODE_PLANE_ONE_MAX && | 59 } else if (value > UNICODE_PLANE_ONE_MAX && |
58 value <= UNICODE_VALID_RANGE_MAX) { | 60 value <= UNICODE_VALID_RANGE_MAX) { |
59 encodedLength += 2; | 61 encodedLength += 2; |
60 } else { | 62 } else { |
61 encodedLength++; | 63 encodedLength++; |
62 } | 64 } |
63 } | 65 } |
64 | 66 |
65 List<int> codeUnitsBuffer = new List<int>(encodedLength); | 67 List<int> codeUnitsBuffer = new List<int>.fixedLength(encodedLength); |
66 int j = 0; | 68 int j = 0; |
67 for (int value in listRange) { | 69 for (int value in listRange) { |
68 if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || | 70 if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) || |
69 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { | 71 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { |
70 codeUnitsBuffer[j++] = value; | 72 codeUnitsBuffer[j++] = value; |
71 } else if (value > UNICODE_PLANE_ONE_MAX && | 73 } else if (value > UNICODE_PLANE_ONE_MAX && |
72 value <= UNICODE_VALID_RANGE_MAX) { | 74 value <= UNICODE_VALID_RANGE_MAX) { |
73 int base = value - UNICODE_UTF16_OFFSET; | 75 int base = value - UNICODE_UTF16_OFFSET; |
74 codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE + | 76 codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE + |
75 ((base & UNICODE_UTF16_HI_MASK) >> 10); | 77 ((base & UNICODE_UTF16_HI_MASK) >> 10); |
76 codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE + | 78 codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE + |
77 (base & UNICODE_UTF16_LO_MASK); | 79 (base & UNICODE_UTF16_LO_MASK); |
78 } else if (replacementCodepoint != null) { | 80 } else if (replacementCodepoint != null) { |
79 codeUnitsBuffer[j++] = replacementCodepoint; | 81 codeUnitsBuffer[j++] = replacementCodepoint; |
80 } else { | 82 } else { |
81 throw new ArgumentError("Invalid encoding"); | 83 throw new ArgumentError("Invalid encoding"); |
82 } | 84 } |
83 } | 85 } |
84 return codeUnitsBuffer; | 86 return codeUnitsBuffer; |
85 } | 87 } |
86 | 88 |
87 /** | 89 /** |
88 * Decodes UTF-16 code units to codepoints. | 90 * Decodes UTF-16 code units to codepoints. |
89 */ | 91 */ |
90 List<int> _utf16CodeUnitsToCodepoints( | 92 List<int> _utf16CodeUnitsToCodepoints( |
91 List<int> utf16CodeUnits, [int offset = 0, int length, | 93 List<int> utf16CodeUnits, [int offset = 0, int length, |
92 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 94 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
93 _ListRangeIterator source = | 95 _ListRangeIterator source = |
94 (new _ListRange(utf16CodeUnits, offset, length)).iterator(); | 96 (new _ListRange(utf16CodeUnits, offset, length)).iterator; |
95 Utf16CodeUnitDecoder decoder = new Utf16CodeUnitDecoder | 97 Utf16CodeUnitDecoder decoder = new Utf16CodeUnitDecoder |
96 .fromListRangeIterator(source, replacementCodepoint); | 98 .fromListRangeIterator(source, replacementCodepoint); |
97 List<int> codepoints = new List<int>(source.remaining); | 99 List<int> codepoints = new List<int>.fixedLength(source.remaining); |
98 int i = 0; | 100 int i = 0; |
99 while (decoder.hasNext) { | 101 while (decoder.moveNext()) { |
100 codepoints[i++] = decoder.next(); | 102 codepoints[i++] = decoder.current; |
101 } | 103 } |
102 if (i == codepoints.length) { | 104 if (i == codepoints.length) { |
103 return codepoints; | 105 return codepoints; |
104 } else { | 106 } else { |
105 List<int> codepointTrunc = new List<int>(i); | 107 List<int> codepointTrunc = new List<int>.fixedLength(i); |
106 codepointTrunc.setRange(0, i, codepoints); | 108 codepointTrunc.setRange(0, i, codepoints); |
107 return codepointTrunc; | 109 return codepointTrunc; |
108 } | 110 } |
109 } | 111 } |
110 | 112 |
111 /** | 113 /** |
112 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units. | 114 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units. |
113 * The parameters can override the default Unicode replacement character. Set | 115 * The parameters can override the default Unicode replacement character. Set |
114 * the replacementCodepoint to null to throw an ArgumentError | 116 * the replacementCodepoint to null to throw an ArgumentError |
115 * rather than replace the bad value. | 117 * rather than replace the bad value. |
116 */ | 118 */ |
117 class Utf16CodeUnitDecoder implements Iterator<int> { | 119 class Utf16CodeUnitDecoder implements Iterator<int> { |
118 final _ListRangeIterator utf16CodeUnitIterator; | 120 final _ListRangeIterator utf16CodeUnitIterator; |
119 final int replacementCodepoint; | 121 final int replacementCodepoint; |
| 122 int _current = null; |
120 | 123 |
121 Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length, | 124 Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length, |
122 int this.replacementCodepoint = | 125 int this.replacementCodepoint = |
123 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 126 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
124 utf16CodeUnitIterator = (new _ListRange(utf16CodeUnits, offset, length)) | 127 utf16CodeUnitIterator = |
125 .iterator(); | 128 (new _ListRange(utf16CodeUnits, offset, length)).iterator; |
126 | 129 |
127 Utf16CodeUnitDecoder.fromListRangeIterator( | 130 Utf16CodeUnitDecoder.fromListRangeIterator( |
128 _ListRangeIterator this.utf16CodeUnitIterator, | 131 _ListRangeIterator this.utf16CodeUnitIterator, |
129 int this.replacementCodepoint); | 132 int this.replacementCodepoint); |
130 | 133 |
131 Iterator<int> iterator() => this; | 134 Iterator<int> get iterator => this; |
132 | 135 |
133 bool get hasNext => utf16CodeUnitIterator.hasNext; | 136 int get current => _current; |
134 | 137 |
135 int next() { | 138 bool moveNext() { |
136 int value = utf16CodeUnitIterator.next(); | 139 _current = null; |
| 140 if (!utf16CodeUnitIterator.moveNext()) return false; |
| 141 |
| 142 int value = utf16CodeUnitIterator.current; |
137 if (value < 0) { | 143 if (value < 0) { |
138 if (replacementCodepoint != null) { | 144 if (replacementCodepoint != null) { |
139 return replacementCodepoint; | 145 _current = replacementCodepoint; |
140 } else { | 146 } else { |
141 throw new ArgumentError( | 147 throw new ArgumentError( |
142 "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); | 148 "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); |
143 } | 149 } |
144 } else if (value < UNICODE_UTF16_RESERVED_LO || | 150 } else if (value < UNICODE_UTF16_RESERVED_LO || |
145 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { | 151 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) { |
146 // transfer directly | 152 // transfer directly |
147 return value; | 153 _current = value; |
148 } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE && | 154 } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE && |
149 utf16CodeUnitIterator.hasNext) { | 155 utf16CodeUnitIterator.moveNext()) { |
150 // merge surrogate pair | 156 // merge surrogate pair |
151 int nextValue = utf16CodeUnitIterator.next(); | 157 int nextValue = utf16CodeUnitIterator.current; |
152 if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE && | 158 if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE && |
153 nextValue <= UNICODE_UTF16_RESERVED_HI) { | 159 nextValue <= UNICODE_UTF16_RESERVED_HI) { |
154 value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10; | 160 value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10; |
155 value += UNICODE_UTF16_OFFSET + | 161 value += UNICODE_UTF16_OFFSET + |
156 (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE); | 162 (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE); |
157 return value; | 163 _current = value; |
158 } else { | 164 } else { |
159 if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE && | 165 if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE && |
160 nextValue < UNICODE_UTF16_SURROGATE_UNIT_1_BASE) { | 166 nextValue < UNICODE_UTF16_SURROGATE_UNIT_1_BASE) { |
161 utf16CodeUnitIterator.backup(); | 167 utf16CodeUnitIterator.backup(); |
162 } | 168 } |
163 if (replacementCodepoint != null) { | 169 if (replacementCodepoint != null) { |
164 return replacementCodepoint; | 170 _current = replacementCodepoint; |
165 } else { | 171 } else { |
166 throw new ArgumentError( | 172 throw new ArgumentError( |
167 "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); | 173 "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); |
168 } | 174 } |
169 } | 175 } |
170 } else if (replacementCodepoint != null) { | 176 } else if (replacementCodepoint != null) { |
171 return replacementCodepoint; | 177 _current = replacementCodepoint; |
172 } else { | 178 } else { |
173 throw new ArgumentError( | 179 throw new ArgumentError( |
174 "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); | 180 "Invalid UTF16 at ${utf16CodeUnitIterator.position}"); |
175 } | 181 } |
| 182 return true; |
176 } | 183 } |
177 } | 184 } |
178 | 185 |
179 /** | 186 /** |
180 * _ListRange is an internal type used to create a lightweight Iterable on a | 187 * _ListRange is an internal type used to create a lightweight Iterable on a |
181 * range within a source list. DO NOT MODIFY the underlying list while | 188 * range within a source list. DO NOT MODIFY the underlying list while |
182 * iterating over it. The results of doing so are undefined. | 189 * iterating over it. The results of doing so are undefined. |
183 */ | 190 */ |
184 class _ListRange implements Iterable { | 191 // TODO(floitsch): Consider removing the extends and switch to implements since |
| 192 // that's cheaper to allocate. |
| 193 class _ListRange extends Iterable { |
185 final List _source; | 194 final List _source; |
186 final int _offset; | 195 final int _offset; |
187 final int _length; | 196 final int _length; |
188 | 197 |
189 _ListRange(source, [offset = 0, length]) : | 198 _ListRange(source, [offset = 0, length]) : |
190 this._source = source, | 199 this._source = source, |
191 this._offset = offset, | 200 this._offset = offset, |
192 this._length = (length == null ? source.length - offset : length) { | 201 this._length = (length == null ? source.length - offset : length) { |
193 if (_offset < 0 || _offset > _source.length) { | 202 if (_offset < 0 || _offset > _source.length) { |
194 throw new RangeError.value(_offset); | 203 throw new RangeError.value(_offset); |
195 } | 204 } |
196 if (_length != null && (_length < 0)) { | 205 if (_length != null && (_length < 0)) { |
197 throw new RangeError.value(_length); | 206 throw new RangeError.value(_length); |
198 } | 207 } |
199 if (_length + _offset > _source.length) { | 208 if (_length + _offset > _source.length) { |
200 throw new RangeError.value(_length + _offset); | 209 throw new RangeError.value(_length + _offset); |
201 } | 210 } |
202 } | 211 } |
203 | 212 |
204 _ListRangeIterator iterator() => | 213 _ListRangeIterator get iterator => |
205 new _ListRangeIteratorImpl(_source, _offset, _offset + _length); | 214 new _ListRangeIteratorImpl(_source, _offset, _offset + _length); |
206 | 215 |
207 int get length => _length; | 216 int get length => _length; |
208 } | 217 } |
209 | 218 |
210 /** | 219 /** |
211 * The _ListRangeIterator provides more capabilities than a standard iterator, | 220 * The _ListRangeIterator provides more capabilities than a standard iterator, |
212 * including the ability to get the current position, count remaining items, | 221 * including the ability to get the current position, count remaining items, |
213 * and move forward/backward within the iterator. | 222 * and move forward/backward within the iterator. |
214 */ | 223 */ |
215 abstract class _ListRangeIterator implements Iterator<int> { | 224 abstract class _ListRangeIterator implements Iterator<int> { |
216 bool hasNext; | 225 bool moveNext(); |
217 int next(); | 226 int get current; |
218 int get position; | 227 int get position; |
219 void backup([by]); | 228 void backup([by]); |
220 int get remaining; | 229 int get remaining; |
221 void skip([count]); | 230 void skip([count]); |
222 } | 231 } |
223 | 232 |
224 class _ListRangeIteratorImpl implements _ListRangeIterator { | 233 class _ListRangeIteratorImpl implements _ListRangeIterator { |
225 final List<int> _source; | 234 final List<int> _source; |
226 int _offset; | 235 int _offset; |
227 final int _end; | 236 final int _end; |
228 | 237 |
229 _ListRangeIteratorImpl(this._source, this._offset, this._end); | 238 _ListRangeIteratorImpl(this._source, int offset, this._end) |
| 239 : _offset = offset - 1; |
230 | 240 |
231 bool get hasNext => _offset < _end; | 241 int get current => _source[_offset]; |
232 | 242 |
233 int next() => _source[_offset++]; | 243 bool moveNext() => ++_offset < _end; |
234 | 244 |
235 int get position => _offset; | 245 int get position => _offset; |
236 | 246 |
237 void backup([int by = 1]) { | 247 void backup([int by = 1]) { |
238 _offset -= by; | 248 _offset -= by; |
239 } | 249 } |
240 | 250 |
241 int get remaining => _end - _offset; | 251 int get remaining => _end - _offset - 1; |
242 | 252 |
243 void skip([int count = 1]) { | 253 void skip([int count = 1]) { |
244 _offset += count; | 254 _offset += count; |
245 } | 255 } |
246 } | 256 } |
247 | 257 |
OLD | NEW |