| OLD | NEW |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 part of dart.utf; | 5 part of dart.utf; |
| 6 | 6 |
| 7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). | 7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). |
| 8 /** | 8 /** |
| 9 * Provide a list of Unicode codepoints for a given string. | 9 * Provide a list of Unicode codepoints for a given string. |
| 10 */ | 10 */ |
| (...skipping 27 matching lines...) Expand all Loading... |
/**
 * Subtracted from a code point before it is split into a surrogate pair, and
 * added back when a surrogate pair is merged into a code point.
 */
const int UNICODE_UTF16_OFFSET = 0x10000;

/** Base value the upper bits are added to when forming the first surrogate. */
const int UNICODE_UTF16_SURROGATE_UNIT_0_BASE = 0xD800;

/** Base value the low ten bits are added to when forming the second surrogate. */
const int UNICODE_UTF16_SURROGATE_UNIT_1_BASE = 0xDC00;

/**
 * Selects the bits above the low ten of an offset-adjusted code point; the
 * masked value is shifted right by ten to build the first surrogate.
 */
const int UNICODE_UTF16_HI_MASK = 0xFFC00;

/** Selects the low ten bits of an offset-adjusted code point. */
const int UNICODE_UTF16_LO_MASK = 0x3FF;
| 43 | 43 |
/**
 * Encode code points as UTF16 code units.
 *
 * Reads the range of [codepoints] described by [offset] and [length] (both are
 * handed to _ListRange, which treats a null [length] as "up to the end of the
 * list") and returns a new fixed-length list of code units.
 *
 * Values that fit in a single code unit are copied unchanged. Values above
 * UNICODE_PLANE_ONE_MAX and up to UNICODE_VALID_RANGE_MAX are split into a
 * surrogate pair. Every other value is replaced by [replacementCodepoint]; if
 * [replacementCodepoint] is null an ArgumentError is thrown instead.
 *
 * A [replacementCodepoint] that itself needs a surrogate pair is written as
 * two code units rather than being stored as one oversized value.
 */
List<int> _codepointsToUtf16CodeUnits(
    List<int> codepoints,
    [int offset = 0,
     int length,
     int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

  _ListRange listRange = new _ListRange(codepoints, offset, length);

  // Number of code units emitted for every value that has to be replaced.
  // With a null replacement the second pass throws before writing anything,
  // so the size chosen for that case is irrelevant.
  int replacementLength =
      (replacementCodepoint != null &&
       _utf16NeedsSurrogatePair(replacementCodepoint)) ? 2 : 1;

  // First pass: work out how large the output buffer has to be.
  int encodedLength = 0;
  for (int value in listRange) {
    if (_utf16FitsInOneUnit(value)) {
      encodedLength++;
    } else if (_utf16NeedsSurrogatePair(value)) {
      encodedLength += 2;
    } else {
      encodedLength += replacementLength;
    }
  }

  // Second pass: fill the buffer.
  List<int> codeUnitsBuffer = new List<int>.fixedLength(encodedLength);
  int j = 0;
  for (int value in listRange) {
    if (_utf16FitsInOneUnit(value)) {
      codeUnitsBuffer[j++] = value;
    } else if (_utf16NeedsSurrogatePair(value)) {
      j = _utf16WriteSurrogatePair(codeUnitsBuffer, j, value);
    } else if (replacementCodepoint == null) {
      throw new ArgumentError("Invalid encoding");
    } else if (replacementLength == 2) {
      j = _utf16WriteSurrogatePair(codeUnitsBuffer, j, replacementCodepoint);
    } else {
      codeUnitsBuffer[j++] = replacementCodepoint;
    }
  }
  return codeUnitsBuffer;
}

/** True when [value] is encoded as exactly one UTF-16 code unit. */
bool _utf16FitsInOneUnit(int value) =>
    (value >= 0 && value < UNICODE_UTF16_RESERVED_LO) ||
    (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX);

/** True when [value] has to be encoded as a surrogate pair. */
bool _utf16NeedsSurrogatePair(int value) =>
    value > UNICODE_PLANE_ONE_MAX && value <= UNICODE_VALID_RANGE_MAX;

/**
 * Writes the surrogate pair for [codepoint] into [buffer] starting at [index]
 * and returns the index that follows the pair.
 */
int _utf16WriteSurrogatePair(List<int> buffer, int index, int codepoint) {
  int base = codepoint - UNICODE_UTF16_OFFSET;
  buffer[index] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE +
      ((base & UNICODE_UTF16_HI_MASK) >> 10);
  buffer[index + 1] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE +
      (base & UNICODE_UTF16_LO_MASK);
  return index + 2;
}
| 86 | 88 |
/**
 * Decodes the utf16 codeunits to codepoints.
 *
 * The range of [utf16CodeUnits] described by [offset] and [length] is fed
 * through a Utf16CodeUnitDecoder configured with [replacementCodepoint], and
 * the decoded code points are returned in a new fixed-length list.
 */
List<int> _utf16CodeUnitsToCodepoints(
    List<int> utf16CodeUnits, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRange range = new _ListRange(utf16CodeUnits, offset, length);
  _ListRangeIterator units = range.iterator;
  Utf16CodeUnitDecoder decoder =
      new Utf16CodeUnitDecoder.fromListRangeIterator(
          units, replacementCodepoint);

  // The buffer gets one slot per input code unit; a merged surrogate pair
  // uses two units for one code point, so fewer slots may end up filled.
  List<int> decoded = new List<int>.fixedLength(units.remaining);
  int count = 0;
  while (decoder.moveNext()) {
    decoded[count] = decoder.current;
    count++;
  }

  if (count != decoded.length) {
    // Copy the filled prefix into a list of exactly the decoded size.
    List<int> exact = new List<int>.fixedLength(count);
    exact.setRange(0, count, decoded);
    return exact;
  }
  return decoded;
}
| 110 | 112 |
/**
 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.
 * The parameters can override the default Unicode replacement character. Set
 * the replacementCharacter to null to throw an ArgumentError
 * rather than replace the bad value.
 */
class Utf16CodeUnitDecoder implements Iterator<int> {
  final _ListRangeIterator utf16CodeUnitIterator;
  final int replacementCodepoint;

  // Code point produced by the last successful moveNext(); null otherwise.
  int _current = null;

  Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
      int this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf16CodeUnitIterator =
          (new _ListRange(utf16CodeUnits, offset, length)).iterator;

  Utf16CodeUnitDecoder.fromListRangeIterator(
      _ListRangeIterator this.utf16CodeUnitIterator,
      int this.replacementCodepoint);

  Iterator<int> get iterator => this;

  int get current => _current;

  /**
   * Advances to the next code point. Returns false, leaving [current] null,
   * once the underlying code units are exhausted. Throws an ArgumentError for
   * a bad code unit when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    if (!utf16CodeUnitIterator.moveNext()) return false;

    // If decoding throws, the assignment never happens and _current stays null.
    _current = _decodeFromCurrentUnit();
    return true;
  }

  /**
   * Decodes the code point that starts at the underlying iterator's current
   * code unit, consuming the following unit when it is needed for a pair.
   */
  int _decodeFromCurrentUnit() {
    int first = utf16CodeUnitIterator.current;
    if (first < 0) {
      return _replacementOrThrow();
    }

    bool standsAlone = first < UNICODE_UTF16_RESERVED_LO ||
        (first > UNICODE_UTF16_RESERVED_HI && first <= UNICODE_PLANE_ONE_MAX);
    if (standsAlone) {
      // transfer directly
      return first;
    }

    // Only a unit below the second-surrogate base can open a pair, and a
    // following unit has to exist. The underlying iterator is advanced only
    // when the first test passes.
    if (first >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE ||
        !utf16CodeUnitIterator.moveNext()) {
      return _replacementOrThrow();
    }

    int second = utf16CodeUnitIterator.current;
    if (second >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
        second <= UNICODE_UTF16_RESERVED_HI) {
      // merge surrogate pair
      int upper = (first - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10;
      int lower = second - UNICODE_UTF16_SURROGATE_UNIT_1_BASE;
      return upper + UNICODE_UTF16_OFFSET + lower;
    }

    if (second >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE &&
        second < UNICODE_UTF16_SURROGATE_UNIT_1_BASE) {
      // The following unit could open a pair of its own, so step back and
      // let the next moveNext() read it again.
      utf16CodeUnitIterator.backup();
    }
    return _replacementOrThrow();
  }

  /**
   * Returns [replacementCodepoint], or throws an ArgumentError naming the
   * underlying iterator's position when no replacement is configured.
   */
  int _replacementOrThrow() {
    if (replacementCodepoint != null) {
      return replacementCodepoint;
    }
    throw new ArgumentError(
        "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
  }
}
| 178 | 185 |
/**
 * _ListRange is an internal type used to create a lightweight Iterable on a
 * range within a source list. DO NOT MODIFY the underlying list while
 * iterating over it. The results of doing so are undefined.
 *
 * The range starts at the given offset and covers the given number of
 * elements; a null length means everything from the offset to the end of the
 * source list. The constructor throws a RangeError when the offset is negative
 * or beyond the end of the source, when the length is negative, or when the
 * range would run past the end of the source.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class _ListRange extends Iterable {
  final List _source;
  final int _offset;
  final int _length;

  _ListRange(source, [offset = 0, length]) :
      _source = source,
      _offset = offset,
      _length = (length != null ? length : source.length - offset) {
    if (_offset < 0 || _offset > _source.length) {
      throw new RangeError.value(_offset);
    }
    if (_length != null && _length < 0) {
      throw new RangeError.value(_length);
    }
    int end = _length + _offset;
    if (end > _source.length) {
      throw new RangeError.value(end);
    }
  }

  /** A fresh iterator over the source elements covered by this range. */
  _ListRangeIterator get iterator {
    return new _ListRangeIteratorImpl(_source, _offset, _offset + _length);
  }

  /** Number of elements in the range. */
  int get length {
    return _length;
  }
}
| 209 | 218 |
/**
 * The _ListRangeIterator provides more capabilities than a standard iterator,
 * including the ability to get the current position, count remaining items,
 * and move forward/backward within the iterator.
 */
abstract class _ListRangeIterator implements Iterator<int> {
  // Standard Iterator protocol.
  bool moveNext();
  int get current;

  // Inspection of the cursor.

  /** The current position of the iterator. */
  int get position;

  /** How many items are still to come. */
  int get remaining;

  // Explicit cursor movement.

  /** Moves the iterator backward by [by] positions. */
  void backup([by]);

  /** Moves the iterator forward by [count] positions. */
  void skip([count]);
}
| 223 | 232 |
/**
 * Cursor over the elements of a source list from a start index (inclusive) to
 * an end index (exclusive).
 *
 * The cursor starts one position before the start index, so the first call to
 * [moveNext] lands on the first element of the range.
 */
class _ListRangeIteratorImpl implements _ListRangeIterator {
  final List<int> _source;

  // First index of the range; used to tell when the cursor sits before it.
  final int _start;

  // Index of the current element within _source.
  int _offset;

  // Index one past the last element of the range.
  final int _end;

  _ListRangeIteratorImpl(this._source, int offset, this._end)
      : _start = offset,
        _offset = offset - 1;

  /**
   * The element under the cursor, or null when the cursor is outside the
   * range (before the first [moveNext], or after the range is exhausted).
   * Without this guard the getter would index the source at offset - 1 or at
   * the end index, throwing or returning an element outside the range.
   */
  int get current =>
      (_offset >= _start && _offset < _end) ? _source[_offset] : null;

  /** Advances the cursor; returns whether it is still inside the range. */
  bool moveNext() => ++_offset < _end;

  /** Index into the source list of the cursor. */
  int get position => _offset;

  /** Moves the cursor back by [by] positions. */
  void backup([int by = 1]) {
    _offset -= by;
  }

  /** Number of elements that follow the cursor's current position. */
  int get remaining => _end - _offset - 1;

  /** Moves the cursor forward by [count] positions. */
  void skip([int count = 1]) {
    _offset += count;
  }
}
| 247 | 257 |
| OLD | NEW |