| OLD | NEW |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 /** | 5 /** |
| 6 * Support for encoding and decoding Unicode characters in UTF-8, UTF-16, and | 6 * Support for encoding and decoding Unicode characters in UTF-8, UTF-16, and |
| 7 * UTF-32. | 7 * UTF-32. |
| 8 */ | 8 */ |
| 9 library utf; | 9 library utf; |
| 10 | 10 |
| 11 import "dart:async"; | 11 import "dart:async"; |
| 12 import "dart:collection"; | 12 import "dart:collection"; |
| 13 | 13 |
| 14 part "constants.dart"; |
| 15 part "list_range.dart"; |
| 14 part "utf_stream.dart"; | 16 part "utf_stream.dart"; |
| 15 part "utf8.dart"; | 17 part "utf8.dart"; |
| 16 part "utf16.dart"; | 18 part "utf16.dart"; |
| 17 part "utf32.dart"; | 19 part "utf32.dart"; |
| 18 | |
// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
/**
 * Returns the Unicode codepoints that make up [str].
 *
 * The string's code units are passed to [_utf16CodeUnitsToCodepoints], which
 * performs the actual conversion.
 */
List<int> stringToCodepoints(String str) {
  // str.codeUnits yields 16-bit code units on every Dart implementation, so
  // they have to be decoded before they can be reported as codepoints.
  List<int> codeUnits = str.codeUnits;
  return _utf16CodeUnitsToCodepoints(codeUnits);
}
| 28 | |
/**
 * Builds a string out of the given Unicode [codepoints].
 *
 * *Deprecated* Use [String.fromCharCodes] instead; this function only
 * forwards to it.
 */
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);
| 37 | |
/**
 * Invalid codepoints or encodings may be substituted with the value U+fffd.
 */
const int UNICODE_REPLACEMENT_CHARACTER_CODEPOINT = 0xFFFD;

/** The byte order mark, followed by its low and high bytes. */
const int UNICODE_BOM = 0xFEFF;
const int UNICODE_UTF_BOM_LO = 0xFF;
const int UNICODE_UTF_BOM_HI = 0xFE;

/** Bit masks selecting the lowest byte and the second-lowest byte. */
const int UNICODE_BYTE_ZERO_MASK = 0xFF;
const int UNICODE_BYTE_ONE_MASK = 0xFF00;

/** Largest value accepted as a codepoint by the encoders in this library. */
const int UNICODE_VALID_RANGE_MAX = 0x10FFFF;

/** Largest codepoint that fits in a single 16-bit code unit. */
const int UNICODE_PLANE_ONE_MAX = 0xFFFF;

/** Inclusive bounds of the range reserved for UTF-16 surrogates. */
const int UNICODE_UTF16_RESERVED_LO = 0xD800;
const int UNICODE_UTF16_RESERVED_HI = 0xDFFF;

/** Amount subtracted from a codepoint before splitting it into surrogates. */
const int UNICODE_UTF16_OFFSET = 0x10000;

/** Base values of the first and second code unit of a surrogate pair. */
const int UNICODE_UTF16_SURROGATE_UNIT_0_BASE = 0xD800;
const int UNICODE_UTF16_SURROGATE_UNIT_1_BASE = 0xDC00;

/** Masks for the upper and the lower ten bits of the 20-bit pair payload. */
const int UNICODE_UTF16_HI_MASK = 0xFFC00;
const int UNICODE_UTF16_LO_MASK = 0x3FF;
| 57 | |
/**
 * Returns how many UTF-16 code units [codepoint] occupies: 1 for a value
 * outside the surrogate range that is at most [UNICODE_PLANE_ONE_MAX], 2 for
 * a value above that and at most [UNICODE_VALID_RANGE_MAX], and 0 when the
 * value is negative, a surrogate, or too large to be encoded.
 */
int _utf16CodeUnitCount(int codepoint) {
  if ((codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO) ||
      (codepoint > UNICODE_UTF16_RESERVED_HI &&
       codepoint <= UNICODE_PLANE_ONE_MAX)) {
    return 1;
  }
  if (codepoint > UNICODE_PLANE_ONE_MAX &&
      codepoint <= UNICODE_VALID_RANGE_MAX) {
    return 2;
  }
  return 0;
}

/**
 * Encode code points as UTF16 code units.
 *
 * Encodes the [length] entries of [codepoints] starting at [offset]; the
 * range is validated by [_ListRange]. Values that cannot be encoded are
 * replaced by [replacementCodepoint]. If [replacementCodepoint] is null an
 * [ArgumentError] is thrown for the first such value instead.
 *
 * A [replacementCodepoint] above [UNICODE_PLANE_ONE_MAX] is emitted as a
 * surrogate pair. A [replacementCodepoint] that is not itself encodable is
 * stored unchanged as a single entry, as before.
 *
 * Returns a fixed-length list holding the code units.
 */
List<int> _codepointsToUtf16CodeUnits(
    List<int> codepoints,
    [int offset = 0,
     int length,
     int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

  _ListRange listRange = new _ListRange(codepoints, offset, length);

  // Number of slots one substitution occupies in the output.
  int replacementUnits = 1;
  if (replacementCodepoint != null &&
      _utf16CodeUnitCount(replacementCodepoint) == 2) {
    replacementUnits = 2;
  }

  // First pass: determine the exact size of the output buffer. Failing here
  // avoids allocating a buffer that would be thrown away.
  int encodedLength = 0;
  for (int value in listRange) {
    int units = _utf16CodeUnitCount(value);
    if (units == 0) {
      if (replacementCodepoint == null) {
        throw new ArgumentError("Invalid encoding");
      }
      units = replacementUnits;
    }
    encodedLength += units;
  }

  // Second pass: write the code units.
  List<int> codeUnitsBuffer = new List<int>(encodedLength);
  int j = 0;
  for (int value in listRange) {
    int toEncode = value;
    int units = _utf16CodeUnitCount(value);
    if (units == 0) {
      toEncode = replacementCodepoint;
      units = replacementUnits;
    }
    if (units == 2) {
      // Split the 20-bit payload into two 10-bit halves.
      int base = toEncode - UNICODE_UTF16_OFFSET;
      codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE +
          ((base & UNICODE_UTF16_HI_MASK) >> 10);
      codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE +
          (base & UNICODE_UTF16_LO_MASK);
    } else {
      codeUnitsBuffer[j++] = toEncode;
    }
  }
  return codeUnitsBuffer;
}
| 102 | |
/**
 * Decodes the utf16 codeunits to codepoints.
 *
 * Reads [length] code units of [utf16CodeUnits] starting at [offset] through
 * a [Utf16CodeUnitDecoder] configured with [replacementCodepoint], and
 * returns the decoded codepoints in a fixed-length list.
 */
List<int> _utf16CodeUnitsToCodepoints(
    List<int> utf16CodeUnits, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRange range = new _ListRange(utf16CodeUnits, offset, length);
  _ListRangeIterator source = range.iterator;
  Utf16CodeUnitDecoder decoder =
      new Utf16CodeUnitDecoder.fromListRangeIterator(
          source, replacementCodepoint);

  // Allocate one slot per code unit still to be read; the result is cut down
  // below when decoding produced fewer codepoints than that.
  List<int> decoded = new List<int>(source.remaining);
  int count = 0;
  while (decoder.moveNext()) {
    decoded[count] = decoder.current;
    count++;
  }

  if (count != decoded.length) {
    List<int> trimmed = new List<int>(count);
    trimmed.setRange(0, count, decoded);
    return trimmed;
  }
  return decoded;
}
| 126 | |
/**
 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.
 * The parameters can override the default Unicode replacement character. Set
 * the replacementCodepoint to null to throw an ArgumentError
 * rather than replace the bad value.
 *
 * Each invalid code unit (a negative value, a value above
 * [UNICODE_PLANE_ONE_MAX], or a surrogate without its partner) yields exactly
 * one replacement. The code unit that follows an unpaired first surrogate is
 * never discarded; it is decoded by the next call to [moveNext].
 */
class Utf16CodeUnitDecoder implements Iterator<int> {
  final _ListRangeIterator utf16CodeUnitIterator;
  final int replacementCodepoint;
  int _current = null;

  /**
   * Decodes the [length] code units of [utf16CodeUnits] starting at
   * [offset]; the range is validated by [_ListRange].
   */
  Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
      int this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf16CodeUnitIterator =
          (new _ListRange(utf16CodeUnits, offset, length)).iterator;

  /** Decodes the code units delivered by an existing range iterator. */
  Utf16CodeUnitDecoder.fromListRangeIterator(
      _ListRangeIterator this.utf16CodeUnitIterator,
      int this.replacementCodepoint);

  Iterator<int> get iterator => this;

  /** The codepoint produced by the last successful [moveNext], or null. */
  int get current => _current;

  /**
   * Decodes the next codepoint into [current]. Returns false once the
   * underlying code units are exhausted. Throws an [ArgumentError] on invalid
   * input when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    if (!utf16CodeUnitIterator.moveNext()) return false;

    int value = utf16CodeUnitIterator.current;

    // A non-surrogate 16-bit value is its own codepoint.
    if (value >= 0 &&
        (value < UNICODE_UTF16_RESERVED_LO ||
         (value > UNICODE_UTF16_RESERVED_HI &&
          value <= UNICODE_PLANE_ONE_MAX))) {
      _current = value;
      return true;
    }

    // A first surrogate must be followed by a second surrogate.
    if (value >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE &&
        value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
        utf16CodeUnitIterator.moveNext()) {
      int nextValue = utf16CodeUnitIterator.current;
      if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
          nextValue <= UNICODE_UTF16_RESERVED_HI) {
        // merge surrogate pair
        _current = UNICODE_UTF16_OFFSET +
            ((value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10) +
            (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);
        return true;
      }
      // The follower does not complete the pair. Un-read it so that it is
      // decoded on its own instead of being swallowed with the bad unit.
      utf16CodeUnitIterator.backup();
    }

    _current = _replacementOrThrow();
    return true;
  }

  /**
   * Returns [replacementCodepoint], or throws an [ArgumentError] naming the
   * iterator's current position when no replacement is configured.
   */
  int _replacementOrThrow() {
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
    }
    return replacementCodepoint;
  }
}
| 199 | |
/**
 * _ListRange is an internal type used to create a lightweight Iterable on a
 * range within a source list. DO NOT MODIFY the underlying list while
 * iterating over it. The results of doing so are undefined.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class _ListRange extends IterableBase {
  final List _source;
  final int _offset;
  final int _length;

  /**
   * Creates a view of [length] elements of [source] starting at [offset].
   * When [length] is omitted the view runs to the end of [source].
   *
   * Throws a [RangeError] if [offset] lies outside of [source], if the
   * length is negative, or if the range extends past the end of [source].
   */
  _ListRange(source, [offset = 0, length]) :
      this._source = source,
      this._offset = offset,
      this._length = (length != null ? length : source.length - offset) {
    int sourceLength = _source.length;
    if (_offset < 0 || _offset > sourceLength) {
      throw new RangeError.value(_offset);
    }
    // _length is never null here: it is either the caller's non-null value or
    // the computed remainder.
    if (_length < 0) {
      throw new RangeError.value(_length);
    }
    int end = _length + _offset;
    if (end > sourceLength) {
      throw new RangeError.value(end);
    }
  }

  /** A fresh iterator positioned before the first element of the range. */
  _ListRangeIterator get iterator {
    int end = _offset + _length;
    return new _ListRangeIteratorImpl(_source, _offset, end);
  }

  /** The number of elements in the range. */
  int get length => _length;
}
| 232 | |
/**
 * The _ListRangeIterator provides more capabilities than a standard iterator,
 * including the ability to get the current position, count remaining items,
 * and move forward/backward within the iterator.
 */
abstract class _ListRangeIterator implements Iterator<int> {
  // Standard Iterator<int> members.
  int get current;
  bool moveNext();

  // Introspection of the iterator's state.
  int get position;
  int get remaining;

  // Repositioning in either direction.
  void skip([count]);
  void backup([by]);
}
| 246 | |
/**
 * [_ListRangeIterator] backed by a list and a pair of indices into it.
 */
class _ListRangeIteratorImpl implements _ListRangeIterator {
  final List<int> _source;
  int _offset;
  final int _end;

  /**
   * Iterates over [_source] from index [offset] up to, but not including,
   * [_end]. The cursor starts one slot before [offset], so the first
   * [moveNext] lands on the first element.
   */
  _ListRangeIteratorImpl(this._source, int offset, this._end)
      : _offset = offset - 1;

  /** The element under the cursor. */
  int get current {
    return _source[_offset];
  }

  /** Advances the cursor and reports whether it is still before [_end]. */
  bool moveNext() {
    _offset = _offset + 1;
    return _offset < _end;
  }

  /** The index of the cursor within the source list. */
  int get position {
    return _offset;
  }

  /** Moves the cursor back by [by] elements. */
  void backup([int by = 1]) {
    _offset = _offset - by;
  }

  /** The number of elements between the cursor and [_end]. */
  int get remaining {
    return _end - _offset - 1;
  }

  /** Moves the cursor forward by [count] elements. */
  void skip([int count = 1]) {
    _offset = _offset + count;
  }
}
| 271 | |
| OLD | NEW |