OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 /** | 5 /** |
6 * Support for encoding and decoding Unicode characters in UTF-8, UTF-16, and | 6 * Support for encoding and decoding Unicode characters in UTF-8, UTF-16, and |
7 * UTF-32. | 7 * UTF-32. |
8 */ | 8 */ |
9 library utf; | 9 library utf; |
10 | 10 |
11 import "dart:async"; | 11 import "dart:async"; |
12 import "dart:collection"; | 12 import "dart:collection"; |
13 | 13 |
| 14 part "constants.dart"; |
| 15 part "list_range.dart"; |
14 part "utf_stream.dart"; | 16 part "utf_stream.dart"; |
15 part "utf8.dart"; | 17 part "utf8.dart"; |
16 part "utf16.dart"; | 18 part "utf16.dart"; |
17 part "utf32.dart"; | 19 part "utf32.dart"; |
18 | |
19 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501). | |
/**
 * Returns the list of Unicode codepoints that make up [str].
 *
 * The string's UTF-16 code units are handed to [_utf16CodeUnitsToCodepoints]
 * with its default offset, length and replacement codepoint.
 */
List<int> stringToCodepoints(String str) {
  // str.codeUnits yields 16-bit code units on all Dart implementations, so
  // they have to be decoded into codepoints rather than returned directly.
  List<int> codeUnits = str.codeUnits;
  return _utf16CodeUnitsToCodepoints(codeUnits);
}
28 | |
/**
 * Builds a [String] from the given Unicode [codepoints].
 *
 * *Deprecated* Use [String.fromCharCodes] instead; this function only
 * forwards to that constructor.
 */
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);
37 | |
/**
 * Invalid codepoints or encodings may be substituted with this value
 * (U+FFFD).
 */
const int UNICODE_REPLACEMENT_CHARACTER_CODEPOINT = 0xFFFD;

/** The byte order mark value, U+FEFF. */
const int UNICODE_BOM = 0xFEFF;

/** The low-order byte of [UNICODE_BOM]. */
const int UNICODE_UTF_BOM_LO = 0xFF;

/** The high-order byte of [UNICODE_BOM]. */
const int UNICODE_UTF_BOM_HI = 0xFE;

/** Mask selecting bits 0-7 of a value. */
const int UNICODE_BYTE_ZERO_MASK = 0xFF;

/** Mask selecting bits 8-15 of a value. */
const int UNICODE_BYTE_ONE_MASK = 0xFF00;

/** Largest value accepted as a codepoint by the UTF-16 encoder. */
const int UNICODE_VALID_RANGE_MAX = 0x10FFFF;

/** Largest codepoint that is written as a single UTF-16 code unit. */
const int UNICODE_PLANE_ONE_MAX = 0xFFFF;

/** First value of the range reserved for UTF-16 surrogates. */
const int UNICODE_UTF16_RESERVED_LO = 0xD800;

/** Last value of the range reserved for UTF-16 surrogates. */
const int UNICODE_UTF16_RESERVED_HI = 0xDFFF;

/**
 * Amount subtracted from a codepoint above [UNICODE_PLANE_ONE_MAX] before it
 * is split into a surrogate pair, and added back when a pair is merged.
 */
const int UNICODE_UTF16_OFFSET = 0x10000;

/** Base value of the first code unit of a surrogate pair. */
const int UNICODE_UTF16_SURROGATE_UNIT_0_BASE = 0xD800;

/** Base value of the second code unit of a surrogate pair. */
const int UNICODE_UTF16_SURROGATE_UNIT_1_BASE = 0xDC00;

/** Mask selecting bits 10-19, the part carried by the first surrogate. */
const int UNICODE_UTF16_HI_MASK = 0xFFC00;

/** Mask selecting bits 0-9, the part carried by the second surrogate. */
const int UNICODE_UTF16_LO_MASK = 0x3FF;
57 | |
/**
 * Encodes a range of [codepoints] as UTF-16 code units.
 *
 * The range starts at [offset] and covers [length] entries (through the end
 * of the list when [length] is null). Codepoints outside the surrogate range
 * and up to [UNICODE_PLANE_ONE_MAX] are copied unchanged; codepoints above
 * that and up to [UNICODE_VALID_RANGE_MAX] become a surrogate pair. Any other
 * value is written as [replacementCodepoint], or causes an [ArgumentError]
 * when [replacementCodepoint] is null.
 */
List<int> _codepointsToUtf16CodeUnits(
    List<int> codepoints,
    [int offset = 0,
     int length,
     int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

  // True for values that are written out as exactly one code unit, unchanged.
  bool isSingleUnit(int codepoint) =>
      (codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO) ||
      (codepoint > UNICODE_UTF16_RESERVED_HI &&
       codepoint <= UNICODE_PLANE_ONE_MAX);

  // True for values that are written out as a surrogate pair.
  bool needsSurrogatePair(int codepoint) =>
      codepoint > UNICODE_PLANE_ONE_MAX &&
      codepoint <= UNICODE_VALID_RANGE_MAX;

  _ListRange range = new _ListRange(codepoints, offset, length);

  // First pass: work out how many code units the output needs. Invalid
  // values count as one unit because they are replaced by a single value.
  int unitCount = 0;
  for (int codepoint in range) {
    if (!isSingleUnit(codepoint) && needsSurrogatePair(codepoint)) {
      unitCount += 2;
    } else {
      unitCount += 1;
    }
  }

  // Second pass: fill the fixed-length output buffer.
  List<int> units = new List<int>(unitCount);
  int next = 0;
  for (int codepoint in range) {
    if (isSingleUnit(codepoint)) {
      units[next] = codepoint;
      next += 1;
      continue;
    }
    if (needsSurrogatePair(codepoint)) {
      int shifted = codepoint - UNICODE_UTF16_OFFSET;
      int highBits = (shifted & UNICODE_UTF16_HI_MASK) >> 10;
      int lowBits = shifted & UNICODE_UTF16_LO_MASK;
      units[next] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE + highBits;
      units[next + 1] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE + lowBits;
      next += 2;
      continue;
    }
    if (replacementCodepoint == null) {
      throw new ArgumentError("Invalid encoding");
    }
    units[next] = replacementCodepoint;
    next += 1;
  }
  return units;
}
102 | |
/**
 * Decodes a range of UTF-16 code units into a list of codepoints.
 *
 * The range starts at [offset] and covers [length] entries of
 * [utf16CodeUnits] (through the end of the list when [length] is null).
 * Decoding is done by a [Utf16CodeUnitDecoder], which receives
 * [replacementCodepoint] for handling invalid input.
 */
List<int> _utf16CodeUnitsToCodepoints(
    List<int> utf16CodeUnits, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRange range = new _ListRange(utf16CodeUnits, offset, length);
  _ListRangeIterator units = range.iterator;
  Utf16CodeUnitDecoder decoder =
      new Utf16CodeUnitDecoder.fromListRangeIterator(
          units, replacementCodepoint);

  // Size the buffer by the number of code units still to be read; surrogate
  // pairs make the real number of codepoints smaller.
  List<int> decoded = new List<int>(units.remaining);
  int count = 0;
  while (decoder.moveNext()) {
    decoded[count] = decoder.current;
    count++;
  }

  if (count != decoded.length) {
    // Fewer codepoints than code units: copy into an exactly-sized list.
    List<int> exact = new List<int>(count);
    exact.setRange(0, count, decoded);
    return exact;
  }
  return decoded;
}
126 | |
/**
 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.
 * The parameters can override the default Unicode replacement character. Set
 * the replacementCharacter to null to throw an ArgumentError
 * rather than replace the bad value.
 *
 * Each successful [moveNext] produces exactly one codepoint: a code unit
 * outside the surrogate range is passed through, a lead surrogate followed
 * by a trail surrogate is merged into one codepoint, and anything else
 * (negative values, values above [UNICODE_PLANE_ONE_MAX], unpaired
 * surrogates) yields the replacement codepoint or an [ArgumentError].
 */
class Utf16CodeUnitDecoder implements Iterator<int> {
  /** The source of UTF-16 code units being decoded. */
  final _ListRangeIterator utf16CodeUnitIterator;

  /** Value produced for invalid input; null means throw instead. */
  final int replacementCodepoint;

  // The codepoint produced by the latest moveNext(), or null when there is
  // none.
  int _current = null;

  /**
   * Creates a decoder over [length] code units of [utf16CodeUnits] starting
   * at [offset]; a null [length] means "through the end of the list".
   */
  Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
      int this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf16CodeUnitIterator =
          (new _ListRange(utf16CodeUnits, offset, length)).iterator;

  /** Creates a decoder that reads from an existing code unit iterator. */
  Utf16CodeUnitDecoder.fromListRangeIterator(
      _ListRangeIterator this.utf16CodeUnitIterator,
      int this.replacementCodepoint);

  Iterator<int> get iterator => this;

  int get current => _current;

  /**
   * Decodes the next codepoint. Returns false once the code units are
   * exhausted; [current] is null in that case.
   */
  bool moveNext() {
    _current = null;
    if (!utf16CodeUnitIterator.moveNext()) return false;

    int value = utf16CodeUnitIterator.current;
    if (value < 0) {
      _current = _replacementOrThrow();
    } else if (value < UNICODE_UTF16_RESERVED_LO ||
        (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {
      // transfer directly
      _current = value;
    } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
        utf16CodeUnitIterator.moveNext()) {
      // Lead surrogate with at least one more code unit available.
      int nextValue = utf16CodeUnitIterator.current;
      if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
          nextValue <= UNICODE_UTF16_RESERVED_HI) {
        // merge surrogate pair
        value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10;
        value += UNICODE_UTF16_OFFSET +
            (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);
        _current = value;
      } else {
        // The lead surrogate is unpaired. Whatever followed it has not been
        // decoded yet, so step back and let the next moveNext() handle it;
        // otherwise a valid code unit after a lone lead surrogate would be
        // silently dropped. The position reported on error is therefore
        // that of the unpaired lead surrogate.
        utf16CodeUnitIterator.backup();
        _current = _replacementOrThrow();
      }
    } else {
      // Lone trail surrogate, lead surrogate at the end of the input, or a
      // value above UNICODE_PLANE_ONE_MAX.
      _current = _replacementOrThrow();
    }
    return true;
  }

  // Returns the replacement codepoint, or throws an ArgumentError naming the
  // iterator's current position when no replacement is configured.
  int _replacementOrThrow() {
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
    }
    return replacementCodepoint;
  }
}
199 | |
/**
 * _ListRange is an internal type used to create a lightweight Iterable on a
 * range within a source list. DO NOT MODIFY the underlying list while
 * iterating over it. The results of doing so are undefined.
 *
 * The range starts at the given offset and spans the given length; when no
 * length is supplied it runs to the end of the source list. A [RangeError]
 * is thrown when the offset, the length, or their sum falls outside the
 * source list.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class _ListRange extends IterableBase {
  final List _source;
  final int _offset;
  final int _length;

  _ListRange(source, [offset = 0, length]) :
      this._source = source,
      this._offset = offset,
      this._length = (length != null ? length : source.length - offset) {
    // The offset is validated first, so a defaulted length (computed above
    // as source.length - offset) is only examined for an in-bounds offset.
    if (_offset < 0 || _offset > _source.length) {
      throw new RangeError.value(_offset);
    }
    if (_length < 0) {
      throw new RangeError.value(_length);
    }
    if (_length + _offset > _source.length) {
      throw new RangeError.value(_length + _offset);
    }
  }

  /** Returns a fresh iterator positioned before the first item in range. */
  _ListRangeIterator get iterator {
    int end = _offset + _length;
    return new _ListRangeIteratorImpl(_source, _offset, end);
  }

  /** The number of items in the range. */
  int get length {
    return _length;
  }
}
232 | |
/**
 * The _ListRangeIterator provides more capabilities than a standard iterator,
 * including the ability to get the current position, count remaining items,
 * and move forward/backward within the iterator.
 */
abstract class _ListRangeIterator implements Iterator<int> {
  // Standard Iterator members.
  int get current;
  bool moveNext();

  // Introspection: where the iterator is and how much is left.
  int get position;
  int get remaining;

  // Repositioning: step backward or forward by a number of items.
  void backup([by]);
  void skip([count]);
}
246 | |
/**
 * Index-based [_ListRangeIterator] over the half-open range from the given
 * start offset up to, but not including, the end index of a source list.
 */
class _ListRangeIteratorImpl implements _ListRangeIterator {
  final List<int> _source;
  // Index of the current item; starts one before the first item so that the
  // first moveNext() lands on it.
  int _offset;
  final int _end;

  _ListRangeIteratorImpl(this._source, int offset, this._end)
      : _offset = offset - 1;

  // No bounds check is done here: the source list is indexed with whatever
  // the current offset is.
  int get current {
    return _source[_offset];
  }

  bool moveNext() {
    _offset += 1;
    return _offset < _end;
  }

  int get position {
    return _offset;
  }

  void backup([int by = 1]) {
    _offset = _offset - by;
  }

  // Items that lie after the current one and before the end of the range.
  int get remaining {
    return _end - (_offset + 1);
  }

  void skip([int count = 1]) {
    _offset = _offset + count;
  }
}
271 | |
OLD | NEW |