Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(271)

Side by Side Diff: pkg/utf/lib/utf.dart

Issue 68563004: Move unicode tests to utf package. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Simplify test. Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD | NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 /** 5 /**
6 * Support for encoding and decoding Unicode characters in UTF-8, UTF-16, and 6 * Support for encoding and decoding Unicode characters in UTF-8, UTF-16, and
7 * UTF-32. 7 * UTF-32.
8 */ 8 */
9 library utf; 9 library utf;
10 10
11 import "dart:async"; 11 import "dart:async";
12 import "dart:collection"; 12 import "dart:collection";
13 13
14 part "constants.dart";
15 part "list_range.dart";
14 part "utf_stream.dart"; 16 part "utf_stream.dart";
15 part "utf8.dart"; 17 part "utf8.dart";
16 part "utf16.dart"; 18 part "utf16.dart";
17 part "utf32.dart"; 19 part "utf32.dart";
18
// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
/**
 * Returns the Unicode codepoints that make up [str].
 *
 * [String.codeUnits] yields 16-bit UTF-16 code units on all Dart
 * implementations, so the units are decoded into codepoints before being
 * returned.
 */
List<int> stringToCodepoints(String str) =>
    _utf16CodeUnitsToCodepoints(str.codeUnits);
28
/**
 * Builds a string from the given Unicode [codepoints].
 *
 * *Deprecated* Use [String.fromCharCodes] instead; this function only
 * forwards to it.
 */
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);
37
/**
 * Invalid codepoints or encodings may be substituted with the value U+fffd.
 */
const int UNICODE_REPLACEMENT_CHARACTER_CODEPOINT = 0xfffd;

/** The byte order mark, U+feff. */
const int UNICODE_BOM = 0xfeff;

/** The low-order byte (0xff) of [UNICODE_BOM]. */
const int UNICODE_UTF_BOM_LO = 0xff;

/** The high-order byte (0xfe) of [UNICODE_BOM]. */
const int UNICODE_UTF_BOM_HI = 0xfe;

/** Mask selecting bits 0-7 (the lowest byte) of a value. */
const int UNICODE_BYTE_ZERO_MASK = 0xff;

/** Mask selecting bits 8-15 (the second-lowest byte) of a value. */
const int UNICODE_BYTE_ONE_MASK = 0xff00;

/** The largest valid Unicode codepoint, U+10ffff. */
const int UNICODE_VALID_RANGE_MAX = 0x10ffff;

/**
 * The largest codepoint that fits in a single 16-bit code unit, U+ffff.
 * Larger codepoints are encoded as a surrogate pair in UTF-16.
 */
const int UNICODE_PLANE_ONE_MAX = 0xffff;

/** First value of the range reserved for UTF-16 surrogates (inclusive). */
const int UNICODE_UTF16_RESERVED_LO = 0xd800;

/** Last value of the range reserved for UTF-16 surrogates (inclusive). */
const int UNICODE_UTF16_RESERVED_HI = 0xdfff;

/**
 * Amount subtracted from a codepoint above [UNICODE_PLANE_ONE_MAX] before it
 * is split into a surrogate pair, and added back when a pair is merged.
 */
const int UNICODE_UTF16_OFFSET = 0x10000;

/** Base value of the first (lead) code unit of a surrogate pair. */
const int UNICODE_UTF16_SURROGATE_UNIT_0_BASE = 0xd800;

/** Base value of the second (trail) code unit of a surrogate pair. */
const int UNICODE_UTF16_SURROGATE_UNIT_1_BASE = 0xdc00;

/** Mask selecting bits 10-19, the part carried by the lead surrogate. */
const int UNICODE_UTF16_HI_MASK = 0xffc00;

/** Mask selecting bits 0-9, the part carried by the trail surrogate. */
const int UNICODE_UTF16_LO_MASK = 0x3ff;
57
/**
 * Encodes Unicode codepoints as UTF-16 code units.
 *
 * Only the [length] items of [codepoints] starting at [offset] are encoded;
 * a null [length] means "through the end of the list". Values that are
 * negative, fall inside the surrogate range, or exceed
 * [UNICODE_VALID_RANGE_MAX] are written as [replacementCodepoint]. If
 * [replacementCodepoint] is null such a value raises an [ArgumentError]
 * instead.
 */
List<int> _codepointsToUtf16CodeUnits(
    List<int> codepoints,
    [int offset = 0,
     int length,
     int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // A codepoint that is representable as one code unit: at most
  // UNICODE_PLANE_ONE_MAX and outside the reserved surrogate range.
  bool fitsInOneUnit(int cp) =>
      (cp >= 0 && cp < UNICODE_UTF16_RESERVED_LO) ||
      (cp > UNICODE_UTF16_RESERVED_HI && cp <= UNICODE_PLANE_ONE_MAX);

  // A valid codepoint that has to be split into a surrogate pair.
  bool needsSurrogatePair(int cp) =>
      cp > UNICODE_PLANE_ONE_MAX && cp <= UNICODE_VALID_RANGE_MAX;

  _ListRange range = new _ListRange(codepoints, offset, length);

  // Pass 1: measure the output. Everything except a surrogate pair, including
  // an invalid value that will be replaced, occupies exactly one code unit.
  int unitCount = 0;
  for (int cp in range) {
    bool isPair = !fitsInOneUnit(cp) && needsSurrogatePair(cp);
    unitCount += isPair ? 2 : 1;
  }

  // Pass 2: fill the exactly-sized buffer.
  List<int> units = new List<int>(unitCount);
  int out = 0;
  for (int cp in range) {
    if (fitsInOneUnit(cp)) {
      units[out++] = cp;
      continue;
    }
    if (needsSurrogatePair(cp)) {
      int reduced = cp - UNICODE_UTF16_OFFSET;
      units[out++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE +
          ((reduced & UNICODE_UTF16_HI_MASK) >> 10);
      units[out++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE +
          (reduced & UNICODE_UTF16_LO_MASK);
      continue;
    }
    if (replacementCodepoint == null) {
      throw new ArgumentError("Invalid encoding");
    }
    units[out++] = replacementCodepoint;
  }
  return units;
}
102
/**
 * Decodes UTF-16 code units into Unicode codepoints.
 *
 * Only the [length] items of [utf16CodeUnits] starting at [offset] are
 * decoded; a null [length] means "through the end of the list". Invalid
 * input is handled by [Utf16CodeUnitDecoder] according to
 * [replacementCodepoint].
 */
List<int> _utf16CodeUnitsToCodepoints(
    List<int> utf16CodeUnits, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRangeIterator units =
      new _ListRange(utf16CodeUnits, offset, length).iterator;
  Utf16CodeUnitDecoder decoder =
      new Utf16CodeUnitDecoder.fromListRangeIterator(
          units, replacementCodepoint);

  // Every codepoint consumes at least one code unit, so the number of
  // remaining units is an upper bound on the number of codepoints.
  List<int> decoded = new List<int>(units.remaining);
  int count = 0;
  while (decoder.moveNext()) {
    decoded[count] = decoder.current;
    count++;
  }

  if (count == decoded.length) return decoded;

  // Surrogate pairs shrank the output; copy into an exactly-sized list.
  return new List<int>(count)..setRange(0, count, decoded);
}
126
/**
 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.
 *
 * Ill-formed input (a negative value, a value above [UNICODE_PLANE_ONE_MAX],
 * or an unpaired surrogate) is reported as [replacementCodepoint], which
 * defaults to the Unicode replacement character. Set [replacementCodepoint]
 * to null to throw an [ArgumentError] rather than replace the bad value.
 */
class Utf16CodeUnitDecoder implements Iterator<int> {
  /** The source of UTF-16 code units being decoded. */
  final _ListRangeIterator utf16CodeUnitIterator;

  /** Substituted for ill-formed input; null means "throw instead". */
  final int replacementCodepoint;

  int _current;

  /**
   * Decodes the [length] code units of [utf16CodeUnits] starting at
   * [offset]; a null [length] means "through the end of the list".
   */
  Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
      int this.replacementCodepoint =
          UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf16CodeUnitIterator =
          (new _ListRange(utf16CodeUnits, offset, length)).iterator;

  /** Decodes the code units produced by an existing [_ListRangeIterator]. */
  Utf16CodeUnitDecoder.fromListRangeIterator(
      _ListRangeIterator this.utf16CodeUnitIterator,
      int this.replacementCodepoint);

  Iterator<int> get iterator => this;

  /**
   * The most recently decoded codepoint, or null before the first call to
   * [moveNext] and after it has returned false.
   */
  int get current => _current;

  /**
   * Decodes the next codepoint into [current].
   *
   * Returns false once the underlying code units are exhausted. Throws an
   * [ArgumentError] on ill-formed input when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    if (!utf16CodeUnitIterator.moveNext()) return false;

    int value = utf16CodeUnitIterator.current;

    // A unit outside the surrogate range is a codepoint by itself.
    if (value >= 0 &&
        (value < UNICODE_UTF16_RESERVED_LO ||
         (value > UNICODE_UTF16_RESERVED_HI &&
          value <= UNICODE_PLANE_ONE_MAX))) {
      _current = value;
      return true;
    }

    bool isLeadSurrogate = value >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE &&
        value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE;
    if (isLeadSurrogate && utf16CodeUnitIterator.moveNext()) {
      int nextValue = utf16CodeUnitIterator.current;
      if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
          nextValue <= UNICODE_UTF16_RESERVED_HI) {
        // Merge the surrogate pair.
        _current = UNICODE_UTF16_OFFSET +
            ((value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10) +
            (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);
        return true;
      }
      // The lead surrogate is unpaired. Un-read the unit that followed it so
      // that it is decoded on the next call; only the lone surrogate is
      // replaced, and the following unit is never silently dropped.
      utf16CodeUnitIterator.backup();
    }

    // Negative value, value above 0xffff, lone trail surrogate, or a lead
    // surrogate without a matching trail.
    _current = _replacementOrThrow();
    return true;
  }

  /**
   * Returns [replacementCodepoint], or throws an [ArgumentError] naming the
   * iterator's current position when no replacement is configured.
   */
  int _replacementOrThrow() {
    if (replacementCodepoint != null) return replacementCodepoint;
    throw new ArgumentError(
        "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
  }
}
199
/**
 * _ListRange is an internal type used to create a lightweight Iterable on a
 * range within a source list. DO NOT MODIFY the underlying list while
 * iterating over it. The results of doing so are undefined.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class _ListRange extends IterableBase {
  final List _source;
  final int _offset;
  final int _length;

  /**
   * Creates a view of [length] items of [source] starting at [offset]. A
   * null [length] selects everything from [offset] to the end of [source].
   *
   * Throws a [RangeError] if [offset] is negative or beyond the end of
   * [source], if the length is negative, or if the range extends past the
   * end of [source].
   */
  _ListRange(source, [offset = 0, length])
      : this._source = source,
        this._offset = offset,
        this._length = (length != null ? length : source.length - offset) {
    if (_offset < 0 || _offset > _source.length) {
      throw new RangeError.value(_offset);
    }
    if (_length < 0) {
      throw new RangeError.value(_length);
    }
    int end = _length + _offset;
    if (end > _source.length) {
      throw new RangeError.value(end);
    }
  }

  /** A fresh iterator positioned before the first item of the range. */
  _ListRangeIterator get iterator {
    return new _ListRangeIteratorImpl(_source, _offset, _offset + _length);
  }

  /** The number of items in the range. */
  int get length {
    return _length;
  }
}
232
/**
 * An iterator over a list range that offers more than a standard iterator:
 * it can report its current position, count the items that remain, and move
 * forward or backward within the range.
 */
abstract class _ListRangeIterator implements Iterator<int> {
  // Standard Iterator protocol.

  /** Advances to the next item; returns false when there is none. */
  bool moveNext();

  /** The item at the current position. */
  int get current;

  // Position queries.

  /** The current position of the iterator. */
  int get position;

  /** The number of items that have not been visited yet. */
  int get remaining;

  // Repositioning.

  /** Moves the iterator forward by [count] items. */
  void skip([count]);

  /** Moves the iterator backward by [by] items. */
  void backup([by]);
}
246
/**
 * A [_ListRangeIterator] that walks the indices of a source list from a
 * start offset up to, but not including, an end index.
 */
class _ListRangeIteratorImpl implements _ListRangeIterator {
  final List<int> _source;

  // Index of the current item. Starts one before the first item so that the
  // first call to moveNext() lands on it.
  int _offset;

  // Index one past the last item of the range.
  final int _end;

  _ListRangeIteratorImpl(this._source, int offset, this._end)
      : _offset = offset - 1;

  /** The item of the source list at the current index. */
  int get current {
    return _source[_offset];
  }

  /** Steps to the next index; false once the end of the range is reached. */
  bool moveNext() {
    _offset++;
    return _offset < _end;
  }

  /** The current index into the source list. */
  int get position {
    return _offset;
  }

  /** Steps back [by] items (one by default). */
  void backup([int by = 1]) {
    _offset = _offset - by;
  }

  /** The number of items after the current one. */
  int get remaining {
    return _end - _offset - 1;
  }

  /** Steps forward [count] items (one by default). */
  void skip([int count = 1]) {
    _offset = _offset + count;
  }
}
271
OLD | NEW
« no previous file with comments | « pkg/utf/lib/list_range.dart ('k') | pkg/utf/lib/utf16.dart » ('j') | pkg/utf/lib/utf16.dart » ('J')

Powered by Google App Engine
This is Rietveld 408576698