Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(753)

Side by Side Diff: pkg/utf/lib/utf8.dart

Issue 418433003: pkg/utf: fixed layout, added todos, updated docs and homepage pubspec links (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « pkg/utf/lib/utf32.dart ('k') | pkg/utf/lib/utf_stream.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 part of utf;
6
// Largest code point that is encoded with one, two and three bytes
// respectively.
const int _UTF8_ONE_BYTE_MAX = 0x7F;
const int _UTF8_TWO_BYTE_MAX = 0x7FF;
const int _UTF8_THREE_BYTE_MAX = 0xFFFF;

// Selects the six payload bits stored in every subsequent byte.
const int _UTF8_LO_SIX_BIT_MASK = 0x3F;

// Marker bits of the first byte of a two- to six-byte sequence. The decoder
// also uses them as exclusive upper bounds when classifying a first byte.
const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xC0;
const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xE0;
const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xF0;
const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xF8;
const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xFC;

// Payload bits available in the first byte of a two-, three- and four-byte
// sequence.
const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1F;
const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xF;
const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7;

// First-byte values at or above this bound never start a sequence.
const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xFE;
// Marker bits OR-ed into every subsequent byte by the encoder.
const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;
25
/**
 * Wraps the UTF-8 [bytes] in an [IterableUtf8Decoder], so the consumer
 * decodes only as much of the input as it actually iterates over.
 *
 * [offset] and [length] select the part of [bytes] to decode. Pass `null`
 * as [replacementCodepoint] to have an ArgumentError thrown for malformed
 * input instead of substituting the replacement code point.
 */
IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);
36
/**
 * Builds a String from a List of UTF-8 encoded [bytes].
 *
 * [offset] and [length] select the part of [bytes] to decode. Malformed
 * input is replaced by [replacementCodepoint]; pass `null` for it to have
 * an ArgumentError thrown instead.
 */
String decodeUtf8(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder =
      new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
50
/**
 * Returns the UTF-8 encoded bytes of [str].
 *
 * The string is first converted to code points, which are then encoded.
 */
List<int> encodeUtf8(String str) {
  List<int> codepoints = stringToCodepoints(str);
  return codepointsToUtf8(codepoints);
}
56
/**
 * Writes the [bytes] subsequent bytes of a multi-byte sequence into
 * [buffer] at positions `offset + 1` through `offset + bytes`, filling them
 * from the last position backwards with successive six-bit groups of
 * [value].
 *
 * Returns what is left of [value] after those groups have been shifted out;
 * the caller stores it in the first byte at [offset].
 */
int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {
  for (int index = offset + bytes; index > offset; index--) {
    int payload = value & _UTF8_LO_SIX_BIT_MASK;
    buffer[index] = _UTF8_SUBSEQUENT_BYTE_BASE | payload;
    value >>= 6;
  }
  return value;
}
66
/**
 * Encode code points as UTF-8 code units.
 *
 * [offset] and [length] select the part of [codepoints] to encode (they are
 * handed to `_ListRange` unchanged). Values that cannot be represented in
 * UTF-8 are encoded as U+FFFD (bytes `EF BF BD`): negative values, values
 * above [UNICODE_VALID_RANGE_MAX], and values in the reserved UTF-16 range
 * [UNICODE_UTF16_RESERVED_LO]..[UNICODE_UTF16_RESERVED_HI]. The last group
 * is the same range the decoder in this file refuses to accept, so encoding
 * it verbatim would produce bytes that do not round-trip.
 */
List<int> codepointsToUtf8(
    List<int> codepoints, [int offset = 0, int length]) {
  _ListRange source = new _ListRange(codepoints, offset, length);

  // True for values that have no well-formed UTF-8 encoding.
  bool isUnencodable(int codepoint) =>
      codepoint < 0 ||
      codepoint > UNICODE_VALID_RANGE_MAX ||
      (codepoint >= UNICODE_UTF16_RESERVED_LO &&
          codepoint <= UNICODE_UTF16_RESERVED_HI);

  // Number of bytes [codepoint] occupies in the output. Unencodable values
  // take three bytes because that is the size of the encoded U+FFFD.
  int sequenceLength(int codepoint) {
    if (isUnencodable(codepoint)) return 3;
    if (codepoint <= _UTF8_ONE_BYTE_MAX) return 1;
    if (codepoint <= _UTF8_TWO_BYTE_MAX) return 2;
    if (codepoint <= _UTF8_THREE_BYTE_MAX) return 3;
    return 4;
  }

  // First pass: size the output so it can be allocated once.
  int encodedLength = 0;
  for (int codepoint in source) {
    encodedLength += sequenceLength(codepoint);
  }

  // Second pass: write the bytes.
  List<int> encoded = new List<int>(encodedLength);
  int insertAt = 0;
  for (int codepoint in source) {
    int width = sequenceLength(codepoint);
    if (isUnencodable(codepoint)) {
      encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]);
    } else if (width == 1) {
      encoded[insertAt] = codepoint;
    } else {
      int leadBase;
      int leadMask;
      if (width == 2) {
        leadBase = _UTF8_FIRST_BYTE_OF_TWO_BASE;
        leadMask = _UTF8_FIRST_BYTE_OF_TWO_MASK;
      } else if (width == 3) {
        leadBase = _UTF8_FIRST_BYTE_OF_THREE_BASE;
        leadMask = _UTF8_FIRST_BYTE_OF_THREE_MASK;
      } else {
        leadBase = _UTF8_FIRST_BYTE_OF_FOUR_BASE;
        leadMask = _UTF8_FIRST_BYTE_OF_FOUR_MASK;
      }
      // _addToEncoding fills the subsequent bytes and returns the bits that
      // belong in the first byte.
      int leadBits = _addToEncoding(insertAt, width - 1, codepoint, encoded);
      encoded[insertAt] = leadBase | (leadMask & leadBits);
    }
    insertAt += width;
  }
  return encoded;
}
117
// UTF-8 fixes the byte order, so unlike the UTF-16 and UTF-32 functions this
// one needs no byte-order handling.
/**
 * Decodes [utf8EncodedBytes] into a list of code points.
 *
 * [offset] and [length] select the part of the input to decode. Malformed
 * input is replaced by [replacementCodepoint]; pass `null` for it to have
 * an ArgumentError thrown instead.
 */
List<int> utf8ToCodepoints(
    List<int> utf8EncodedBytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder = new Utf8Decoder(
      utf8EncodedBytes, offset, length, replacementCodepoint);
  return decoder.decodeRest();
}
126
/**
 * Return type of [decodeUtf8AsIterable] and variants.
 *
 * Every access to [iterator] creates a fresh [Utf8Decoder] over the same
 * bytes, so decoding happens only as the iterator is advanced and results
 * are not cached between iterations.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf8Decoder extends IterableBase<int> {
  /** The UTF-8 encoded input. */
  final List<int> bytes;

  /** Passed to each [Utf8Decoder] as the start of the range to decode. */
  final int offset;

  /** Passed to each [Utf8Decoder] as the length of the range to decode. */
  final int length;

  /** Passed to each [Utf8Decoder] as its replacement code point. */
  final int replacementCodepoint;

  IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  /** A new decoder positioned at the start of the configured range. */
  Utf8Decoder get iterator {
    return new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  }
}
146
/**
 * An iterator of Unicode code points decoded from UTF-8 encoded bytes.
 *
 * The constructor arguments select the part of the byte list to decode and
 * the code point that stands in for malformed input. When
 * [replacementCodepoint] is `null`, [moveNext] throws an ArgumentError on
 * malformed input instead of substituting.
 */
class Utf8Decoder implements Iterator<int> {
  /** Cursor over the bytes still to be decoded. */
  final _ListRangeIterator utf8EncodedBytesIterator;

  /** Substituted for malformed input; `null` means throw instead. */
  final int replacementCodepoint;

  int _current = null;

  Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator =
          (new _ListRange(utf8EncodedBytes, offset, length)).iterator;

  Utf8Decoder._fromListRangeIterator(_ListRange source, [
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator = source.iterator;

  /**
   * Decodes the remainder of the characters in this decoder
   * into a [List<int>].
   */
  List<int> decodeRest() {
    // There is at most one code point per remaining byte, so this buffer is
    // always large enough; it is trimmed afterwards if it was too large.
    List<int> buffer = new List<int>(utf8EncodedBytesIterator.remaining);
    int count = 0;
    while (moveNext()) {
      buffer[count] = current;
      count++;
    }
    if (count != buffer.length) {
      List<int> trimmed = new List<int>(count);
      trimmed.setRange(0, count, buffer);
      return trimmed;
    }
    return buffer;
  }

  /** The code point produced by the last successful [moveNext]. */
  int get current => _current;

  /**
   * Decodes the next code point and makes it available through [current].
   *
   * Returns `false` once the input is exhausted. Malformed input yields
   * [replacementCodepoint], or an ArgumentError when that is `null`.
   */
  bool moveNext() {
    _current = null;

    if (!utf8EncodedBytesIterator.moveNext()) return false;

    int value = utf8EncodedBytesIterator.current;

    // Classify the first byte. -1 marks a byte that cannot start a sequence;
    // otherwise this is the number of subsequent bytes that must follow.
    int additionalBytes;
    if (value < 0) {
      additionalBytes = -1;
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      _current = value;
      return true;
    } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      additionalBytes = -1;
    } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
      additionalBytes = 1;
    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
      additionalBytes = 2;
    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      additionalBytes = 3;
    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      additionalBytes = 4;
    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
      value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
      additionalBytes = 5;
    } else {
      additionalBytes = -1;
    }

    // Subsequent bytes folded into [value] so far; also used to point the
    // error message back at the start of the sequence.
    int consumed = 0;
    bool accepted = false;

    if (additionalBytes > 0) {
      while (consumed < additionalBytes &&
          utf8EncodedBytesIterator.moveNext()) {
        int nextValue = utf8EncodedBytesIterator.current;
        bool isSubsequentByte = nextValue > _UTF8_ONE_BYTE_MAX &&
            nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE;
        if (!isSubsequentByte) {
          // A byte that starts a new sequence is handed back so that the
          // next call begins decoding at it.
          if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {
            utf8EncodedBytesIterator.backup();
          }
          break;
        }
        value = (value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK);
        consumed++;
      }

      bool complete = consumed == additionalBytes;
      bool reserved = value >= UNICODE_UTF16_RESERVED_LO &&
          value <= UNICODE_UTF16_RESERVED_HI;
      // Only two- to four-byte sequences whose value needs that many bytes
      // are accepted; five- and six-byte sequences never pass this check.
      bool nonOverlong =
          (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
          (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
          (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
      bool inRange = value <= UNICODE_VALID_RANGE_MAX;
      accepted = complete && !reserved && nonOverlong && inRange;
    }

    if (accepted) {
      _current = value;
      return true;
    }
    if (replacementCodepoint != null) {
      _current = replacementCodepoint;
      return true;
    }
    throw new ArgumentError(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position - consumed}");
  }
}
OLDNEW
« no previous file with comments | « pkg/utf/lib/utf32.dart ('k') | pkg/utf/lib/utf_stream.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698