OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 const int _UTF8_ONE_BYTE_MAX = 0x7f; | 5 const int _UTF8_ONE_BYTE_MAX = 0x7f; |
6 const int _UTF8_TWO_BYTE_MAX = 0x7ff; | 6 const int _UTF8_TWO_BYTE_MAX = 0x7ff; |
7 const int _UTF8_THREE_BYTE_MAX = 0xffff; | 7 const int _UTF8_THREE_BYTE_MAX = 0xffff; |
8 | 8 |
9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; | 9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; |
10 | 10 |
11 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; | 11 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; |
12 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; | 12 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; |
13 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; | 13 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; |
14 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; | 14 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; |
15 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; | 15 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; |
16 | 16 |
17 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f; | 17 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f; |
18 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf; | 18 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf; |
19 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7; | 19 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7; |
20 | 20 |
21 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; | 21 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; |
22 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80; | 22 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80; |
23 | 23 |
24 /** | 24 /** |
25 * Decodes the UTF-8 bytes as an iterable, so the consumer converts only | 25 * Decodes the UTF-8 bytes as an iterable, so the consumer converts only |
26 * as much of the input as needed. Set replacementCodepoint to null to | 26 * as much of the input as needed. Set replacementCodepoint to null to |
27 * throw an IllegalArgumentException rather than replace the bad value. | 27 * throw an ArgumentError rather than replace the bad value. |
28 */ | 28 */ |
29 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0, | 29 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0, |
30 int length, | 30 int length, |
31 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 31 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
32 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint); | 32 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint); |
33 } | 33 } |
34 | 34 |
35 /** | 35 /** |
36 * Produce a String from a List of UTF-8 encoded bytes. The parameters | 36 * Produce a String from a List of UTF-8 encoded bytes. The parameters |
37 * can set an offset into a list of bytes (as int), limit the length of the | 37 * can set an offset into a list of bytes (as int), limit the length of the |
38 * values to be decoded, and override the default Unicode replacement character. | 38 * values to be decoded, and override the default Unicode replacement character. |
39 * Set replacementCodepoint to null to throw an IllegalArgumentException | 39 * Set replacementCodepoint to null to throw an ArgumentError |
40 * rather than replace the bad value. | 40 * rather than replace the bad value. |
41 */ | 41 */ |
42 String decodeUtf8(List<int> bytes, [int offset = 0, int length, | 42 String decodeUtf8(List<int> bytes, [int offset = 0, int length, |
43 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 43 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
44 return codepointsToString( | 44 return codepointsToString( |
45 (new Utf8Decoder(bytes, offset, length, replacementCodepoint)) | 45 (new Utf8Decoder(bytes, offset, length, replacementCodepoint)) |
46 .decodeRest()); | 46 .decodeRest()); |
47 } | 47 } |
48 | 48 |
49 /** | 49 /** |
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
139 | 139 |
140 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length, | 140 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length, |
141 replacementCodepoint); | 141 replacementCodepoint); |
142 } | 142 } |
143 | 143 |
144 /** | 144 /** |
145 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The | 145 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The |
146 * parameters can set an offset into a list of bytes (as int), limit the length | 146 * parameters can set an offset into a list of bytes (as int), limit the length |
147 * of the values to be decoded, and override the default Unicode replacement | 147 * of the values to be decoded, and override the default Unicode replacement |
148 * character. Set replacementCodepoint to null to throw an | 148 * character. Set replacementCodepoint to null to throw an |
149 * IllegalArgumentException rather than replace the bad value. The return value | 149 * ArgumentError rather than replace the bad value. The return value |
150 * from this method can be used as an Iterable (e.g. in a for-loop). | 150 * from this method can be used as an Iterable (e.g. in a for-loop). |
151 */ | 151 */ |
152 class Utf8Decoder implements Iterator<int> { | 152 class Utf8Decoder implements Iterator<int> { |
153 final _ListRangeIterator utf8EncodedBytesIterator; | 153 final _ListRangeIterator utf8EncodedBytesIterator; |
154 final int replacementCodepoint; | 154 final int replacementCodepoint; |
155 | 155 |
156 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, | 156 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, |
157 int this.replacementCodepoint = | 157 int this.replacementCodepoint = |
158 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 158 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
159 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset, | 159 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset, |
(...skipping 26 matching lines...) Expand all Loading... |
186 bool hasNext() => utf8EncodedBytesIterator.hasNext(); | 186 bool hasNext() => utf8EncodedBytesIterator.hasNext(); |
187 | 187 |
188 int next() { | 188 int next() { |
189 int value = utf8EncodedBytesIterator.next(); | 189 int value = utf8EncodedBytesIterator.next(); |
190 int additionalBytes = 0; | 190 int additionalBytes = 0; |
191 | 191 |
192 if (value < 0) { | 192 if (value < 0) { |
193 if (replacementCodepoint != null) { | 193 if (replacementCodepoint != null) { |
194 return replacementCodepoint; | 194 return replacementCodepoint; |
195 } else { | 195 } else { |
196 throw new IllegalArgumentException( | 196 throw new ArgumentError( |
197 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 197 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
198 } | 198 } |
199 } else if (value <= _UTF8_ONE_BYTE_MAX) { | 199 } else if (value <= _UTF8_ONE_BYTE_MAX) { |
200 return value; | 200 return value; |
201 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 201 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
202 if (replacementCodepoint != null) { | 202 if (replacementCodepoint != null) { |
203 return replacementCodepoint; | 203 return replacementCodepoint; |
204 } else { | 204 } else { |
205 throw new IllegalArgumentException( | 205 throw new ArgumentError( |
206 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 206 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
207 } | 207 } |
208 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { | 208 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { |
209 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; | 209 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; |
210 additionalBytes = 1; | 210 additionalBytes = 1; |
211 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { | 211 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { |
212 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; | 212 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; |
213 additionalBytes = 2; | 213 additionalBytes = 2; |
214 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { | 214 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { |
215 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; | 215 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; |
216 additionalBytes = 3; | 216 additionalBytes = 3; |
217 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { | 217 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { |
218 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; | 218 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; |
219 additionalBytes = 4; | 219 additionalBytes = 4; |
220 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { | 220 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { |
221 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; | 221 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; |
222 additionalBytes = 5; | 222 additionalBytes = 5; |
223 } else if (replacementCodepoint != null) { | 223 } else if (replacementCodepoint != null) { |
224 return replacementCodepoint; | 224 return replacementCodepoint; |
225 } else { | 225 } else { |
226 throw new IllegalArgumentException( | 226 throw new ArgumentError( |
227 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 227 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
228 } | 228 } |
229 int j = 0; | 229 int j = 0; |
230 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext()) { | 230 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext()) { |
231 int nextValue = utf8EncodedBytesIterator.next(); | 231 int nextValue = utf8EncodedBytesIterator.next(); |
232 if (nextValue > _UTF8_ONE_BYTE_MAX && | 232 if (nextValue > _UTF8_ONE_BYTE_MAX && |
233 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 233 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
234 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); | 234 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); |
235 } else { | 235 } else { |
236 // if sequence-starting code unit, reposition cursor to start here | 236 // if sequence-starting code unit, reposition cursor to start here |
(...skipping 10 matching lines...) Expand all Loading... |
247 bool nonOverlong = | 247 bool nonOverlong = |
248 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || | 248 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || |
249 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || | 249 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || |
250 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); | 250 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); |
251 bool inRange = value <= UNICODE_VALID_RANGE_MAX; | 251 bool inRange = value <= UNICODE_VALID_RANGE_MAX; |
252 if (validSequence && nonOverlong && inRange) { | 252 if (validSequence && nonOverlong && inRange) { |
253 return value; | 253 return value; |
254 } else if (replacementCodepoint != null) { | 254 } else if (replacementCodepoint != null) { |
255 return replacementCodepoint; | 255 return replacementCodepoint; |
256 } else { | 256 } else { |
257 throw new IllegalArgumentException( | 257 throw new ArgumentError( |
258 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); | 258 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); |
259 } | 259 } |
260 } | 260 } |
261 } | 261 } |
OLD | NEW |