| OLD | NEW |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 const int _UTF8_ONE_BYTE_MAX = 0x7f; | 5 const int _UTF8_ONE_BYTE_MAX = 0x7f; |
| 6 const int _UTF8_TWO_BYTE_MAX = 0x7ff; | 6 const int _UTF8_TWO_BYTE_MAX = 0x7ff; |
| 7 const int _UTF8_THREE_BYTE_MAX = 0xffff; | 7 const int _UTF8_THREE_BYTE_MAX = 0xffff; |
| 8 | 8 |
| 9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; | 9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; |
| 10 | 10 |
| 11 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; | 11 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; |
| 12 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; | 12 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; |
| 13 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; | 13 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; |
| 14 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; | 14 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; |
| 15 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; | 15 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; |
| 16 | 16 |
| 17 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f; | 17 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f; |
| 18 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf; | 18 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf; |
| 19 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7; | 19 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7; |
| 20 | 20 |
| 21 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; | 21 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; |
| 22 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80; | 22 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80; |
| 23 | 23 |
| 24 /** | 24 /** |
| 25 * Decodes the UTF-8 bytes as an iterable. Thus, the consumer can convert only | 25 * Decodes the UTF-8 bytes as an iterable. Thus, the consumer can convert only |
| 26 * as much of the input as needed. Set the replacementCodepoint to null to | 26 * as much of the input as needed. Set the replacementCodepoint to null to |
| 27 * throw an IllegalArgumentException rather than replace the bad value. | 27 * throw an ArgumentError rather than replace the bad value. |
| 28 */ | 28 */ |
| 29 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0, | 29 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0, |
| 30 int length, | 30 int length, |
| 31 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 31 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| 32 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint); | 32 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint); |
| 33 } | 33 } |
| 34 | 34 |
| 35 /** | 35 /** |
| 36 * Produce a String from a List of UTF-8 encoded bytes. The parameters | 36 * Produce a String from a List of UTF-8 encoded bytes. The parameters |
| 37 * can set an offset into a list of bytes (as int), limit the length of the | 37 * can set an offset into a list of bytes (as int), limit the length of the |
| 38 * values to be decoded, and override the default Unicode replacement character. | 38 * values to be decoded, and override the default Unicode replacement character. |
| 39 * Set the replacementCodepoint to null to throw an IllegalArgumentException | 39 * Set the replacementCodepoint to null to throw an ArgumentError |
| 40 * rather than replace the bad value. | 40 * rather than replace the bad value. |
| 41 */ | 41 */ |
| 42 String decodeUtf8(List<int> bytes, [int offset = 0, int length, | 42 String decodeUtf8(List<int> bytes, [int offset = 0, int length, |
| 43 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 43 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| 44 return codepointsToString( | 44 return codepointsToString( |
| 45 (new Utf8Decoder(bytes, offset, length, replacementCodepoint)) | 45 (new Utf8Decoder(bytes, offset, length, replacementCodepoint)) |
| 46 .decodeRest()); | 46 .decodeRest()); |
| 47 } | 47 } |
| 48 | 48 |
| 49 /** | 49 /** |
| (...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 139 | 139 |
| 140 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length, | 140 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length, |
| 141 replacementCodepoint); | 141 replacementCodepoint); |
| 142 } | 142 } |
| 143 | 143 |
| 144 /** | 144 /** |
| 145 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The | 145 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The |
| 146 * parameters can set an offset into a list of bytes (as int), limit the length | 146 * parameters can set an offset into a list of bytes (as int), limit the length |
| 147 * of the values to be decoded, and override the default Unicode replacement | 147 * of the values to be decoded, and override the default Unicode replacement |
| 148 * character. Set the replacementCodepoint to null to throw an | 148 * character. Set the replacementCodepoint to null to throw an |
| 149 * IllegalArgumentException rather than replace the bad value. The return value | 149 * ArgumentError rather than replace the bad value. The return value |
| 150 * from this method can be used as an Iterable (e.g. in a for-loop). | 150 * from this method can be used as an Iterable (e.g. in a for-loop). |
| 151 */ | 151 */ |
| 152 class Utf8Decoder implements Iterator<int> { | 152 class Utf8Decoder implements Iterator<int> { |
| 153 final _ListRangeIterator utf8EncodedBytesIterator; | 153 final _ListRangeIterator utf8EncodedBytesIterator; |
| 154 final int replacementCodepoint; | 154 final int replacementCodepoint; |
| 155 | 155 |
| 156 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, | 156 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, |
| 157 int this.replacementCodepoint = | 157 int this.replacementCodepoint = |
| 158 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 158 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
| 159 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset, | 159 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset, |
| (...skipping 26 matching lines...) Expand all Loading... |
| 186 bool hasNext() => utf8EncodedBytesIterator.hasNext(); | 186 bool hasNext() => utf8EncodedBytesIterator.hasNext(); |
| 187 | 187 |
| 188 int next() { | 188 int next() { |
| 189 int value = utf8EncodedBytesIterator.next(); | 189 int value = utf8EncodedBytesIterator.next(); |
| 190 int additionalBytes = 0; | 190 int additionalBytes = 0; |
| 191 | 191 |
| 192 if (value < 0) { | 192 if (value < 0) { |
| 193 if (replacementCodepoint != null) { | 193 if (replacementCodepoint != null) { |
| 194 return replacementCodepoint; | 194 return replacementCodepoint; |
| 195 } else { | 195 } else { |
| 196 throw new IllegalArgumentException( | 196 throw new ArgumentError( |
| 197 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 197 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
| 198 } | 198 } |
| 199 } else if (value <= _UTF8_ONE_BYTE_MAX) { | 199 } else if (value <= _UTF8_ONE_BYTE_MAX) { |
| 200 return value; | 200 return value; |
| 201 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 201 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
| 202 if (replacementCodepoint != null) { | 202 if (replacementCodepoint != null) { |
| 203 return replacementCodepoint; | 203 return replacementCodepoint; |
| 204 } else { | 204 } else { |
| 205 throw new IllegalArgumentException( | 205 throw new ArgumentError( |
| 206 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 206 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
| 207 } | 207 } |
| 208 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { | 208 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { |
| 209 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; | 209 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; |
| 210 additionalBytes = 1; | 210 additionalBytes = 1; |
| 211 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { | 211 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { |
| 212 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; | 212 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; |
| 213 additionalBytes = 2; | 213 additionalBytes = 2; |
| 214 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { | 214 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { |
| 215 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; | 215 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; |
| 216 additionalBytes = 3; | 216 additionalBytes = 3; |
| 217 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { | 217 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { |
| 218 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; | 218 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; |
| 219 additionalBytes = 4; | 219 additionalBytes = 4; |
| 220 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { | 220 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { |
| 221 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; | 221 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; |
| 222 additionalBytes = 5; | 222 additionalBytes = 5; |
| 223 } else if (replacementCodepoint != null) { | 223 } else if (replacementCodepoint != null) { |
| 224 return replacementCodepoint; | 224 return replacementCodepoint; |
| 225 } else { | 225 } else { |
| 226 throw new IllegalArgumentException( | 226 throw new ArgumentError( |
| 227 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 227 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
| 228 } | 228 } |
| 229 int j = 0; | 229 int j = 0; |
| 230 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext()) { | 230 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext()) { |
| 231 int nextValue = utf8EncodedBytesIterator.next(); | 231 int nextValue = utf8EncodedBytesIterator.next(); |
| 232 if (nextValue > _UTF8_ONE_BYTE_MAX && | 232 if (nextValue > _UTF8_ONE_BYTE_MAX && |
| 233 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 233 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
| 234 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); | 234 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); |
| 235 } else { | 235 } else { |
| 236 // if sequence-starting code unit, reposition cursor to start here | 236 // if sequence-starting code unit, reposition cursor to start here |
| (...skipping 10 matching lines...) Expand all Loading... |
| 247 bool nonOverlong = | 247 bool nonOverlong = |
| 248 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || | 248 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || |
| 249 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || | 249 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || |
| 250 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); | 250 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); |
| 251 bool inRange = value <= UNICODE_VALID_RANGE_MAX; | 251 bool inRange = value <= UNICODE_VALID_RANGE_MAX; |
| 252 if (validSequence && nonOverlong && inRange) { | 252 if (validSequence && nonOverlong && inRange) { |
| 253 return value; | 253 return value; |
| 254 } else if (replacementCodepoint != null) { | 254 } else if (replacementCodepoint != null) { |
| 255 return replacementCodepoint; | 255 return replacementCodepoint; |
| 256 } else { | 256 } else { |
| 257 throw new IllegalArgumentException( | 257 throw new ArgumentError( |
| 258 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); | 258 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); |
| 259 } | 259 } |
| 260 } | 260 } |
| 261 } | 261 } |
| OLD | NEW |