| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | |
| 2 // for details. All rights reserved. Use of this source code is governed by a | |
| 3 // BSD-style license that can be found in the LICENSE file. | |
| 4 | |
| 5 part of dart.convert; | |
| 6 | |
| 7 /** The Unicode Replacement Character `U+FFFD` (�), written by the decoder in place of malformed input. */ | |
| 8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; | |
| 9 | |
| 10 /** The Unicode Byte Order Mark (BOM) character `U+FEFF`, discarded by the decoder when it leads the input. */ | |
| 11 const int UNICODE_BOM_CHARACTER_RUNE = 0xFEFF; | |
| 12 | |
| 13 /** | |
| 14 * A [Utf8Codec] created with the default settings (malformed input throws). | |
| 15 * | |
| 16 * This instance provides convenient access to the most common UTF-8 | |
| 17 * use cases. | |
| 18 * | |
| 19 * Examples: | |
| 20 * | |
| 21 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ"); | |
| 22 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, | |
| 23 * 0x72, 0x67, 0x72, 0xc3, 0xb8, 0x64]); | |
| 24 */ | |
| 25 const Utf8Codec UTF8 = const Utf8Codec(); | |
| 26 | |
| 27 /** | |
| 28 * A [Utf8Codec] encodes strings to UTF-8 code units (bytes) and decodes | |
| 29 * UTF-8 code units to strings. | |
| 30 */ | |
| 31 class Utf8Codec extends Encoding { | |
| 32 final bool _allowMalformed; /* Malformed-input policy used by [decoder], and by [decode] unless overridden. */ | |
| 33 | |
| 34 /** | |
| 35 * Instantiates a new [Utf8Codec]. | |
| 36 * | |
| 37 * The optional [allowMalformed] argument defines how [decoder] (and [decode]) | |
| 38 * deal with invalid or unterminated character sequences. | |
| 39 * | |
| 40 * If it is `true` (and not overridden at the method invocation), [decode] and | |
| 41 * the [decoder] replace invalid (or unterminated) octet | |
| 42 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise | |
| 43 * they throw a [FormatException]. | |
| 44 */ | |
| 45 const Utf8Codec({ bool allowMalformed: false }) | |
| 46 : _allowMalformed = allowMalformed; | |
| 47 | |
| 48 String get name => "utf-8"; | |
| 49 | |
| 50 /** | |
| 51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | |
| 52 * corresponding string. | |
| 53 * | |
| 54 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE], this | |
| 55 * character is discarded. | |
| 56 * | |
| 57 * If [allowMalformed] is `true`, the decoder replaces invalid (or | |
| 58 * unterminated) character sequences with the Unicode Replacement character | |
| 59 * `U+FFFD` (�). Otherwise it throws a [FormatException]. | |
| 60 * | |
| 61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that | |
| 62 * was used to instantiate `this`. | |
| 63 */ | |
| 64 String decode(List<int> codeUnits, { bool allowMalformed }) { | |
| 65 if (allowMalformed == null) allowMalformed = _allowMalformed; | |
| 66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); | |
| 67 } | |
| 68 | |
| 69 Utf8Encoder get encoder => const Utf8Encoder(); | |
| 70 Utf8Decoder get decoder { | |
| 71 return new Utf8Decoder(allowMalformed: _allowMalformed); | |
| 72 } | |
| 73 } | |
| 74 | |
| 75 /** | |
| 76 * This class converts strings to their UTF-8 code units (a list of | |
| 77 * unsigned 8-bit integers). | |
| 78 */ | |
| 79 class Utf8Encoder extends Converter<String, List<int>> { | |
| 80 | |
| 81 const Utf8Encoder(); | |
| 82 | |
| 83 /** | |
| 84 * Converts [string] to its UTF-8 code units (a list of | |
| 85 * unsigned 8-bit integers). | |
| 86 * | |
| 87 * If [start] and [end] are provided, only the substring | |
| 88 * `string.substring(start, end)` is converted. | |
| 89 */ | |
| 90 List<int> convert(String string, [int start = 0, int end]) { | |
| 91 int stringLength = string.length; | |
| 92 RangeError.checkValidRange(start, end, stringLength); | |
| 93 if (end == null) end = stringLength; | |
| 94 int length = end - start; | |
| 95 if (length == 0) return new Uint8List(0); | |
| 96 // Create a new encoder with a buffer that is guaranteed to be big enough. | |
| 97 // A single code unit uses at most 3 bytes, a surrogate pair at most 4. | |
| 98 _Utf8Encoder encoder = new _Utf8Encoder.withBufferSize(length * 3); | |
| 99 int endPosition = encoder._fillBuffer(string, start, end); | |
| 100 assert(endPosition >= end - 1); | |
| 101 if (endPosition != end) { | |
| 102 // Encoding skipped the last code unit. | |
| 103 // That can only happen if the last code unit is a lead surrogate. | |
| 104 // Force encoding of the lead surrogate by itself. | |
| 105 int lastCodeUnit = string.codeUnitAt(end - 1); | |
| 106 assert(_isLeadSurrogate(lastCodeUnit)); | |
| 107 // We use a non-surrogate as `nextUnit` so that _writeSurrogate just | |
| 108 // writes the lead surrogate. | |
| 109 bool wasCombined = encoder._writeSurrogate(lastCodeUnit, 0); | |
| 110 assert(!wasCombined); | |
| 111 } | |
| 112 return encoder._buffer.sublist(0, encoder._bufferIndex); | |
| 113 } | |
| 114 | |
| 115 /** | |
| 116 * Starts a chunked conversion. | |
| 117 * | |
| 118 * The converter works more efficiently if the given [sink] is a | |
| 119 * [ByteConversionSink]. | |
| 120 */ | |
| 121 StringConversionSink startChunkedConversion(Sink<List<int>> sink) { | |
| 122 if (sink is! ByteConversionSink) { | |
| 123 sink = new ByteConversionSink.from(sink); | |
| 124 } | |
| 125 return new _Utf8EncoderSink(sink); | |
| 126 } | |
| 127 | |
| 128 // Override the base class's bind to provide a more precise return type. | |
| 129 Stream<List<int>> bind(Stream<String> stream) => super.bind(stream); | |
| 130 } | |
| 131 | |
| 132 /** | |
| 133 * This class encodes Strings to UTF-8 code units (unsigned 8-bit integers). | |
| 134 */ | |
| 135 // TODO(floitsch): make this class public. | |
| 136 class _Utf8Encoder { | |
| 137 int _carry = 0; /* Lead surrogate left over from the previous slice; 0 if none. */ | |
| 138 int _bufferIndex = 0; /* Index of the next free position in [_buffer]. */ | |
| 139 final List<int> _buffer; /* Encoded bytes; valid up to, but not including, [_bufferIndex]. */ | |
| 140 | |
| 141 static const _DEFAULT_BYTE_BUFFER_SIZE = 1024; | |
| 142 | |
| 143 _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE); | |
| 144 | |
| 145 _Utf8Encoder.withBufferSize(int bufferSize) | |
| 146 : _buffer = _createBuffer(bufferSize); | |
| 147 | |
| 148 /** | |
| 149 * Allow an implementation to pick the most efficient way of storing bytes. | |
| 150 */ | |
| 151 static List<int> _createBuffer(int size) => new Uint8List(size); | |
| 152 | |
| 153 /** | |
| 154 * Tries to combine the given [leadingSurrogate] with the [nextCodeUnit] and | |
| 155 * writes it to [_buffer]. | |
| 156 * | |
| 157 * Returns true if the [nextCodeUnit] was combined with the | |
| 158 * [leadingSurrogate]. If it wasn't, then [nextCodeUnit] was not a trailing | |
| 159 * surrogate and has not been written yet. | |
| 160 * | |
| 161 * It is safe to pass 0 for [nextCodeUnit], in which case only the leading | |
| 162 * surrogate is written. | |
| 163 */ | |
| 164 bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) { | |
| 165 if (_isTailSurrogate(nextCodeUnit)) { | |
| 166 int rune = _combineSurrogatePair(leadingSurrogate, nextCodeUnit); | |
| 167 // If the rune is encoded with 2 code-units then it must be encoded | |
| 168 // with 4 bytes in UTF-8. | |
| 169 assert(rune > _THREE_BYTE_LIMIT); | |
| 170 assert(rune <= _FOUR_BYTE_LIMIT); | |
| 171 _buffer[_bufferIndex++] = 0xF0 | (rune >> 18); | |
| 172 _buffer[_bufferIndex++] = 0x80 | ((rune >> 12) & 0x3f); | |
| 173 _buffer[_bufferIndex++] = 0x80 | ((rune >> 6) & 0x3f); | |
| 174 _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f); | |
| 175 return true; | |
| 176 } else { | |
| 177 // TODO(floitsch): allow throwing on malformed strings. | |
| 178 // Encode the half-surrogate directly into UTF-8. This yields | |
| 179 // invalid UTF-8, but we started out with invalid UTF-16. | |
| 180 | |
| 181 // A lone surrogate is always encoded in 3 bytes in UTF-8. | |
| 182 _buffer[_bufferIndex++] = 0xE0 | (leadingSurrogate >> 12); | |
| 183 _buffer[_bufferIndex++] = 0x80 | ((leadingSurrogate >> 6) & 0x3f); | |
| 184 _buffer[_bufferIndex++] = 0x80 | (leadingSurrogate & 0x3f); | |
| 185 return false; | |
| 186 } | |
| 187 } | |
| 188 | |
| 189 /** | |
| 190 * Fills the [_buffer] with as many characters as possible. | |
| 191 * | |
| 192 * Does not encode any trailing lead surrogate. This must be done by the | |
| 193 * caller. | |
| 194 * | |
| 195 * Returns the position in the string. The returned index points to the | |
| 196 * first code unit that hasn't been encoded. | |
| 197 */ | |
| 198 int _fillBuffer(String str, int start, int end) { | |
| 199 if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) { | |
| 200 // Don't handle a trailing lead surrogate in this loop. The caller has | |
| 201 // to deal with those. | |
| 202 end--; | |
| 203 } | |
| 204 int stringIndex; | |
| 205 for (stringIndex = start; stringIndex < end; stringIndex++) { | |
| 206 int codeUnit = str.codeUnitAt(stringIndex); | |
| 207 // ASCII has the same representation in UTF-8 and UTF-16. | |
| 208 if (codeUnit <= _ONE_BYTE_LIMIT) { | |
| 209 if (_bufferIndex >= _buffer.length) break; | |
| 210 _buffer[_bufferIndex++] = codeUnit; | |
| 211 } else if (_isLeadSurrogate(codeUnit)) { | |
| 212 if (_bufferIndex + 3 >= _buffer.length) break; | |
| 213 // Note that it is safe to read the next code unit. We decremented | |
| 214 // [end] above when the last valid code unit was a leading surrogate. | |
| 215 int nextCodeUnit = str.codeUnitAt(stringIndex + 1); | |
| 216 bool wasCombined = _writeSurrogate(codeUnit, nextCodeUnit); | |
| 217 if (wasCombined) stringIndex++; | |
| 218 } else { | |
| 219 int rune = codeUnit; | |
| 220 if (rune <= _TWO_BYTE_LIMIT) { | |
| 221 if (_bufferIndex + 1 >= _buffer.length) break; | |
| 222 _buffer[_bufferIndex++] = 0xC0 | (rune >> 6); | |
| 223 _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f); | |
| 224 } else { | |
| 225 assert(rune <= _THREE_BYTE_LIMIT); | |
| 226 if (_bufferIndex + 2 >= _buffer.length) break; | |
| 227 _buffer[_bufferIndex++] = 0xE0 | (rune >> 12); | |
| 228 _buffer[_bufferIndex++] = 0x80 | ((rune >> 6) & 0x3f); | |
| 229 _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f); | |
| 230 } | |
| 231 } | |
| 232 } | |
| 233 return stringIndex; | |
| 234 } | |
| 235 } | |
| 236 | |
| 237 /** | |
| 238 * This class encodes chunked strings to UTF-8 code units (unsigned 8-bit | |
| 239 * integers). | |
| 240 */ | |
| 241 class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin { | |
| 242 | |
| 243 final ByteConversionSink _sink; /* Receives the encoded bytes. */ | |
| 244 | |
| 245 _Utf8EncoderSink(this._sink); | |
| 246 | |
| 247 void close() { | |
| 248 if (_carry != 0) { | |
| 249 // addSlice will call close again, but by then the carry will be 0. | |
| 250 addSlice("", 0, 0, true); | |
| 251 return; | |
| 252 } | |
| 253 _sink.close(); | |
| 254 } | |
| 255 | |
| 256 void addSlice(String str, int start, int end, bool isLast) { | |
| 257 _bufferIndex = 0; | |
| 258 | |
| 259 if (start == end && !isLast) { | |
| 260 return; | |
| 261 } | |
| 262 | |
| 263 if (_carry != 0) { | |
| 264 int nextCodeUnit = 0; | |
| 265 if (start != end) { | |
| 266 nextCodeUnit = str.codeUnitAt(start); | |
| 267 } else { | |
| 268 assert(isLast); | |
| 269 } | |
| 270 bool wasCombined = _writeSurrogate(_carry, nextCodeUnit); | |
| 271 // Either we got a non-empty string, or the carry was not combined. | |
| 272 assert(!wasCombined || start != end ); | |
| 273 if (wasCombined) start++; | |
| 274 _carry = 0; | |
| 275 } | |
| 276 do { | |
| 277 start = _fillBuffer(str, start, end); | |
| 278 bool isLastSlice = isLast && (start == end); | |
| 279 if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) { | |
| 280 if (isLast && _bufferIndex < _buffer.length - 3) { | |
| 281 // There is still space for the last incomplete surrogate. | |
| 282 // We use a non-surrogate as second argument. This way the | |
| 283 // function will just add the surrogate-half to the buffer. | |
| 284 bool hasBeenCombined = _writeSurrogate(str.codeUnitAt(start), 0); | |
| 285 assert(!hasBeenCombined); | |
| 286 } else { | |
| 287 // Otherwise store it in the carry. If isLast is true, then | |
| 288 // close will flush the last carry. | |
| 289 _carry = str.codeUnitAt(start); | |
| 290 } | |
| 291 start++; | |
| 292 } | |
| 293 _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice); | |
| 294 _bufferIndex = 0; | |
| 295 } while (start < end); | |
| 296 if (isLast) close(); | |
| 297 } | |
| 298 | |
| 299 // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it | |
| 300 // needs to deal with malformed input. | |
| 301 } | |
| 302 | |
| 303 /** | |
| 304 * This class converts UTF-8 code units (lists of unsigned 8-bit integers) | |
| 305 * to a string. | |
| 306 */ | |
| 307 class Utf8Decoder extends Converter<List<int>, String> { | |
| 308 final bool _allowMalformed; | |
| 309 | |
| 310 /** | |
| 311 * Instantiates a new [Utf8Decoder]. | |
| 312 * | |
| 313 * The optional [allowMalformed] argument defines how [convert] deals | |
| 314 * with invalid or unterminated character sequences. | |
| 315 * | |
| 316 * If it is `true`, [convert] replaces invalid (or unterminated) character | |
| 317 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise | |
| 318 * it throws a [FormatException]. | |
| 319 */ | |
| 320 const Utf8Decoder({ bool allowMalformed: false }) | |
| 321 : this._allowMalformed = allowMalformed; | |
| 322 | |
| 323 /** | |
| 324 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | |
| 325 * corresponding string. | |
| 326 * | |
| 327 * Uses the code units from [start] to, but not including, [end]. | |
| 328 * If [end] is omitted, it defaults to `codeUnits.length`. | |
| 329 * | |
| 330 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE], this | |
| 331 * character is discarded. | |
| 332 */ | |
| 333 String convert(List<int> codeUnits, [int start = 0, int end]) { | |
| 334 // Allow the implementation to intercept and specialize based on the type | |
| 335 // of codeUnits. A non-null result is returned as is. | |
| 336 String result = _convertIntercepted(_allowMalformed, codeUnits, start, end); | |
| 337 if (result != null) { | |
| 338 return result; | |
| 339 } | |
| 340 | |
| 341 int length = codeUnits.length; | |
| 342 RangeError.checkValidRange(start, end, length); | |
| 343 if (end == null) end = length; | |
| 344 StringBuffer buffer = new StringBuffer(); | |
| 345 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed); | |
| 346 decoder.convert(codeUnits, start, end); | |
| 347 decoder.close(); | |
| 348 return buffer.toString(); | |
| 349 } | |
| 350 | |
| 351 /** | |
| 352 * Starts a chunked conversion. | |
| 353 * | |
| 354 * The converter works more efficiently if the given [sink] is a | |
| 355 * [StringConversionSink]. | |
| 356 */ | |
| 357 ByteConversionSink startChunkedConversion(Sink<String> sink) { | |
| 358 StringConversionSink stringSink; | |
| 359 if (sink is StringConversionSink) { | |
| 360 stringSink = sink; | |
| 361 } else { | |
| 362 stringSink = new StringConversionSink.from(sink); | |
| 363 } | |
| 364 return stringSink.asUtf8Sink(_allowMalformed); | |
| 365 } | |
| 366 | |
| 367 // Override the base class's bind to provide a more precise return type. | |
| 368 Stream<String> bind(Stream<List<int>> stream) => super.bind(stream); | |
| 369 | |
| 370 external Converter<List<int>, dynamic/*=T*/> fuse/*<T>*/( | |
| 371 Converter<String, dynamic/*=T*/> next); | |
| 372 | |
| 373 external static String _convertIntercepted( | |
| 374 bool allowMalformed, List<int> codeUnits, int start, int end); | |
| 375 } | |
| 376 | |
| 377 // UTF-8 constants: the largest value that fits in an encoding of the given length. | |
| 378 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits | |
| 379 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits | |
| 380 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits | |
| 381 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. | |
| 382 | |
| 383 // UTF-16 surrogate constants: lead surrogates start at 0xD800, tail surrogates at 0xDC00. | |
| 384 const int _SURROGATE_MASK = 0xF800; | |
| 385 const int _SURROGATE_TAG_MASK = 0xFC00; | |
| 386 const int _SURROGATE_VALUE_MASK = 0x3FF; | |
| 387 const int _LEAD_SURROGATE_MIN = 0xD800; | |
| 388 const int _TAIL_SURROGATE_MIN = 0xDC00; | |
| 389 | |
| 390 bool _isLeadSurrogate(int codeUnit) => | |
| 391 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; /* True when the tag bits match a lead surrogate (0xD800..0xDBFF for 16-bit code units). */ | |
| 392 bool _isTailSurrogate(int codeUnit) => | |
| 393 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; /* True when the tag bits match a tail surrogate (0xDC00..0xDFFF for 16-bit code units). */ | |
| 394 int _combineSurrogatePair(int lead, int tail) => | |
| 395 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) | |
| 396 | (tail & _SURROGATE_VALUE_MASK); /* 0x10000 plus the 10 value bits of [lead] (high) and of [tail] (low). */ | |
| 397 | |
| 398 /** | |
| 399 * Decodes UTF-8. | |
| 400 * | |
| 401 * The decoder handles chunked input. | |
| 402 */ | |
| 403 // TODO(floitsch): make this class public. | |
| 404 class _Utf8Decoder { | |
| 405 final bool _allowMalformed; /* When true, malformed input is replaced with U+FFFD instead of throwing. */ | |
| 406 final StringSink _stringSink; /* Receives the decoded characters. */ | |
| 407 bool _isFirstCharacter = true; /* A BOM decoded while this is still true is not written. */ | |
| 408 int _value = 0; /* Bits gathered so far for a multi-byte sequence cut off at the end of a chunk. */ | |
| 409 int _expectedUnits = 0; /* Continuation bytes still missing for that sequence; 0 if none is pending. */ | |
| 410 int _extraUnits = 0; /* Total continuation bytes of that sequence; indexes [_LIMITS] in the overlong check. */ | |
| 411 | |
| 412 _Utf8Decoder(this._stringSink, this._allowMalformed); | |
| 413 | |
| 414 bool get hasPartialInput => _expectedUnits > 0; | |
| 415 | |
| 416 // Upper limits of the one- through four-byte encodings, indexed by length - 1. | |
| 417 static const List<int> _LIMITS = const <int>[ | |
| 418 _ONE_BYTE_LIMIT, | |
| 419 _TWO_BYTE_LIMIT, | |
| 420 _THREE_BYTE_LIMIT, | |
| 421 _FOUR_BYTE_LIMIT ]; | |
| 422 | |
| 423 void close() { | |
| 424 flush(); | |
| 425 } | |
| 426 | |
| 427 /** | |
| 428 * Flushes this decoder as if closed. | |
| 429 * | |
| 430 * This method throws if the input was partial and the decoder was | |
| 431 * constructed with `allowMalformed` set to `false`. Otherwise a partial | |
| 432 * sequence is replaced with `U+FFFD` and the pending state is reset. | |
| 433 */ | |
| 433 void flush() { | |
| 434 if (hasPartialInput) { | |
| 435 if (!_allowMalformed) { | |
| 436 throw new FormatException("Unfinished UTF-8 octet sequence"); | |
| 437 } | |
| 438 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | |
| 439 _value = 0; | |
| 440 _expectedUnits = 0; | |
| 441 _extraUnits = 0; | |
| 442 } | |
| 443 } | |
| 444 | |
| 445 void convert(List<int> codeUnits, int startIndex, int endIndex) { | |
| 446 int value = _value; | |
| 447 int expectedUnits = _expectedUnits; | |
| 448 int extraUnits = _extraUnits; | |
| 449 _value = 0; | |
| 450 _expectedUnits = 0; | |
| 451 _extraUnits = 0; | |
| 452 | |
| 453 int scanOneByteCharacters(units, int from) { | |
| 454 final to = endIndex; | |
| 455 final mask = _ONE_BYTE_LIMIT; | |
| 456 for (var i = from; i < to; i++) { | |
| 457 final unit = units[i]; | |
| 458 if ((unit & mask) != unit) return i - from; | |
| 459 } | |
| 460 return to - from; | |
| 461 } | |
| 462 | |
| 463 void addSingleBytes(int from, int to) { | |
| 464 assert(from >= startIndex && from <= endIndex); | |
| 465 assert(to >= startIndex && to <= endIndex); | |
| 466 _stringSink.write(new String.fromCharCodes(codeUnits, from, to)); | |
| 467 } | |
| 468 | |
| 469 int i = startIndex; | |
| 470 loop: while (true) { | |
| 471 multibyte: if (expectedUnits > 0) { | |
| 472 do { | |
| 473 if (i == endIndex) { | |
| 474 break loop; | |
| 475 } | |
| 476 int unit = codeUnits[i]; | |
| 477 if ((unit & 0xC0) != 0x80) { | |
| 478 expectedUnits = 0; | |
| 479 if (!_allowMalformed) { | |
| 480 throw new FormatException( | |
| 481 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | |
| 482 } | |
| 483 _isFirstCharacter = false; | |
| 484 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | |
| 485 break multibyte; | |
| 486 } else { | |
| 487 value = (value << 6) | (unit & 0x3f); | |
| 488 expectedUnits--; | |
| 489 i++; | |
| 490 } | |
| 491 } while (expectedUnits > 0); | |
| 492 if (value <= _LIMITS[extraUnits - 1]) { | |
| 493 // Overlong encoding. The value could have been encoded with a | |
| 494 // shorter sequence. | |
| 495 if (!_allowMalformed) { | |
| 496 throw new FormatException( | |
| 497 "Overlong encoding of 0x${value.toRadixString(16)}"); | |
| 498 } | |
| 499 expectedUnits = extraUnits = 0; | |
| 500 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | |
| 501 } | |
| 502 if (value > _FOUR_BYTE_LIMIT) { | |
| 503 if (!_allowMalformed) { | |
| 504 throw new FormatException("Character outside valid Unicode range: " | |
| 505 "0x${value.toRadixString(16)}"); | |
| 506 } | |
| 507 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | |
| 508 } | |
| 509 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { | |
| 510 _stringSink.writeCharCode(value); | |
| 511 } | |
| 512 _isFirstCharacter = false; | |
| 513 } | |
| 514 | |
| 515 while (i < endIndex) { | |
| 516 int oneBytes = scanOneByteCharacters(codeUnits, i); | |
| 517 if (oneBytes > 0) { | |
| 518 _isFirstCharacter = false; | |
| 519 addSingleBytes(i, i + oneBytes); | |
| 520 i += oneBytes; | |
| 521 if (i == endIndex) break; | |
| 522 } | |
| 523 int unit = codeUnits[i++]; | |
| 524 // TODO(floitsch): the way we test we could potentially allow | |
| 525 // units that are too large, if they happen to have the | |
| 526 // right bit-pattern. (Same is true for the multibyte loop above). | |
| 527 // TODO(floitsch): optimize this loop. See: | |
| 528 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.d
art?column_width=80 | |
| 529 if (unit < 0) { | |
| 530 // TODO(floitsch): should this be unit <= 0 ? | |
| 531 if (!_allowMalformed) { | |
| 532 throw new FormatException( | |
| 533 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); | |
| 534 } | |
| 535 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | |
| 536 } else { | |
| 537 assert(unit > _ONE_BYTE_LIMIT); | |
| 538 if ((unit & 0xE0) == 0xC0) { | |
| 539 value = unit & 0x1F; | |
| 540 expectedUnits = extraUnits = 1; | |
| 541 continue loop; | |
| 542 } | |
| 543 if ((unit & 0xF0) == 0xE0) { | |
| 544 value = unit & 0x0F; | |
| 545 expectedUnits = extraUnits = 2; | |
| 546 continue loop; | |
| 547 } | |
| 548 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. | |
| 549 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { | |
| 550 value = unit & 0x07; | |
| 551 expectedUnits = extraUnits = 3; | |
| 552 continue loop; | |
| 553 } | |
| 554 if (!_allowMalformed) { | |
| 555 throw new FormatException( | |
| 556 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | |
| 557 } | |
| 558 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | |
| 559 expectedUnits = extraUnits = 0; | |
| 560 _isFirstCharacter = false; | |
| 561 _stringSink.writeCharCode(value); | |
| 562 } | |
| 563 } | |
| 564 break loop; | |
| 565 } | |
| 566 if (expectedUnits > 0) { | |
| 567 _value = value; | |
| 568 _expectedUnits = expectedUnits; | |
| 569 _extraUnits = extraUnits; | |
| 570 } | |
| 571 } | |
| 572 } | |
| OLD | NEW |