OLD | NEW |
| (Empty) |
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 part of dart.convert; | |
6 | |
/** The Unicode Replacement character `U+FFFD` (�). */
const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;

/** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */
const int UNICODE_BOM_CHARACTER_RUNE = 0xFEFF;

/**
 * An instance of the default implementation of the [Utf8Codec].
 *
 * This instance provides convenient access to the most common UTF-8
 * use cases. It is created without arguments, so it uses the constructor's
 * default of `allowMalformed: false`.
 *
 * Examples:
 *
 *     var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");
 *     var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,
 *                                0x72, 0x67, 0x72, 0xc3, 0xb8, 0x64]);
 */
const Utf8Codec UTF8 = const Utf8Codec();
26 | |
/**
 * Codec that encodes strings to UTF-8 code units (bytes) and decodes UTF-8
 * code units back to strings.
 */
class Utf8Codec extends Encoding {
  // Default leniency used by [decode] and [decoder].
  final bool _allowMalformed;

  /**
   * Creates a new [Utf8Codec].
   *
   * The optional [allowMalformed] flag selects how [decoder] and [decode]
   * treat invalid or unterminated octet sequences:
   *
   * * `true`: each such sequence is replaced by the Unicode Replacement
   *   character `U+FFFD` (�), unless overridden in the [decode] call.
   * * `false` (the default): a [FormatException] is thrown.
   */
  const Utf8Codec({ bool allowMalformed: false })
      : _allowMalformed = allowMalformed;

  String get name => "utf-8";

  /**
   * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) into
   * the string they represent.
   *
   * A leading [UNICODE_BOM_CHARACTER_RUNE] in [codeUnits] is discarded.
   *
   * When [allowMalformed] is `true`, invalid or unterminated sequences are
   * replaced by the Unicode Replacement character `U+FFFD` (�); when it is
   * `false` a [FormatException] is thrown instead. When [allowMalformed] is
   * omitted, the value given to the constructor of `this` is used.
   */
  String decode(List<int> codeUnits, { bool allowMalformed }) {
    bool lenient = (allowMalformed != null) ? allowMalformed : _allowMalformed;
    Utf8Decoder converter = new Utf8Decoder(allowMalformed: lenient);
    return converter.convert(codeUnits);
  }

  Utf8Encoder get encoder => const Utf8Encoder();

  Utf8Decoder get decoder => new Utf8Decoder(allowMalformed: _allowMalformed);
}
74 | |
/**
 * Converter from strings to their UTF-8 code units (a list of unsigned
 * 8-bit integers).
 */
class Utf8Encoder extends Converter<String, List<int>> {

  const Utf8Encoder();

  /**
   * Returns the UTF-8 code units (a list of unsigned 8-bit integers) of
   * [string].
   *
   * When [start] and [end] are given, only
   * `string.substring(start, end)` is converted.
   */
  List<int> convert(String string, [int start = 0, int end]) {
    int sourceLength = string.length;
    RangeError.checkValidRange(start, end, sourceLength);
    if (end == null) end = sourceLength;
    int unitCount = end - start;
    if (unitCount == 0) return new Uint8List(0);
    // Size the buffer for the worst case: one code unit needs at most
    // 3 bytes, and a surrogate pair (two code units) needs at most 4.
    _Utf8Encoder worker = new _Utf8Encoder.withBufferSize(unitCount * 3);
    int consumed = worker._fillBuffer(string, start, end);
    assert(consumed >= end - 1);
    if (consumed != end) {
      // Exactly one code unit is left over. _fillBuffer only ever skips a
      // trailing lead surrogate, so encode that surrogate on its own.
      int danglingLead = string.codeUnitAt(end - 1);
      assert(_isLeadSurrogate(danglingLead));
      // Passing the non-surrogate 0 as the next unit makes _writeSurrogate
      // emit just the lead surrogate.
      bool paired = worker._writeSurrogate(danglingLead, 0);
      assert(!paired);
    }
    return worker._buffer.sublist(0, worker._bufferIndex);
  }

  /**
   * Starts a chunked conversion.
   *
   * A [sink] that is not already a [ByteConversionSink] is wrapped in one,
   * so passing a [ByteConversionSink] directly is more efficient.
   */
  StringConversionSink startChunkedConversion(Sink<List<int>> sink) {
    ByteConversionSink byteSink;
    if (sink is ByteConversionSink) {
      byteSink = sink;
    } else {
      byteSink = new ByteConversionSink.from(sink);
    }
    return new _Utf8EncoderSink(byteSink);
  }

  // Overrides the base-class bind only to expose a more precise type.
  Stream<List<int>> bind(Stream<String> stream) {
    return super.bind(stream);
  }
}
131 | |
/**
 * Stateful helper that writes the UTF-8 encoding of the UTF-16 code units of
 * a string into a fixed-size byte buffer.
 */
// TODO(floitsch): make this class public.
class _Utf8Encoder {
  // Lead surrogate carried over between slices by the chunked sink;
  // 0 means there is none.
  int _carry = 0;
  // Index of the next free slot in [_buffer].
  int _bufferIndex = 0;
  final List<int> _buffer;

  static const _DEFAULT_BYTE_BUFFER_SIZE = 1024;

  _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE);

  _Utf8Encoder.withBufferSize(int bufferSize)
      : _buffer = _createBuffer(bufferSize);

  /**
   * Hook that lets an implementation choose the most efficient byte storage.
   */
  static List<int> _createBuffer(int size) => new Uint8List(size);

  /**
   * Writes [leadingSurrogate] to [_buffer], combined with [nextCodeUnit] when
   * that unit is a tail surrogate.
   *
   * Returns `true` when the two units were combined into one 4-byte sequence.
   * Returns `false` when [nextCodeUnit] is not a tail surrogate; in that case
   * only the lead surrogate was written (as 3 bytes) and [nextCodeUnit] still
   * has to be encoded by the caller.
   *
   * Passing 0 for [nextCodeUnit] is safe and writes only the lead surrogate.
   */
  bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {
    if (!_isTailSurrogate(nextCodeUnit)) {
      // TODO(floitsch): allow to throw on malformed strings.
      // A lone half-surrogate is written as-is using the 3-byte form. The
      // result is invalid UTF-8, but the input was invalid UTF-16 already.
      _buffer[_bufferIndex++] = 0xE0 | (leadingSurrogate >> 12);
      _buffer[_bufferIndex++] = 0x80 | ((leadingSurrogate >> 6) & 0x3f);
      _buffer[_bufferIndex++] = 0x80 | (leadingSurrogate & 0x3f);
      return false;
    }
    int codePoint = _combineSurrogatePair(leadingSurrogate, nextCodeUnit);
    // A code point that needs two UTF-16 code units always needs exactly
    // four bytes in UTF-8.
    assert(codePoint > _THREE_BYTE_LIMIT);
    assert(codePoint <= _FOUR_BYTE_LIMIT);
    _buffer[_bufferIndex++] = 0xF0 | (codePoint >> 18);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 12) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 6) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | (codePoint & 0x3f);
    return true;
  }

  /**
   * Encodes as much of `str[start..end)` into [_buffer] as fits.
   *
   * A lead surrogate in the very last position is never encoded here; the
   * caller has to deal with it.
   *
   * Returns the index of the first code unit that has not been encoded.
   */
  int _fillBuffer(String str, int start, int end) {
    if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) {
      // Leave a trailing lead surrogate to the caller.
      end--;
    }
    int position = start;
    while (position < end) {
      int unit = str.codeUnitAt(position);
      if (unit <= _ONE_BYTE_LIMIT) {
        // ASCII is represented identically in UTF-8 and UTF-16.
        if (_bufferIndex >= _buffer.length) break;
        _buffer[_bufferIndex++] = unit;
      } else if (_isLeadSurrogate(unit)) {
        // Reserve room for the largest possible output (4 bytes).
        if (_bufferIndex + 3 >= _buffer.length) break;
        // Reading position + 1 is safe: [end] was decremented above if the
        // last code unit was a lead surrogate.
        bool paired = _writeSurrogate(unit, str.codeUnitAt(position + 1));
        if (paired) position++;
      } else if (unit <= _TWO_BYTE_LIMIT) {
        if (_bufferIndex + 1 >= _buffer.length) break;
        _buffer[_bufferIndex++] = 0xC0 | (unit >> 6);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      } else {
        assert(unit <= _THREE_BYTE_LIMIT);
        if (_bufferIndex + 2 >= _buffer.length) break;
        _buffer[_bufferIndex++] = 0xE0 | (unit >> 12);
        _buffer[_bufferIndex++] = 0x80 | ((unit >> 6) & 0x3f);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      }
      position++;
    }
    return position;
  }
}
236 | |
/**
 * Sink that encodes chunked strings to UTF-8 code units (unsigned 8-bit
 * integers) and forwards them to a [ByteConversionSink].
 */
class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {

  final ByteConversionSink _sink;

  _Utf8EncoderSink(this._sink);

  void close() {
    if (_carry == 0) {
      _sink.close();
      return;
    }
    // Flush the pending lead surrogate. addSlice calls close again, but by
    // then the carry has been reset to 0.
    addSlice("", 0, 0, true);
  }

  void addSlice(String str, int start, int end, bool isLast) {
    _bufferIndex = 0;

    bool isEmptySlice = (start == end);
    if (isEmptySlice && !isLast) {
      return;
    }

    if (_carry != 0) {
      // An empty slice can only get here when it is the last one.
      assert(!isEmptySlice || isLast);
      int following = isEmptySlice ? 0 : str.codeUnitAt(start);
      bool carryWasPaired = _writeSurrogate(_carry, following);
      // Combining is only possible when there was a code unit to combine with.
      assert(!carryWasPaired || !isEmptySlice);
      if (carryWasPaired) start++;
      _carry = 0;
    }
    do {
      start = _fillBuffer(str, start, end);
      bool finalSlice = isLast && (start == end);
      if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) {
        int pendingLead = str.codeUnitAt(start);
        bool hasRoom = _bufferIndex < _buffer.length - 3;
        if (isLast && hasRoom) {
          // The incomplete surrogate still fits. The non-surrogate 0 as
          // second argument makes _writeSurrogate emit only the half.
          bool tailWasPaired = _writeSurrogate(pendingLead, 0);
          assert(!tailWasPaired);
        } else {
          // Keep it for the next slice. If this was the last slice, close
          // flushes the carry.
          _carry = pendingLead;
        }
        start++;
      }
      _sink.addSlice(_buffer, 0, _bufferIndex, finalSlice);
      _bufferIndex = 0;
    } while (start < end);
    if (isLast) close();
  }

  // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it
  // needs to deal with malformed input.
}
302 | |
/**
 * Converter from UTF-8 code units (lists of unsigned 8-bit integers) to a
 * string.
 */
class Utf8Decoder extends Converter<List<int>, String> {
  final bool _allowMalformed;

  /**
   * Creates a new [Utf8Decoder].
   *
   * The optional [allowMalformed] flag selects how [convert] treats invalid
   * or unterminated character sequences: when `true` they are replaced by
   * the Unicode Replacement character `U+FFFD` (�); otherwise (the default)
   * a [FormatException] is thrown.
   */
  const Utf8Decoder({ bool allowMalformed: false })
      : this._allowMalformed = allowMalformed;

  /**
   * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) into
   * the string they represent.
   *
   * Only the code units from [start] up to, but not including, [end] are
   * used. When [end] is omitted it defaults to `codeUnits.length`.
   *
   * A leading [UNICODE_BOM_CHARACTER_RUNE] in [codeUnits] is discarded.
   */
  String convert(List<int> codeUnits, [int start = 0, int end]) {
    // Give the platform implementation a chance to handle the conversion,
    // specialized on the concrete type of codeUnits. A null result means it
    // declined.
    String intercepted =
        _convertIntercepted(_allowMalformed, codeUnits, start, end);
    if (intercepted != null) return intercepted;

    int available = codeUnits.length;
    RangeError.checkValidRange(start, end, available);
    if (end == null) end = available;
    StringBuffer output = new StringBuffer();
    new _Utf8Decoder(output, _allowMalformed)
      ..convert(codeUnits, start, end)
      ..close();
    return output.toString();
  }

  /**
   * Starts a chunked conversion.
   *
   * A [sink] that is not already a [StringConversionSink] is wrapped in one,
   * so passing a [StringConversionSink] directly is more efficient.
   */
  ByteConversionSink startChunkedConversion(Sink<String> sink) {
    if (sink is! StringConversionSink) {
      sink = new StringConversionSink.from(sink);
    }
    StringConversionSink stringSink = sink;
    return stringSink.asUtf8Sink(_allowMalformed);
  }

  // Overrides the base-class bind only to expose a more precise type.
  Stream<String> bind(Stream<List<int>> stream) {
    return super.bind(stream);
  }

  external Converter<List<int>, dynamic/*=T*/> fuse/*<T>*/(
      Converter<String, dynamic/*=T*/> next);

  external static String _convertIntercepted(
      bool allowMalformed, List<int> codeUnits, int start, int end);
}
376 | |
// UTF-8 constants: the largest value representable by each encoding length.
const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

// UTF-16 constants.
const int _SURROGATE_MASK = 0xF800;
const int _SURROGATE_TAG_MASK = 0xFC00;
const int _SURROGATE_VALUE_MASK = 0x3FF;
const int _LEAD_SURROGATE_MIN = 0xD800;
const int _TAIL_SURROGATE_MIN = 0xDC00;

// Whether [codeUnit] carries the lead-surrogate tag (0xD800..0xDBFF).
bool _isLeadSurrogate(int codeUnit) {
  int tag = codeUnit & _SURROGATE_TAG_MASK;
  return tag == _LEAD_SURROGATE_MIN;
}

// Whether [codeUnit] carries the tail-surrogate tag (0xDC00..0xDFFF).
bool _isTailSurrogate(int codeUnit) {
  int tag = codeUnit & _SURROGATE_TAG_MASK;
  return tag == _TAIL_SURROGATE_MIN;
}

// Combines the 10 value bits of [lead] and [tail] into the code point at or
// above 0x10000 that the surrogate pair represents.
int _combineSurrogatePair(int lead, int tail) {
  int highBits = (lead & _SURROGATE_VALUE_MASK) << 10;
  int lowBits = tail & _SURROGATE_VALUE_MASK;
  return (0x10000 + highBits) | lowBits;
}
397 | |
/**
 * Decodes UTF-8 and writes the decoded characters to a [StringSink].
 *
 * The decoder handles chunked input: a multi-byte sequence that is cut off
 * at the end of one [convert] call is remembered and completed by the next
 * call. Call [close] (or [flush]) once all input has been supplied.
 *
 * Code units that are not valid bytes (negative or greater than 0xFF) are
 * always treated as malformed input, never as part of a sequence.
 */
// TODO(floitsch): make this class public.
class _Utf8Decoder {
  final bool _allowMalformed;
  final StringSink _stringSink;
  // True until the first character has been produced; used to drop a
  // leading BOM.
  bool _isFirstCharacter = true;
  // State of a multi-byte sequence that spans [convert] calls: the bits
  // collected so far, the number of continuation bytes still missing, and
  // the total number of continuation bytes of the sequence.
  int _value = 0;
  int _expectedUnits = 0;
  int _extraUnits = 0;

  // Range of values a code unit may have at all, and of continuation bytes.
  static const int _MAX_BYTE = 0xFF;
  static const int _MIN_CONTINUATION_BYTE = 0x80;
  static const int _MAX_CONTINUATION_BYTE = 0xBF;

  _Utf8Decoder(this._stringSink, this._allowMalformed);

  /** Whether a multi-byte sequence has been started but not completed. */
  bool get hasPartialInput => _expectedUnits > 0;

  // Limits of one through four byte encodings.
  static const List<int> _LIMITS = const <int>[
      _ONE_BYTE_LIMIT,
      _TWO_BYTE_LIMIT,
      _THREE_BYTE_LIMIT,
      _FOUR_BYTE_LIMIT ];

  void close() {
    flush();
  }

  /**
   * Flushes this decoder as if closed.
   *
   * If the input ended in the middle of a sequence, this throws a
   * [FormatException] when the decoder was constructed with `allowMalformed`
   * set to `false`; otherwise it writes one Unicode Replacement character
   * and resets the partial state.
   */
  void flush() {
    if (hasPartialInput) {
      if (!_allowMalformed) {
        throw new FormatException("Unfinished UTF-8 octet sequence");
      }
      _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
      _value = 0;
      _expectedUnits = 0;
      _extraUnits = 0;
    }
  }

  /**
   * Decodes `codeUnits[startIndex..endIndex)` and writes the result to the
   * string sink.
   *
   * A sequence left incomplete by a previous call is continued first. A
   * sequence left incomplete by this call is stored for the next call.
   *
   * Malformed input (a bad lead or continuation unit, an overlong encoding,
   * a value above U+10FFFF, or a code unit outside `0..0xFF`) throws a
   * [FormatException], or is replaced by the Unicode Replacement character
   * if the decoder allows malformed input.
   */
  void convert(List<int> codeUnits, int startIndex, int endIndex) {
    // Work on local copies of the carried-over state; it is written back at
    // the end only if a sequence is still incomplete.
    int value = _value;
    int expectedUnits = _expectedUnits;
    int extraUnits = _extraUnits;
    _value = 0;
    _expectedUnits = 0;
    _extraUnits = 0;

    // Returns the length of the run of ASCII units starting at [from].
    int scanOneByteCharacters(units, int from) {
      final to = endIndex;
      final mask = _ONE_BYTE_LIMIT;
      for (var i = from; i < to; i++) {
        final unit = units[i];
        // Any unit outside 0..0x7f (including negative ones) ends the run.
        if ((unit & mask) != unit) return i - from;
      }
      return to - from;
    }

    // Writes the ASCII units in `codeUnits[from..to)` to the sink.
    void addSingleBytes(int from, int to) {
      assert(from >= startIndex && from <= endIndex);
      assert(to >= startIndex && to <= endIndex);
      _stringSink.write(new String.fromCharCodes(codeUnits, from, to));
    }

    int i = startIndex;
    loop: while (true) {
      multibyte: if (expectedUnits > 0) {
        // Collect the outstanding continuation bytes of the current sequence.
        do {
          if (i == endIndex) {
            // Input exhausted mid-sequence; the state is saved below.
            break loop;
          }
          int unit = codeUnits[i];
          // Compare against the byte range rather than masking with 0xC0:
          // masking would also accept non-byte values such as 0x180 or -128
          // whose low bits happen to look like a continuation byte.
          if (unit < _MIN_CONTINUATION_BYTE || unit > _MAX_CONTINUATION_BYTE) {
            expectedUnits = 0;
            if (!_allowMalformed) {
              throw new FormatException(
                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
            }
            _isFirstCharacter = false;
            _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
            // The offending unit is not consumed; it is examined again by
            // the loop below as the potential start of a new character.
            break multibyte;
          } else {
            value = (value << 6) | (unit & 0x3f);
            expectedUnits--;
            i++;
          }
        } while (expectedUnits > 0);
        if (value <= _LIMITS[extraUnits - 1]) {
          // Overly long encoding. The value could be encoded with a shorter
          // encoding.
          if (!_allowMalformed) {
            throw new FormatException(
                "Overlong encoding of 0x${value.toRadixString(16)}");
          }
          expectedUnits = extraUnits = 0;
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
        }
        if (value > _FOUR_BYTE_LIMIT) {
          if (!_allowMalformed) {
            throw new FormatException("Character outside valid Unicode range: "
                                      "0x${value.toRadixString(16)}");
          }
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
        }
        // Drop a BOM only if it is the very first character.
        if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {
          _stringSink.writeCharCode(value);
        }
        _isFirstCharacter = false;
      }

      while (i < endIndex) {
        int oneBytes = scanOneByteCharacters(codeUnits, i);
        if (oneBytes > 0) {
          _isFirstCharacter = false;
          addSingleBytes(i, i + oneBytes);
          i += oneBytes;
          if (i == endIndex) break;
        }
        int unit = codeUnits[i++];
        // TODO(floitsch): optimize this loop. See:
        // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80
        if (unit < 0) {
          // TODO(floitsch): should this be unit <= 0 ?
          if (!_allowMalformed) {
            throw new FormatException(
                "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");
          }
          _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
        } else {
          assert(unit > _ONE_BYTE_LIMIT);
          // Only real bytes can be lead bytes. Without this guard a value
          // such as 0x1C3 would pass the bit-pattern tests below.
          if (unit <= _MAX_BYTE) {
            if ((unit & 0xE0) == 0xC0) {
              value = unit & 0x1F;
              expectedUnits = extraUnits = 1;
              continue loop;
            }
            if ((unit & 0xF0) == 0xE0) {
              value = unit & 0x0F;
              expectedUnits = extraUnits = 2;
              continue loop;
            }
            // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
            if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
              value = unit & 0x07;
              expectedUnits = extraUnits = 3;
              continue loop;
            }
          }
          // Not a valid lead byte: a stray continuation byte, 0xF5..0xFF,
          // or a value that does not fit in a byte.
          if (!_allowMalformed) {
            throw new FormatException(
                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
          }
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
          expectedUnits = extraUnits = 0;
          _isFirstCharacter = false;
          _stringSink.writeCharCode(value);
        }
      }
      break loop;
    }
    if (expectedUnits > 0) {
      // Remember the incomplete sequence for the next chunk (or for flush).
      _value = value;
      _expectedUnits = expectedUnits;
      _extraUnits = extraUnits;
    }
  }
}
OLD | NEW |