Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(194)

Side by Side Diff: pkg/dev_compiler/tool/input_sdk/lib/convert/utf.dart

Issue 2698353003: unfork DDC's copy of most SDK libraries (Closed)
Patch Set: revert core_patch Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 part of dart.convert;
6
/**
 * Rune value of the Unicode Replacement character `U+FFFD` (�).
 *
 * The decoder in this library writes this rune in place of malformed input
 * when it is allowed to accept malformed input.
 */
const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;

/**
 * Rune value of the Unicode Byte Order Marker (BOM) character `U+FEFF`.
 */
const int UNICODE_BOM_CHARACTER_RUNE = 0xFEFF;
12
/**
 * The shared, default-configured [Utf8Codec] instance.
 *
 * Use this constant for the most common UTF-8 encoding and decoding needs
 * instead of creating a codec of your own.
 *
 * Examples:
 *
 *     var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");
 *     var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,
 *                                0x72, 0x67, 0x72, 0xc3, 0xb8, 0x64]);
 */
const Utf8Codec UTF8 = const Utf8Codec();
26
/**
 * Codec that turns strings into UTF-8 code units (bytes) and UTF-8 code
 * units back into strings.
 */
class Utf8Codec extends Encoding {
  /** Codec-wide default for how decoding treats malformed input. */
  final bool _allowMalformed;

  /**
   * Creates a [Utf8Codec].
   *
   * The optional [allowMalformed] argument selects what [decoder] (and
   * [decode], unless overridden per call) do with invalid or unterminated
   * character sequences:
   *
   * * `true`: the offending octet sequence is replaced with the Unicode
   *   Replacement character `U+FFFD` (�).
   * * `false` (the default): a [FormatException] is thrown.
   */
  const Utf8Codec({bool allowMalformed: false})
      : _allowMalformed = allowMalformed;

  String get name => "utf-8";

  /**
   * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) into
   * the string they represent.
   *
   * A leading [UNICODE_BOM_CHARACTER_RUNE] in [codeUnits] is discarded.
   *
   * When [allowMalformed] is `true`, invalid (or unterminated) character
   * sequences are replaced with the Unicode Replacement character `U+FFFD`
   * (�); when it is `false`, a [FormatException] is thrown instead.
   *
   * When [allowMalformed] is omitted, the value this codec was constructed
   * with is used.
   */
  String decode(List<int> codeUnits, {bool allowMalformed}) {
    // An explicit per-call argument wins over the codec-wide setting.
    final bool lenient =
        (allowMalformed != null) ? allowMalformed : _allowMalformed;
    final Utf8Decoder converter = new Utf8Decoder(allowMalformed: lenient);
    return converter.convert(codeUnits);
  }

  Utf8Encoder get encoder => const Utf8Encoder();

  Utf8Decoder get decoder => new Utf8Decoder(allowMalformed: _allowMalformed);
}
74
/**
 * Converter from strings to their UTF-8 code units (a list of unsigned
 * 8-bit integers).
 */
class Utf8Encoder extends Converter<String, List<int>> {
  const Utf8Encoder();

  /**
   * Returns the UTF-8 code units (a list of unsigned 8-bit integers) of
   * [string].
   *
   * When [start] and [end] are given, only `string.substring(start, end)`
   * is converted. [end] defaults to the length of [string].
   *
   * A lead surrogate that is not followed by a tail surrogate is encoded on
   * its own as three bytes.
   */
  List<int> convert(String string, [int start = 0, int end]) {
    final int inputLength = string.length;
    RangeError.checkValidRange(start, end, inputLength);
    if (end == null) end = inputLength;
    final int unitCount = end - start;
    if (unitCount == 0) return new Uint8List(0);

    // Size the buffer for the worst case so it can never fill up: one code
    // unit takes at most 3 bytes, and a surrogate pair (two code units)
    // takes at most 4.
    final _Utf8Encoder worker = new _Utf8Encoder.withBufferSize(unitCount * 3);
    final int stoppedAt = worker._fillBuffer(string, start, end);
    assert(stoppedAt >= end - 1);
    if (stoppedAt != end) {
      // The last code unit was skipped, which only happens when it is a
      // lead surrogate. Encode that lead surrogate by itself.
      final int danglingLead = string.codeUnitAt(end - 1);
      assert(_isLeadSurrogate(danglingLead));
      // Passing a non-surrogate (0) as the next unit makes _writeSurrogate
      // emit only the lead surrogate.
      final bool paired = worker._writeSurrogate(danglingLead, 0);
      assert(!paired);
    }
    return worker._buffer.sublist(0, worker._bufferIndex);
  }

  /**
   * Starts a chunked conversion that writes its output to [sink].
   *
   * A [sink] that is not already a [ByteConversionSink] is wrapped in one;
   * passing a [ByteConversionSink] directly is therefore more efficient.
   */
  StringConversionSink startChunkedConversion(Sink<List<int>> sink) {
    final ByteConversionSink byteSink = (sink is ByteConversionSink)
        ? sink
        : new ByteConversionSink.from(sink);
    return new _Utf8EncoderSink(byteSink);
  }

  // Overrides the base class's bind only to expose a more precise type.
  Stream<List<int>> bind(Stream<String> stream) {
    return super.bind(stream);
  }
}
131
/**
 * Encodes Strings to UTF-8 code units (unsigned 8 bit integers), writing
 * them into a fixed-size byte buffer.
 */
// TODO(floitsch): make this class public.
class _Utf8Encoder {
  // Not used by this class itself; _Utf8EncoderSink keeps a pending lead
  // surrogate here between slices (0 means nothing is pending).
  int _carry = 0;

  // Index in [_buffer] at which the next byte is written.
  int _bufferIndex = 0;

  final List<int> _buffer;

  static const _DEFAULT_BYTE_BUFFER_SIZE = 1024;

  _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE);

  _Utf8Encoder.withBufferSize(int bufferSize)
      : _buffer = _createBuffer(bufferSize);

  /**
   * Creates the byte buffer; a single place where an implementation can
   * choose the most efficient storage for bytes.
   */
  static List<int> _createBuffer(int size) => new Uint8List(size);

  /**
   * Writes [leadingSurrogate] to [_buffer], combined with [nextCodeUnit]
   * when that is a tail surrogate.
   *
   * Returns true when [nextCodeUnit] was consumed as part of the pair (four
   * bytes are written). Returns false when [nextCodeUnit] is not a tail
   * surrogate; then only the lead surrogate is written (three bytes) and
   * [nextCodeUnit] still has to be encoded by the caller.
   *
   * Passing 0 for [nextCodeUnit] is safe and writes just the lead surrogate.
   *
   * This method does not check the remaining buffer space; callers do.
   */
  bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {
    if (!_isTailSurrogate(nextCodeUnit)) {
      // TODO(floitsch): allow to throw on malformed strings.
      // The half-surrogate is encoded directly. The result is invalid
      // UTF-8, but the input was invalid UTF-16 to begin with.

      // A surrogate always takes 3 bytes in UTF-8.
      _buffer[_bufferIndex++] = 0xE0 | (leadingSurrogate >> 12);
      _buffer[_bufferIndex++] = 0x80 | ((leadingSurrogate >> 6) & 0x3f);
      _buffer[_bufferIndex++] = 0x80 | (leadingSurrogate & 0x3f);
      return false;
    }

    final int codePoint = _combineSurrogatePair(leadingSurrogate, nextCodeUnit);
    // A rune that needs 2 UTF-16 code units always needs 4 bytes in UTF-8.
    assert(codePoint > _THREE_BYTE_LIMIT);
    assert(codePoint <= _FOUR_BYTE_LIMIT);
    _buffer[_bufferIndex++] = 0xF0 | (codePoint >> 18);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 12) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 6) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | (codePoint & 0x3f);
    return true;
  }

  /**
   * Encodes as much of `str[start..end)` into [_buffer] as fits.
   *
   * A lead surrogate in the last position is never encoded here; the caller
   * has to take care of it.
   *
   * Returns the index in [str] of the first code unit that has not been
   * encoded.
   */
  int _fillBuffer(String str, int start, int end) {
    if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) {
      // Leave a trailing lead surrogate to the caller.
      end--;
    }
    int position = start;
    while (position < end) {
      final int unit = str.codeUnitAt(position);
      if (unit <= _ONE_BYTE_LIMIT) {
        // ASCII is represented identically in UTF-8 and UTF-16.
        if (_bufferIndex >= _buffer.length) break;
        _buffer[_bufferIndex++] = unit;
      } else if (_isLeadSurrogate(unit)) {
        if (_bufferIndex + 3 >= _buffer.length) break;
        // Reading the following code unit is safe: [end] was decremented
        // above if the last code unit was a lead surrogate, so this one is
        // not the last code unit of the range.
        final bool paired =
            _writeSurrogate(unit, str.codeUnitAt(position + 1));
        if (paired) position++;
      } else if (unit <= _TWO_BYTE_LIMIT) {
        if (_bufferIndex + 1 >= _buffer.length) break;
        _buffer[_bufferIndex++] = 0xC0 | (unit >> 6);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      } else {
        assert(unit <= _THREE_BYTE_LIMIT);
        if (_bufferIndex + 2 >= _buffer.length) break;
        _buffer[_bufferIndex++] = 0xE0 | (unit >> 12);
        _buffer[_bufferIndex++] = 0x80 | ((unit >> 6) & 0x3f);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      }
      position++;
    }
    return position;
  }
}
236
/**
 * Chunked-conversion sink that encodes string slices to UTF-8 code units
 * (unsigned 8-bit integers) and passes them on to a [ByteConversionSink].
 */
class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {
  final ByteConversionSink _sink;

  _Utf8EncoderSink(this._sink);

  void close() {
    if (_carry == 0) {
      _sink.close();
      return;
    }
    // A lead surrogate is still pending. addSlice flushes it and then calls
    // close again, at which point the carry is 0.
    addSlice("", 0, 0, true);
  }

  void addSlice(String str, int start, int end, bool isLast) {
    _bufferIndex = 0;

    final bool sliceIsEmpty = (start == end);
    if (sliceIsEmpty && !isLast) {
      return;
    }

    if (_carry != 0) {
      // Try to pair the lead surrogate left over from the previous slice
      // with the first code unit of this one.
      int following = 0;
      if (sliceIsEmpty) {
        assert(isLast);
      } else {
        following = str.codeUnitAt(start);
      }
      final bool merged = _writeSurrogate(_carry, following);
      // A pair can only have been formed if the slice was non-empty.
      assert(!merged || !sliceIsEmpty);
      if (merged) start++;
      _carry = 0;
    }
    do {
      start = _fillBuffer(str, start, end);
      final bool isLastSlice = isLast && (start == end);
      final bool stoppedOnTrailingLead =
          start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start));
      if (stoppedOnTrailingLead) {
        final int lead = str.codeUnitAt(start);
        if (isLast && _bufferIndex < _buffer.length - 3) {
          // The buffer still has room for the incomplete surrogate. A
          // non-surrogate second argument makes _writeSurrogate emit only
          // the surrogate half.
          final bool combined = _writeSurrogate(lead, 0);
          assert(!combined);
        } else {
          // Keep it as the carry instead. When isLast is true, close
          // flushes the carry.
          _carry = lead;
        }
        start++;
      }
      _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice);
      _bufferIndex = 0;
    } while (start < end);
    if (isLast) close();
  }

  // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it
  // needs to deal with malformed input.
}
302
/**
 * Converter from UTF-8 code units (lists of unsigned 8-bit integers) to a
 * string.
 */
class Utf8Decoder extends Converter<List<int>, String> {
  final bool _allowMalformed;

  /**
   * Creates a [Utf8Decoder].
   *
   * The optional [allowMalformed] argument selects what [convert] does with
   * invalid or unterminated character sequences: when `true` they are
   * replaced with the Unicode Replacement character `U+FFFD` (�), otherwise
   * (the default) a [FormatException] is thrown.
   */
  const Utf8Decoder({bool allowMalformed: false})
      : this._allowMalformed = allowMalformed;

  /**
   * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) into
   * the string they represent.
   *
   * Only the code units from [start] up to, but not including, [end] are
   * used. When [end] is omitted it defaults to `codeUnits.length`.
   *
   * A leading [UNICODE_BOM_CHARACTER_RUNE] in [codeUnits] is discarded.
   */
  String convert(List<int> codeUnits, [int start = 0, int end]) {
    // Give the implementation a chance to intercept the call and specialize
    // on the type of codeUnits; a null result means it did not.
    final String intercepted =
        _convertIntercepted(_allowMalformed, codeUnits, start, end);
    if (intercepted != null) return intercepted;

    final int inputLength = codeUnits.length;
    RangeError.checkValidRange(start, end, inputLength);
    if (end == null) end = inputLength;
    final StringBuffer output = new StringBuffer();
    final _Utf8Decoder worker = new _Utf8Decoder(output, _allowMalformed);
    worker.convert(codeUnits, start, end);
    worker.close();
    return output.toString();
  }

  /**
   * Starts a chunked conversion that writes its output to [sink].
   *
   * A [sink] that is not already a [StringConversionSink] is wrapped in
   * one; passing a [StringConversionSink] directly is therefore more
   * efficient.
   */
  ByteConversionSink startChunkedConversion(Sink<String> sink) {
    final StringConversionSink stringSink = (sink is StringConversionSink)
        ? sink
        : new StringConversionSink.from(sink);
    return stringSink.asUtf8Sink(_allowMalformed);
  }

  // Overrides the base class's bind only to expose a more precise type.
  Stream<String> bind(Stream<List<int>> stream) {
    return super.bind(stream);
  }

  // Declared external: the body is not part of this file.
  external Converter<List<int>, dynamic/*=T*/> fuse/*<T>*/(
      Converter<String, dynamic/*=T*/> next);

  // Declared external: the body is not part of this file. [convert] treats
  // a null return value as "not intercepted" and decodes the input itself.
  external static String _convertIntercepted(
      bool allowMalformed, List<int> codeUnits, int start, int end);
}
376
// UTF-8 constants: the largest value that fits in an encoding of one, two,
// three and four bytes.
const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

// UTF-16 constants.
const int _SURROGATE_MASK = 0xF800;
const int _SURROGATE_TAG_MASK = 0xFC00;
const int _SURROGATE_VALUE_MASK = 0x3FF;
const int _LEAD_SURROGATE_MIN = 0xD800;
const int _TAIL_SURROGATE_MIN = 0xDC00;

/** Whether [codeUnit] is a UTF-16 lead surrogate (0xD800 through 0xDBFF). */
bool _isLeadSurrogate(int codeUnit) {
  final int tag = codeUnit & _SURROGATE_TAG_MASK;
  return tag == _LEAD_SURROGATE_MIN;
}

/** Whether [codeUnit] is a UTF-16 tail surrogate (0xDC00 through 0xDFFF). */
bool _isTailSurrogate(int codeUnit) {
  final int tag = codeUnit & _SURROGATE_TAG_MASK;
  return tag == _TAIL_SURROGATE_MIN;
}

/**
 * Returns the rune encoded by the surrogate pair [lead], [tail].
 *
 * Each surrogate contributes its low ten bits; the result is offset by
 * 0x10000.
 */
int _combineSurrogatePair(int lead, int tail) {
  final int highBits = (lead & _SURROGATE_VALUE_MASK) << 10;
  final int lowBits = tail & _SURROGATE_VALUE_MASK;
  // The low ten bits of `0x10000 + highBits` are all zero, so OR-ing the
  // tail bits in is the same as adding them.
  return (0x10000 + highBits) | lowBits;
}
397
/**
 * Decodes UTF-8.
 *
 * The decoder handles chunked input: when [convert] runs out of input in the
 * middle of a multi-byte sequence, the partially decoded state is stored in
 * [_value], [_expectedUnits] and [_extraUnits] and resumed by the next call
 * to [convert].
 *
 * Decoded text is written to the [StringSink] passed to the constructor.
 */
// TODO(floitsch): make this class public.
class _Utf8Decoder {
  /** Whether malformed input is replaced by `U+FFFD` instead of throwing. */
  final bool _allowMalformed;

  /** Receives the decoded characters. */
  final StringSink _stringSink;

  /**
   * Whether the next decoded character is the first one of the output.
   *
   * A decoded [UNICODE_BOM_CHARACTER_RUNE] is dropped only while this is
   * true. [convert] sets it to false as soon as it has handled a character,
   * including a malformed one that was replaced.
   */
  bool _isFirstCharacter = true;

  /** Bits collected so far for an unfinished multi-byte sequence. */
  int _value = 0;

  /** Number of continuation bytes still missing for the current sequence. */
  int _expectedUnits = 0;

  /**
   * Total number of continuation bytes of the current sequence.
   *
   * Used to index [_LIMITS] when checking for overlong encodings.
   */
  int _extraUnits = 0;

  _Utf8Decoder(this._stringSink, this._allowMalformed);

  /** Whether the input seen so far ended inside a multi-byte sequence. */
  bool get hasPartialInput => _expectedUnits > 0;

  // Limits of one through four byte encodings.
  static const List<int> _LIMITS = const <int>[
    _ONE_BYTE_LIMIT,
    _TWO_BYTE_LIMIT,
    _THREE_BYTE_LIMIT,
    _FOUR_BYTE_LIMIT
  ];

  /** Ends the conversion; equivalent to [flush]. */
  void close() {
    flush();
  }

  /**
   * Flushes this decoder as if closed.
   *
   * If the input ended inside a multi-byte sequence, this throws a
   * [FormatException] when the decoder was constructed with
   * `allowMalformed` set to `false`; otherwise it writes a single
   * [UNICODE_REPLACEMENT_CHARACTER_RUNE] and clears the partial state.
   */
  void flush() {
    if (hasPartialInput) {
      if (!_allowMalformed) {
        throw new FormatException("Unfinished UTF-8 octet sequence");
      }
      _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
      _value = 0;
      _expectedUnits = 0;
      _extraUnits = 0;
    }
  }

  /**
   * Decodes `codeUnits[startIndex..endIndex)` and writes the result to the
   * string sink.
   *
   * Continues a multi-byte sequence left unfinished by a previous call, and
   * stores the state of a sequence that is still unfinished at [endIndex].
   *
   * A [UNICODE_BOM_CHARACTER_RUNE] decoded as the very first character is
   * dropped.
   *
   * Malformed input (a missing continuation byte, an overlong encoding, a
   * value above 0x10FFFF, a negative code unit or an invalid lead byte) is
   * replaced by [UNICODE_REPLACEMENT_CHARACTER_RUNE] if the decoder allows
   * malformed input; otherwise a [FormatException] is thrown.
   */
  void convert(List<int> codeUnits, int startIndex, int endIndex) {
    // Work on local copies of the carried-over state and clear the fields.
    // They are written back at the end only if a sequence is still open.
    int value = _value;
    int expectedUnits = _expectedUnits;
    int extraUnits = _extraUnits;
    _value = 0;
    _expectedUnits = 0;
    _extraUnits = 0;

    // Returns how many consecutive units, starting at [from], are in the
    // one-byte range (0 through 0x7f). Negative or larger units end the run.
    int scanOneByteCharacters(units, int from) {
      final to = endIndex;
      final mask = _ONE_BYTE_LIMIT;
      for (var i = from; i < to; i++) {
        final unit = units[i];
        if ((unit & mask) != unit) return i - from;
      }
      return to - from;
    }

    // Writes the one-byte characters in codeUnits[from..to) in one go.
    void addSingleBytes(int from, int to) {
      assert(from >= startIndex && from <= endIndex);
      assert(to >= startIndex && to <= endIndex);
      _stringSink.write(new String.fromCharCodes(codeUnits, from, to));
    }

    int i = startIndex;
    loop: while (true) {
      // Finish the multi-byte sequence that is currently open, if any.
      multibyte: if (expectedUnits > 0) {
        do {
          if (i == endIndex) {
            // Out of input inside a sequence; the state is saved below.
            break loop;
          }
          int unit = codeUnits[i];
          if ((unit & 0xC0) != 0x80) {
            // Not a continuation byte. The unit is not consumed here; the
            // single-byte loop below looks at it again.
            expectedUnits = 0;
            if (!_allowMalformed) {
              throw new FormatException(
                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
            }
            _isFirstCharacter = false;
            _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
            break multibyte;
          } else {
            value = (value << 6) | (unit & 0x3f);
            expectedUnits--;
            i++;
          }
        } while (expectedUnits > 0);
        if (value <= _LIMITS[extraUnits - 1]) {
          // Overly long encoding. The value could be encoded with a shorter
          // encoding.
          if (!_allowMalformed) {
            throw new FormatException(
                "Overlong encoding of 0x${value.toRadixString(16)}");
          }
          expectedUnits = extraUnits = 0;
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
        }
        if (value > _FOUR_BYTE_LIMIT) {
          if (!_allowMalformed) {
            throw new FormatException("Character outside valid Unicode range: "
                "0x${value.toRadixString(16)}");
          }
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
        }
        // Drop a BOM only if it is the very first decoded character.
        if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {
          _stringSink.writeCharCode(value);
        }
        _isFirstCharacter = false;
      }

      while (i < endIndex) {
        int oneBytes = scanOneByteCharacters(codeUnits, i);
        if (oneBytes > 0) {
          _isFirstCharacter = false;
          addSingleBytes(i, i + oneBytes);
          i += oneBytes;
          if (i == endIndex) break;
        }
        int unit = codeUnits[i++];
        // TODO(floitsch): the way we test we could potentially allow
        // units that are too large, if they happen to have the
        // right bit-pattern. (Same is true for the multibyte loop above).
        // TODO(floitsch): optimize this loop. See:
        // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80
        if (unit < 0) {
          // TODO(floitsch): should this be unit <= 0 ?
          if (!_allowMalformed) {
            throw new FormatException(
                "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");
          }
          // The replacement character counts as output, like in the other
          // malformed-input branches. Without this, a BOM that follows the
          // negative unit would wrongly be treated as a leading BOM and
          // dropped.
          _isFirstCharacter = false;
          _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
        } else {
          assert(unit > _ONE_BYTE_LIMIT);
          // Lead byte: remember its payload bits and how many continuation
          // bytes follow, then let the multibyte branch above read them.
          if ((unit & 0xE0) == 0xC0) {
            value = unit & 0x1F;
            expectedUnits = extraUnits = 1;
            continue loop;
          }
          if ((unit & 0xF0) == 0xE0) {
            value = unit & 0x0F;
            expectedUnits = extraUnits = 2;
            continue loop;
          }
          // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
          if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
            value = unit & 0x07;
            expectedUnits = extraUnits = 3;
            continue loop;
          }
          if (!_allowMalformed) {
            throw new FormatException(
                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
          }
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
          expectedUnits = extraUnits = 0;
          _isFirstCharacter = false;
          _stringSink.writeCharCode(value);
        }
      }
      break loop;
    }
    if (expectedUnits > 0) {
      // The input ended inside a sequence: keep the state for the next
      // call to convert (or for flush).
      _value = value;
      _expectedUnits = expectedUnits;
      _extraUnits = extraUnits;
    }
  }
}
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698