sdk/lib/convert/utf.dart - Issue 19187002: Replace old utf8 decoder with new one.

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 19187002: Replace old utf8 decoder with new one. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Add comments. Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of dart.convert;	5 part of dart.convert;

6	6

7 /**	7 /**

8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of	8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of

9 * unsigned 8-bit integers).	9 * unsigned 8-bit integers).

10 */	10 */

11 class Utf8Encoder extends Converter<String, List<int>> {	11 class Utf8Encoder extends Converter<String, List<int>> {

12 /**	12 /**

13 * Converts [string] to its UTF-8 code units (a list of	13 * Converts [string] to its UTF-8 code units (a list of

14 * unsigned 8-bit integers).	14 * unsigned 8-bit integers).

15 */	15 */

16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string);	16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string);

17 }	17 }

18	18

19 /**	19 /**

20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers)	20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers)

21 * to a string.	21 * to a string.

22 */	22 */

23 class Utf8Decoder extends Converter<List<int>, String> {	23 class Utf8Decoder extends Converter<List<int>, String> {

	24 final bool _allowMalformed;

	25

	26 /**

	27 * Instantiates a new [Utf8Decoder].

	28 *

	29 * The optional [allowMalformed] argument defines how [convert] deals

	30 * with invalid or unterminated character sequences.

	31 *

	32 * If it is `true` [convert] replaces invalid (or unterminated) character

	33 * sequences with the Unicode Replacement character `0xFFFD` (�). Otherwise
	Lasse Reichstein Nielsen (2013/07/16 12:23:03): U+FFFD
	floitsch (2013/07/16 14:25:24): Done.
	34 * it throws a [FormatException].

	35 */

	36 Utf8Decoder({ bool allowMalformed: false })

	37 : this._allowMalformed = allowMalformed;

	38

24 /**	39 /**

25 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the	40 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the

26 * corresponding string.	41 * corresponding string.

27 */	42 */

28 // TODO(floitsch): allow to configure the decoder (for example the replacement	43 String convert(List<int> codeUnits) {

29 // character).	44 StringBuffer buffer = new StringBuffer();

30 String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits);	45 _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);

	46 decoder.convert(codeUnits, 0, codeUnits.length, buffer);

	47 decoder.close(buffer);

	48 return buffer.toString();

	49 }

31 }	50 }

	51

	52 // UTF-8 constants.

	53 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits

	54 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits

	55 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits

	56 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

	57

	58 // UTF-16 constants.

	59 const int _SURROGATE_MASK = 0xF800;

	60 const int _SURROGATE_TAG_MASK = 0xFC00;

	61 const int _SURROGATE_VALUE_MASK = 0x3FF;

	62 const int _LEAD_SURROGATE_MIN = 0xD800;

	63 const int _TAIL_SURROGATE_MIN = 0xDC00;

	64

	65 const int _REPLACEMENT_CHARACTER = 0xFFFD;

	66

	67 bool _isSurrogate(int codeUnit) =>

	68 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;

	69 bool _isLeadSurrogate(int codeUnit) =>

	70 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;

	71 bool _isTailSurrogate(int codeUnit) =>

	72 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;

	73 int _combineSurrogatePair(int lead, int tail) =>

	74 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10)

	75 | (tail & _SURROGATE_VALUE_MASK);

	76

	77

	78 /**

	79 * Decodes utf-8.
	Lasse Reichstein Nielsen (2013/07/16 12:23:03): UTF-8.
	floitsch (2013/07/16 14:25:24): Done.
	80 *

	81 * The decoder handles chunked input.

	82 */

	83 // TODO(floitsch): do we want to make this class public?
	Lasse Reichstein Nielsen (2013/07/16 12:23:03): Sure, why not?
	floitsch (2013/07/16 14:25:24): Later. But rephrased TODO.
	84 class _Utf8Decoder {

	85 final bool _allowMalformed;

	86 int _value = 0;

	87 int _expectedUnits = 0;

	88 int _extraUnits = 0;

	89

	90 _Utf8Decoder(this._allowMalformed);

	91

	92 bool get hasPartialInput => _expectedUnits > 0;

	93

	94 // Limits of one through four byte encodings.

	95 static const List<int> _LIMITS = const <int>[

	96 _ONE_BYTE_LIMIT,

	97 _TWO_BYTE_LIMIT,

	98 _THREE_BYTE_LIMIT,

	99 _FOUR_BYTE_LIMIT ];

	100

	101 void close(StringSink sink) {

	102 if (hasPartialInput) {

	103 _throwIfNecessary("Unfinished UTF-8 encoding");

	104 sink.writeCharCode(_REPLACEMENT_CHARACTER);

	105 }

	106 }

	107

	108 void convert(List<int> codeUnits, int startIndex, int endIndex,

	109 StringSink sink) {

	110 int value = _value;

	111 int expectedUnits = _expectedUnits;

	112 int extraUnits = _extraUnits;

	113 _value = 0;

	114 _expectedUnits = 0;

	115 _extraUnits = 0;

	116

	117 int i = startIndex;

	118 loop: while (true) {

	119 multibyte: if (expectedUnits > 0) {

	120 do {

	121 if (i == endIndex) {

	122 break loop;

	123 }

	124 int unit = codeUnits[i];

	125 if ((unit & 0xC0) != 0x80) {

	126 expectedUnits = 0;

	127 _throwIfNecessary(

	128 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");

	129 sink.writeCharCode(_REPLACEMENT_CHARACTER);

	130 break multibyte;

	131 } else {

	132 value = (value << 6) | (unit & 0x3f);

	133 expectedUnits--;

	134 i++;

	135 }

	136 } while (expectedUnits > 0);

	137 if (value <= _LIMITS[extraUnits - 1]) {

	138 // Overly long encoding. The value could be encoded with a shorter

	139 // encoding.

	140 _throwIfNecessary(

	141 "Overlong encoding of 0x${value.toRadixString(16)}");

	142 value = _REPLACEMENT_CHARACTER;

	143 }

	144 sink.writeCharCode(value);

	145 }

	146

	147 while (i < endIndex) {

	148 int unit = codeUnits[i++];

	149 if (unit <= _ONE_BYTE_LIMIT) {

	150 sink.writeCharCode(unit);

	151 } else {

	152 if ((unit & 0xE0) == 0xC0) {

	153 value = unit & 0x1F;

	154 expectedUnits = extraUnits = 1;

	155 continue loop;

	156 }

	157 if ((unit & 0xF0) == 0xE0) {

	158 value = unit & 0x0F;

	159 expectedUnits = extraUnits = 2;

	160 continue loop;

	161 }

	162 if ((unit & 0xF8) == 0xF0) {

	163 value = unit & 0x07;

	164 expectedUnits = extraUnits = 3;

	165 continue loop;

	166 }

	167 _throwIfNecessary("Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
	Lasse Reichstein Nielsen (2013/07/16 12:23:03): Seems inefficient to create the string and not use it. I'd rather inline the _throwIfNecessary, or move the string interpolation into _throwIfNecessary (e.g., only pass the constant string before "0x", and the bad value, as arguments, and build the rest if necessary).
	floitsch (2013/07/16 14:25:24): Inlined.
	168 value = _REPLACEMENT_CHARACTER;

	169 expectedUnits = extraUnits = 0;

	170 sink.writeCharCode(value);

	171 }

	172 }

	173 break loop;

	174 }

	175 if (expectedUnits > 0) {

	176 _value = value;

	177 _expectedUnits = expectedUnits;

	178 _extraUnits = extraUnits;

	179 }

	180 }

	181

	182 void _throwIfNecessary(String message) {

	183 if (!_allowMalformed) {

	184 throw new FormatException(message);

	185 }

	186 }

	187 }

OLD	NEW

« sdk/lib/codec/encoding.dart ('K') | « sdk/lib/codec/encoding.dart ('k') | tests/lib/convert/utf82_test.dart » ('j') | tests/lib/convert/utf82_test.dart » ('J')