Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 part of dart.convert; | 5 part of dart.convert; |
| 6 | 6 |
| 7 /** | 7 /** |
| 8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of | 8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of |
| 9 * unsigned 8-bit integers). | 9 * unsigned 8-bit integers). |
| 10 */ | 10 */ |
| 11 class Utf8Encoder extends Converter<String, List<int>> { | 11 class Utf8Encoder extends Converter<String, List<int>> { |
| 12 /** | 12 /** |
| 13 * Converts [string] to its UTF-8 code units (a list of | 13 * Converts [string] to its UTF-8 code units (a list of |
| 14 * unsigned 8-bit integers). | 14 * unsigned 8-bit integers). |
| 15 */ | 15 */ |
| 16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string); | 16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string); |
| 17 } | 17 } |
| 18 | 18 |
| 19 /** | 19 /** |
| 20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers) | 20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers) |
| 21 * to a string. | 21 * to a string. |
| 22 */ | 22 */ |
| 23 class Utf8Decoder extends Converter<List<int>, String> { | 23 class Utf8Decoder extends Converter<List<int>, String> { |
| 24 final bool _allowMalformed; | |
| 25 | |
| 26 /** | |
| 27 * Instantiates a new [Utf8Decoder]. | |
| 28 * | |
| 29 * The optional [allowMalformed] argument defines how [convert] deals | |
| 30 * with invalid or unterminated character sequences. | |
| 31 * | |
| 32 * If it is `true` [convert] replaces invalid (or unterminated) character | |
| 33 * sequences with the Unicode Replacement character `0xFFFD` (�). Otherwise | |
|
Lasse Reichstein Nielsen
2013/07/16 12:23:03
U+FFFD
floitsch
2013/07/16 14:25:24
Done.
| |
| 34 * it throws a [FormatException]. | |
| 35 */ | |
| 36 Utf8Decoder({ bool allowMalformed: false }) | |
| 37 : this._allowMalformed = allowMalformed; | |
| 38 | |
| 24 /** | 39 /** |
| 25 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 40 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
| 26 * corresponding string. | 41 * corresponding string. |
| 27 */ | 42 */ |
| 28 // TODO(floitsch): allow to configure the decoder (for example the replacement | 43 String convert(List<int> codeUnits) { |
| 29 // character). | 44 StringBuffer buffer = new StringBuffer(); |
| 30 String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits); | 45 _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed); |
| 46 decoder.convert(codeUnits, 0, codeUnits.length, buffer); | |
| 47 decoder.close(buffer); | |
| 48 return buffer.toString(); | |
| 49 } | |
| 31 } | 50 } |
| 51 | |
| 52 // UTF-8 constants. | |
| 53 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits | |
| 54 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits | |
| 55 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits | |
| 56 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. | |
| 57 | |
| 58 // UTF-16 constants. | |
| 59 const int _SURROGATE_MASK = 0xF800; | |
| 60 const int _SURROGATE_TAG_MASK = 0xFC00; | |
| 61 const int _SURROGATE_VALUE_MASK = 0x3FF; | |
| 62 const int _LEAD_SURROGATE_MIN = 0xD800; | |
| 63 const int _TAIL_SURROGATE_MIN = 0xDC00; | |
| 64 | |
| 65 const int _REPLACEMENT_CHARACTER = 0xFFFD; | |
| 66 | |
| 67 bool _isSurrogate(int codeUnit) => | |
| 68 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN; | |
| 69 bool _isLeadSurrogate(int codeUnit) => | |
| 70 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; | |
| 71 bool _isTailSurrogate(int codeUnit) => | |
| 72 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; | |
| 73 int _combineSurrogatePair(int lead, int tail) => | |
| 74 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10) | |
| 75 | (tail & _SURROGATE_VALUE_MASK); | |
| 76 | |
| 77 | |
| 78 /** | |
| 79 * Decodes utf-8. | |
|
Lasse Reichstein Nielsen
2013/07/16 12:23:03
UTF-8.
floitsch
2013/07/16 14:25:24
Done.
| |
| 80 * | |
| 81 * The decoder handles chunked input. | |
| 82 */ | |
| 83 // TODO(floitsch): do we want to make this class public? | |
|
Lasse Reichstein Nielsen
2013/07/16 12:23:03
Sure, why not?
floitsch
2013/07/16 14:25:24
Later. But rephrased TODO.
| |
| 84 class _Utf8Decoder { | |
| 85 final bool _allowMalformed; | |
| 86 int _value = 0; | |
| 87 int _expectedUnits = 0; | |
| 88 int _extraUnits = 0; | |
| 89 | |
| 90 _Utf8Decoder(this._allowMalformed); | |
| 91 | |
| 92 bool get hasPartialInput => _expectedUnits > 0; | |
| 93 | |
| 94 // Limits of one through four byte encodings. | |
| 95 static const List<int> _LIMITS = const <int>[ | |
| 96 _ONE_BYTE_LIMIT, | |
| 97 _TWO_BYTE_LIMIT, | |
| 98 _THREE_BYTE_LIMIT, | |
| 99 _FOUR_BYTE_LIMIT ]; | |
| 100 | |
| 101 void close(StringSink sink) { | |
| 102 if (hasPartialInput) { | |
| 103 _throwIfNecessary("Unfinished UTF-8 encoding"); | |
| 104 sink.writeCharCode(_REPLACEMENT_CHARACTER); | |
| 105 } | |
| 106 } | |
| 107 | |
| 108 void convert(List<int> codeUnits, int startIndex, int endIndex, | |
| 109 StringSink sink) { | |
| 110 int value = _value; | |
| 111 int expectedUnits = _expectedUnits; | |
| 112 int extraUnits = _extraUnits; | |
| 113 _value = 0; | |
| 114 _expectedUnits = 0; | |
| 115 _extraUnits = 0; | |
| 116 | |
| 117 int i = startIndex; | |
| 118 loop: while (true) { | |
| 119 multibyte: if (expectedUnits > 0) { | |
| 120 do { | |
| 121 if (i == endIndex) { | |
| 122 break loop; | |
| 123 } | |
| 124 int unit = codeUnits[i]; | |
| 125 if ((unit & 0xC0) != 0x80) { | |
| 126 expectedUnits = 0; | |
| 127 _throwIfNecessary( | |
| 128 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | |
| 129 sink.writeCharCode(_REPLACEMENT_CHARACTER); | |
| 130 break multibyte; | |
| 131 } else { | |
| 132 value = (value << 6) | (unit & 0x3f); | |
| 133 expectedUnits--; | |
| 134 i++; | |
| 135 } | |
| 136 } while (expectedUnits > 0); | |
| 137 if (value <= _LIMITS[extraUnits - 1]) { | |
| 138 // Overly long encoding. The value could be encoded with a shorter | |
| 139 // encoding. | |
| 140 _throwIfNecessary( | |
| 141 "Overlong encoding of 0x${value.toRadixString(16)}"); | |
| 142 value = _REPLACEMENT_CHARACTER; | |
| 143 } | |
| 144 sink.writeCharCode(value); | |
| 145 } | |
| 146 | |
| 147 while (i < endIndex) { | |
| 148 int unit = codeUnits[i++]; | |
| 149 if (unit <= _ONE_BYTE_LIMIT) { | |
| 150 sink.writeCharCode(unit); | |
| 151 } else { | |
| 152 if ((unit & 0xE0) == 0xC0) { | |
| 153 value = unit & 0x1F; | |
| 154 expectedUnits = extraUnits = 1; | |
| 155 continue loop; | |
| 156 } | |
| 157 if ((unit & 0xF0) == 0xE0) { | |
| 158 value = unit & 0x0F; | |
| 159 expectedUnits = extraUnits = 2; | |
| 160 continue loop; | |
| 161 } | |
| 162 if ((unit & 0xF8) == 0xF0) { | |
| 163 value = unit & 0x07; | |
| 164 expectedUnits = extraUnits = 3; | |
| 165 continue loop; | |
| 166 } | |
| 167 _throwIfNecessary("Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | |
|
Lasse Reichstein Nielsen
2013/07/16 12:23:03
Seems inefficient to create the string and not use it.
floitsch
2013/07/16 14:25:24
inlined.
| |
| 168 value = _REPLACEMENT_CHARACTER; | |
| 169 expectedUnits = extraUnits = 0; | |
| 170 sink.writeCharCode(value); | |
| 171 } | |
| 172 } | |
| 173 break loop; | |
| 174 } | |
| 175 if (expectedUnits > 0) { | |
| 176 _value = value; | |
| 177 _expectedUnits = expectedUnits; | |
| 178 _extraUnits = extraUnits; | |
| 179 } | |
| 180 } | |
| 181 | |
| 182 void _throwIfNecessary(String message) { | |
| 183 if (!_allowMalformed) { | |
| 184 throw new FormatException(message); | |
| 185 } | |
| 186 } | |
| 187 } | |
| OLD | NEW |