OLD | NEW |
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.convert; | 5 part of dart.convert; |
6 | 6 |
7 /** | 7 /** |
8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of | 8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of |
9 * unsigned 8-bit integers). | 9 * unsigned 8-bit integers). |
10 */ | 10 */ |
11 class Utf8Encoder extends Converter<String, List<int>> { | 11 class Utf8Encoder extends Converter<String, List<int>> { |
12 /** | 12 /** |
13 * Converts [string] to its UTF-8 code units (a list of | 13 * Converts [string] to its UTF-8 code units (a list of |
14 * unsigned 8-bit integers); currently delegates to `OLD_UTF_LIB.encodeUtf8`. | 14 * unsigned 8-bit integers); currently delegates to `OLD_UTF_LIB.encodeUtf8`. |
15 */ | 15 */ |
16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string); | 16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string); |
17 } | 17 } |
18 | 18 |
19 /** | 19 /** |
20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers) | 20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers) |
21 * to a string. | 21 * to a string. |
22 */ | 22 */ |
23 class Utf8Decoder extends Converter<List<int>, String> { | 23 class Utf8Decoder extends Converter<List<int>, String> { |
| 24 final bool _allowMalformed; |
| 25 |
| 26 /** |
| 27 * Instantiates a new [Utf8Decoder]. |
| 28 * |
| 29 * The optional [allowMalformed] argument defines how [convert] deals |
| 30 * with invalid or unterminated character sequences. It defaults to `false`. |
| 31 * |
| 32 * If it is `true`, [convert] replaces invalid (or unterminated) character |
| 33 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise, |
| 34 * it throws a [FormatException]. |
| 35 */ |
| 36 Utf8Decoder({ bool allowMalformed: false }) |
| 37 : this._allowMalformed = allowMalformed; |
| 38 |
24 /** | 39 /** |
25 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 40 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
26 * corresponding string. | 41 * corresponding string. |
27 */ | 42 */ |
28 // TODO(floitsch): allow to configure the decoder (for example the replacement | 43 String convert(List<int> codeUnits) { |
29 // character). | 44 StringBuffer buffer = new StringBuffer(); |
30 String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits); | 45 _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed); |
| 46 decoder.convert(codeUnits, 0, codeUnits.length, buffer); |
| 47 decoder.close(buffer); |
| 48 return buffer.toString(); |
| 49 } |
31 } | 50 } |
| 51 |
| 52 // UTF-8 constants: largest code point encodable with each sequence length. |
| 53 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits |
| 54 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits |
| 55 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
| 56 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
| 57 |
| 58 // UTF-16 surrogate constants (masks and range starts for the helpers below). |
| 59 const int _SURROGATE_MASK = 0xF800; |
| 60 const int _SURROGATE_TAG_MASK = 0xFC00; |
| 61 const int _SURROGATE_VALUE_MASK = 0x3FF; |
| 62 const int _LEAD_SURROGATE_MIN = 0xD800; |
| 63 const int _TAIL_SURROGATE_MIN = 0xDC00; |
| 64 |
| 65 const int _REPLACEMENT_CHARACTER = 0xFFFD; |
| 66 const int _BOM_CHARACTER = 0xFEFF; |
| 67 |
| 68 bool _isSurrogate(int codeUnit) => |
| 69 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN; |
| 70 bool _isLeadSurrogate(int codeUnit) => |
| 71 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; |
| 72 bool _isTailSurrogate(int codeUnit) => |
| 73 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; |
| 74 int _combineSurrogatePair(int lead, int tail) => |
| 75 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10) |
| 76 | (tail & _SURROGATE_VALUE_MASK); |
| 77 |
| 78 |
| 79 /** |
| 80 * Decodes UTF-8. |
| 81 * |
| 82 * The decoder handles chunked input; a partial sequence carries over to the next [convert] call. |
| 83 */ |
| 84 // TODO(floitsch): make this class public. |
| 85 class _Utf8Decoder { |
| 86 final bool _allowMalformed; |
| 87 bool _isFirstCharacter = true; |
| 88 int _value = 0; |
| 89 int _expectedUnits = 0; |
| 90 int _extraUnits = 0; |
| 91 |
| 92 _Utf8Decoder(this._allowMalformed); |
| 93 |
| 94 bool get hasPartialInput => _expectedUnits > 0; |
| 95 |
| 96 // Largest value of a one- through four-byte encoding; indexed by `extraUnits - 1` to detect overlong encodings. |
| 97 static const List<int> _LIMITS = const <int>[ |
| 98 _ONE_BYTE_LIMIT, |
| 99 _TWO_BYTE_LIMIT, |
| 100 _THREE_BYTE_LIMIT, |
| 101 _FOUR_BYTE_LIMIT ]; |
| 102 |
| 103 void close(StringSink sink) { |
| 104 if (hasPartialInput) { |
| 105 if (!_allowMalformed) { |
| 106 throw new FormatException("Unfinished UTF-8 octet sequence"); |
| 107 } |
| 108 sink.writeCharCode(_REPLACEMENT_CHARACTER); |
| 109 } |
| 110 } |
| 111 |
| 112 void convert(List<int> codeUnits, int startIndex, int endIndex, |
| 113 StringSink sink) { |
| 114 int value = _value; |
| 115 int expectedUnits = _expectedUnits; |
| 116 int extraUnits = _extraUnits; |
| 117 _value = 0; |
| 118 _expectedUnits = 0; |
| 119 _extraUnits = 0; |
| 120 |
| 121 int i = startIndex; |
| 122 loop: while (true) { |
| 123 multibyte: if (expectedUnits > 0) { |
| 124 do { |
| 125 if (i == endIndex) { |
| 126 break loop; |
| 127 } |
| 128 int unit = codeUnits[i]; |
| 129 if ((unit & 0xC0) != 0x80) { |
| 130 expectedUnits = 0; |
| 131 if (!_allowMalformed) { |
| 132 throw new FormatException( |
| 133 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
| 134 } |
| 135 _isFirstCharacter = false; |
| 136 sink.writeCharCode(_REPLACEMENT_CHARACTER); |
| 137 break multibyte; |
| 138 } else { |
| 139 value = (value << 6) | (unit & 0x3f); |
| 140 expectedUnits--; |
| 141 i++; |
| 142 } |
| 143 } while (expectedUnits > 0); |
| 144 if (value <= _LIMITS[extraUnits - 1]) { |
| 145 // Overly long encoding. The value could be encoded with a shorter |
| 146 // encoding. |
| 147 if (!_allowMalformed) { |
| 148 throw new FormatException( |
| 149 "Overlong encoding of 0x${value.toRadixString(16)}"); |
| 150 } |
| 151 expectedUnits = extraUnits = 0; |
| 152 value = _REPLACEMENT_CHARACTER; |
| 153 } |
| 154 if (value > _FOUR_BYTE_LIMIT) { |
| 155 if (!_allowMalformed) { |
| 156 throw new FormatException("Character outside valid Unicode range: " |
| 157 "0x${value.toRadixString(16)}"); |
| 158 } |
| 159 value = _REPLACEMENT_CHARACTER; |
| 160 } |
| 161 if (!_isFirstCharacter || value != _BOM_CHARACTER) { |
| 162 sink.writeCharCode(value); |
| 163 } |
| 164 _isFirstCharacter = false; |
| 165 } |
| 166 |
| 167 while (i < endIndex) { |
| 168 int unit = codeUnits[i++]; |
| 169 if (unit <= _ONE_BYTE_LIMIT) { |
| 170 _isFirstCharacter = false; |
| 171 sink.writeCharCode(unit); |
| 172 } else { |
| 173 if ((unit & 0xE0) == 0xC0) { |
| 174 value = unit & 0x1F; |
| 175 expectedUnits = extraUnits = 1; |
| 176 continue loop; |
| 177 } |
| 178 if ((unit & 0xF0) == 0xE0) { |
| 179 value = unit & 0x0F; |
| 180 expectedUnits = extraUnits = 2; |
| 181 continue loop; |
| 182 } |
| 183 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. |
| 184 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { |
| 185 value = unit & 0x07; |
| 186 expectedUnits = extraUnits = 3; |
| 187 continue loop; |
| 188 } |
| 189 if (!_allowMalformed) { |
| 190 throw new FormatException( |
| 191 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
| 192 } |
| 193 value = _REPLACEMENT_CHARACTER; |
| 194 expectedUnits = extraUnits = 0; |
| 195 _isFirstCharacter = false; |
| 196 sink.writeCharCode(value); |
| 197 } |
| 198 } |
| 199 break loop; |
| 200 } |
| 201 if (expectedUnits > 0) { |
| 202 _value = value; |
| 203 _expectedUnits = expectedUnits; |
| 204 _extraUnits = extraUnits; |
| 205 } |
| 206 } |
| 207 } |
OLD | NEW |