OLD | NEW |
---|---|
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.convert; | 5 part of dart.convert; |
6 | 6 |
7 /** | 7 /** |
8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of | 8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of |
9 * unsigned 8-bit integers). | 9 * unsigned 8-bit integers). |
10 */ | 10 */ |
11 class Utf8Encoder extends Converter<String, List<int>> { | 11 class Utf8Encoder extends Converter<String, List<int>> { |
12 /** | 12 /** |
13 * Converts [string] to its UTF-8 code units (a list of | 13 * Converts [string] to its UTF-8 code units (a list of |
14 * unsigned 8-bit integers). | 14 * unsigned 8-bit integers). |
15 */ | 15 */ |
16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string); | 16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string); |
17 } | 17 } |
18 | 18 |
19 /** | 19 /** |
20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers) | 20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers) |
21 * to a string. | 21 * to a string. |
22 */ | 22 */ |
23 class Utf8Decoder extends Converter<List<int>, String> { | 23 class Utf8Decoder extends Converter<List<int>, String> { |
24 final bool _allowMalformed; | |
25 | |
26 /** | |
27 * Instantiates a new [Utf8Decoder]. | |
28 * | |
29 * The optional [allowMalformed] argument defines how [convert] deals | |
30 * with invalid or unterminated character sequences. | |
31 * | |
32 * If it is `true` [convert] replaces invalid (or unterminated) character | |
33 * sequences with the Unicode Replacement character `0xFFFD` (�). Otherwise | |
Lasse Reichstein Nielsen
2013/07/16 12:23:03
U+FFFD
floitsch
2013/07/16 14:25:24
Done.
| |
34 * it throws a [FormatException]. | |
35 */ | |
36 Utf8Decoder({ bool allowMalformed: false }) | |
37 : this._allowMalformed = allowMalformed; | |
38 | |
24 /** | 39 /** |
25 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 40 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
26 * corresponding string. | 41 * corresponding string. |
27 */ | 42 */ |
28 // TODO(floitsch): allow to configure the decoder (for example the replacement | 43 String convert(List<int> codeUnits) { |
29 // character). | 44 StringBuffer buffer = new StringBuffer(); |
30 String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits); | 45 _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed); |
46 decoder.convert(codeUnits, 0, codeUnits.length, buffer); | |
47 decoder.close(buffer); | |
48 return buffer.toString(); | |
49 } | |
31 } | 50 } |
51 | |
52 // UTF-8 constants. | |
53 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits | |
54 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits | |
55 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits | |
56 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. | |
57 | |
58 // UTF-16 constants. | |
59 const int _SURROGATE_MASK = 0xF800; | |
60 const int _SURROGATE_TAG_MASK = 0xFC00; | |
61 const int _SURROGATE_VALUE_MASK = 0x3FF; | |
62 const int _LEAD_SURROGATE_MIN = 0xD800; | |
63 const int _TAIL_SURROGATE_MIN = 0xDC00; | |
64 | |
65 const int _REPLACEMENT_CHARACTER = 0xFFFD; | |
66 | |
67 bool _isSurrogate(int codeUnit) => | |
68 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN; | |
69 bool _isLeadSurrogate(int codeUnit) => | |
70 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; | |
71 bool _isTailSurrogate(int codeUnit) => | |
72 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; | |
73 int _combineSurrogatePair(int lead, int tail) => | |
74 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10) | |
75 | (tail & _SURROGATE_VALUE_MASK); | |
76 | |
77 | |
78 /** | |
79 * Decodes utf-8. | |
Lasse Reichstein Nielsen
2013/07/16 12:23:03
UTF-8.
floitsch
2013/07/16 14:25:24
Done.
| |
80 * | |
81 * The decoder handles chunked input. | |
82 */ | |
83 // TODO(floitsch): do we want to make this class public? | |
Lasse Reichstein Nielsen
2013/07/16 12:23:03
Sure, why not?
floitsch
2013/07/16 14:25:24
Later. But rephrased TODO.
| |
84 class _Utf8Decoder { | |
85 final bool _allowMalformed; | |
86 int _value = 0; | |
87 int _expectedUnits = 0; | |
88 int _extraUnits = 0; | |
89 | |
90 _Utf8Decoder(this._allowMalformed); | |
91 | |
92 bool get hasPartialInput => _expectedUnits > 0; | |
93 | |
94 // Limits of one through four byte encodings. | |
95 static const List<int> _LIMITS = const <int>[ | |
96 _ONE_BYTE_LIMIT, | |
97 _TWO_BYTE_LIMIT, | |
98 _THREE_BYTE_LIMIT, | |
99 _FOUR_BYTE_LIMIT ]; | |
100 | |
101 void close(StringSink sink) { | |
102 if (hasPartialInput) { | |
103 _throwIfNecessary("Unfinished UTF-8 encoding"); | |
104 sink.writeCharCode(_REPLACEMENT_CHARACTER); | |
105 } | |
106 } | |
107 | |
108 void convert(List<int> codeUnits, int startIndex, int endIndex, | |
109 StringSink sink) { | |
110 int value = _value; | |
111 int expectedUnits = _expectedUnits; | |
112 int extraUnits = _extraUnits; | |
113 _value = 0; | |
114 _expectedUnits = 0; | |
115 _extraUnits = 0; | |
116 | |
117 int i = startIndex; | |
118 loop: while (true) { | |
119 multibyte: if (expectedUnits > 0) { | |
120 do { | |
121 if (i == endIndex) { | |
122 break loop; | |
123 } | |
124 int unit = codeUnits[i]; | |
125 if ((unit & 0xC0) != 0x80) { | |
126 expectedUnits = 0; | |
127 _throwIfNecessary( | |
128 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | |
129 sink.writeCharCode(_REPLACEMENT_CHARACTER); | |
130 break multibyte; | |
131 } else { | |
132 value = (value << 6) | (unit & 0x3f); | |
133 expectedUnits--; | |
134 i++; | |
135 } | |
136 } while (expectedUnits > 0); | |
137 if (value <= _LIMITS[extraUnits - 1]) { | |
138 // Overly long encoding. The value could be encoded with a shorter | |
139 // encoding. | |
140 _throwIfNecessary( | |
141 "Overlong encoding of 0x${value.toRadixString(16)}"); | |
142 value = _REPLACEMENT_CHARACTER; | |
143 } | |
144 sink.writeCharCode(value); | |
145 } | |
146 | |
147 while (i < endIndex) { | |
148 int unit = codeUnits[i++]; | |
149 if (unit <= _ONE_BYTE_LIMIT) { | |
150 sink.writeCharCode(unit); | |
151 } else { | |
152 if ((unit & 0xE0) == 0xC0) { | |
153 value = unit & 0x1F; | |
154 expectedUnits = extraUnits = 1; | |
155 continue loop; | |
156 } | |
157 if ((unit & 0xF0) == 0xE0) { | |
158 value = unit & 0x0F; | |
159 expectedUnits = extraUnits = 2; | |
160 continue loop; | |
161 } | |
162 if ((unit & 0xF8) == 0xF0) { | |
163 value = unit & 0x07; | |
164 expectedUnits = extraUnits = 3; | |
165 continue loop; | |
166 } | |
167 _throwIfNecessary("Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | |
Lasse Reichstein Nielsen
2013/07/16 12:23:03
Seems inefficient to create the string and not use it.
floitsch
2013/07/16 14:25:24
inlined.
| |
168 value = _REPLACEMENT_CHARACTER; | |
169 expectedUnits = extraUnits = 0; | |
170 sink.writeCharCode(value); | |
171 } | |
172 } | |
173 break loop; | |
174 } | |
175 if (expectedUnits > 0) { | |
176 _value = value; | |
177 _expectedUnits = expectedUnits; | |
178 _extraUnits = extraUnits; | |
179 } | |
180 } | |
181 | |
182 void _throwIfNecessary(String message) { | |
183 if (!_allowMalformed) { | |
184 throw new FormatException(message); | |
185 } | |
186 } | |
187 } | |
OLD | NEW |