OLD | NEW |
| (Empty) |
1 part of dart.convert; | |
/// Code point U+FFFD, written by the decoder in place of malformed input.
const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;

/// Code point U+FEFF (byte order mark); the decoder drops it when it is the
/// first decoded character.
const int UNICODE_BOM_CHARACTER_RUNE = 0xFEFF;

/// Shared [Utf8Codec] instance created with the default arguments.
const Utf8Codec UTF8 = const Utf8Codec();
/// Codec pairing a [Utf8Encoder] with a [Utf8Decoder].
class Utf8Codec extends Encoding {
  /// Default leniency handed to the decoders this codec creates.
  final bool _allowMalformed;

  /// Creates a codec; [allowMalformed] (default `false`) becomes the default
  /// used by [decode] and [decoder].
  const Utf8Codec({bool allowMalformed: false})
      : _allowMalformed = allowMalformed;

  /// The name of this encoding, always `"utf-8"`.
  String get name {
    return "utf-8";
  }

  /// Decodes [codeUnits] with a fresh [Utf8Decoder].
  ///
  /// When [allowMalformed] is omitted (null) the value given to the
  /// constructor is used instead.
  String decode(List<int> codeUnits, {bool allowMalformed}) {
    final bool lenient =
        (allowMalformed != null) ? allowMalformed : _allowMalformed;
    final Utf8Decoder utf8Decoder = new Utf8Decoder(allowMalformed: lenient);
    return utf8Decoder.convert(codeUnits);
  }

  /// Returns a new [Utf8Encoder] on every access.
  Utf8Encoder get encoder {
    return new Utf8Encoder();
  }

  /// Returns a new [Utf8Decoder] configured with this codec's leniency.
  Utf8Decoder get decoder => new Utf8Decoder(allowMalformed: _allowMalformed);
}
/// Converts strings (UTF-16 code units) into lists of UTF-8 bytes.
class Utf8Encoder extends Converter<String, List<int>> {
  const Utf8Encoder();

  /// Encodes the code units of [string] in the range [start]..[end].
  ///
  /// The range is checked with [RangeError.checkValidRange]; a null [end]
  /// means "up to the end of the string". An empty range yields an empty
  /// [Uint8List]. Otherwise the result is a sub-list of a buffer sized at
  /// three bytes per input code unit.
  List<int> convert(String string, [int start = 0, int end]) {
    final int sourceLength = string.length;
    RangeError.checkValidRange(start, end, sourceLength);
    if (end == null) {
      end = sourceLength;
    }
    if (end == start) {
      return new Uint8List(0);
    }

    final _Utf8Encoder worker =
        new _Utf8Encoder.withBufferSize((end - start) * 3);
    final int consumed = worker._fillBuffer(string, start, end);
    assert(consumed >= end - 1);

    if (consumed != end) {
      // _fillBuffer leaves a trailing lead surrogate to its caller. Passing 0
      // (not a tail surrogate) makes _writeSurrogate emit it on its own.
      final int trailing = string.codeUnitAt(end - 1);
      assert(_isLeadSurrogate(trailing));
      final bool paired = worker._writeSurrogate(trailing, 0);
      assert(!paired);
    }
    return worker._buffer.sublist(0, worker._bufferIndex);
  }

  /// Starts a chunked conversion that forwards encoded bytes to [sink].
  ///
  /// A [sink] that is not already a [ByteConversionSink] is wrapped with
  /// [ByteConversionSink.from] first.
  StringConversionSink startChunkedConversion(Sink<List<int>> sink) {
    if (!(sink is ByteConversionSink)) {
      sink = new ByteConversionSink.from(sink);
    }
    // NOTE(review): DEVC$RT is not defined in this file; the call looks like
    // a runtime-checked cast of `sink` to ByteConversionSink - confirm.
    return new _Utf8EncoderSink(DEVC$RT.cast(
        sink,
        DEVC$RT.type((Sink<List<int>> _) {}),
        ByteConversionSink,
        "ImplicitCast",
        """line 125, column 33 of dart:convert/utf.dart: """,
        sink is ByteConversionSink,
        true));
  }

  /// Delegates to the superclass implementation.
  Stream<List<int>> bind(Stream<String> stream) {
    return super.bind(stream);
  }
}
/// Encodes UTF-16 code units as UTF-8 bytes into a fixed-size buffer.
class _Utf8Encoder {
  /// Not touched by this class itself; [_Utf8EncoderSink] stores a pending
  /// lead surrogate here between slices (0 means nothing is pending).
  int _carry = 0;

  /// Index of the next free slot in [_buffer].
  int _bufferIndex = 0;

  /// Output buffer, created by [_createBuffer].
  final List<int> _buffer;

  static const _DEFAULT_BYTE_BUFFER_SIZE = 1024;

  /// Creates an encoder with a buffer of [_DEFAULT_BYTE_BUFFER_SIZE] bytes.
  _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE);

  /// Creates an encoder with a buffer of [bufferSize] bytes.
  _Utf8Encoder.withBufferSize(int bufferSize)
      : _buffer = _createBuffer(bufferSize);

  /// Allocates the byte buffer.
  static List<int> _createBuffer(int size) {
    return new Uint8List(size);
  }

  /// Writes [leadingSurrogate] to the buffer, merged with [nextCodeUnit] when
  /// the latter is a tail surrogate.
  ///
  /// Returns `true` when both code units were combined into one four-byte
  /// sequence, and `false` when [leadingSurrogate] was written alone as a
  /// three-byte sequence. No buffer-space check is made here; callers must
  /// ensure there is room.
  bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {
    if (!_isTailSurrogate(nextCodeUnit)) {
      // Unpaired lead surrogate: emit it by itself using three bytes.
      _buffer[_bufferIndex++] = 0xE0 | (leadingSurrogate >> 12);
      _buffer[_bufferIndex++] = 0x80 | ((leadingSurrogate >> 6) & 0x3f);
      _buffer[_bufferIndex++] = 0x80 | (leadingSurrogate & 0x3f);
      return false;
    }

    final int codePoint = _combineSurrogatePair(leadingSurrogate, nextCodeUnit);
    assert(codePoint > _THREE_BYTE_LIMIT);
    assert(codePoint <= _FOUR_BYTE_LIMIT);
    _buffer[_bufferIndex++] = 0xF0 | (codePoint >> 18);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 12) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 6) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | (codePoint & 0x3f);
    return true;
  }

  /// Encodes code units of [str] from [start] towards [end] until either the
  /// input is exhausted or the next character no longer fits in the buffer.
  ///
  /// A lead surrogate sitting at `end - 1` is never consumed here; it is left
  /// for the caller. Returns the index of the first code unit that was not
  /// encoded.
  int _fillBuffer(String str, int start, int end) {
    int limit = end;
    if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) {
      limit = end - 1;
    }

    int cursor = start;
    while (cursor < limit) {
      final int unit = str.codeUnitAt(cursor);
      final int room = _buffer.length - _bufferIndex;

      if (unit <= _ONE_BYTE_LIMIT) {
        if (room < 1) break;
        _buffer[_bufferIndex++] = unit;
      } else if (_isLeadSurrogate(unit)) {
        // Room for a full four-byte sequence is required up front.
        if (room < 4) break;
        // Reading cursor + 1 stays within [start, end): a lead surrogate at
        // end - 1 was excluded from the loop range above.
        final int following = str.codeUnitAt(cursor + 1);
        if (_writeSurrogate(unit, following)) {
          cursor++;
        }
      } else if (unit <= _TWO_BYTE_LIMIT) {
        if (room < 2) break;
        _buffer[_bufferIndex++] = 0xC0 | (unit >> 6);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      } else {
        assert(unit <= _THREE_BYTE_LIMIT);
        if (room < 3) break;
        _buffer[_bufferIndex++] = 0xE0 | (unit >> 12);
        _buffer[_bufferIndex++] = 0x80 | ((unit >> 6) & 0x3f);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      }
      cursor++;
    }
    return cursor;
  }
}
/// Chunked UTF-8 encoder that forwards encoded bytes to a
/// [ByteConversionSink].
class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {
  /// Destination for the encoded bytes.
  final ByteConversionSink _sink;

  _Utf8EncoderSink(this._sink);

  /// Closes the underlying sink, first flushing a pending lead surrogate.
  void close() {
    if (_carry == 0) {
      _sink.close();
      return;
    }
    // An empty final slice makes addSlice write out the carried surrogate;
    // addSlice then calls close() again, this time with nothing pending.
    addSlice("", 0, 0, true);
  }

  /// Encodes the code units of [str] in [start]..[end] and passes the bytes
  /// to the sink, one buffer-full at a time.
  ///
  /// When [isLast] is true the sink is closed once the slice is processed.
  void addSlice(String str, int start, int end, bool isLast) {
    _bufferIndex = 0;

    if (start == end && !isLast) {
      return;
    }

    if (_carry != 0) {
      // A lead surrogate was held back from an earlier slice; try to pair it
      // with the first code unit of this slice (0 when the slice is empty).
      int following = 0;
      if (start != end) {
        following = str.codeUnitAt(start);
      } else {
        assert(isLast);
      }
      final bool merged = _writeSurrogate(_carry, following);
      assert(!merged || start != end);
      if (merged) {
        start++;
      }
      _carry = 0;
    }

    do {
      start = _fillBuffer(str, start, end);
      final bool finalChunk = isLast && (start == end);

      if (start == end - 1) {
        final int dangling = str.codeUnitAt(start);
        if (_isLeadSurrogate(dangling)) {
          if (isLast && _bufferIndex < _buffer.length - 3) {
            // Passing 0 (not a tail surrogate) writes the lead on its own.
            final bool combined = _writeSurrogate(dangling, 0);
            assert(!combined);
          } else {
            // Hold it back; the next addSlice or close() deals with it.
            _carry = dangling;
          }
          start++;
        }
      }

      _sink.addSlice(_buffer, 0, _bufferIndex, finalChunk);
      _bufferIndex = 0;
    } while (start < end);

    if (isLast) {
      close();
    }
  }
}
/// Converts lists of UTF-8 code units into strings.
class Utf8Decoder extends Converter<List<int>, String> {
  /// Passed on to every [_Utf8Decoder] / UTF-8 sink this converter creates.
  final bool _allowMalformed;

  /// Creates a decoder; [allowMalformed] defaults to `false`.
  const Utf8Decoder({bool allowMalformed: false})
      : this._allowMalformed = allowMalformed;

  /// Decodes the code units of [codeUnits] in the range [start]..[end].
  ///
  /// The range is checked with [RangeError.checkValidRange]; a null [end]
  /// means "up to the end of the list".
  String convert(List<int> codeUnits, [int start = 0, int end]) {
    final int inputLength = codeUnits.length;
    RangeError.checkValidRange(start, end, inputLength);
    if (end == null) {
      end = inputLength;
    }

    final StringBuffer output = new StringBuffer();
    final _Utf8Decoder worker = new _Utf8Decoder(output, _allowMalformed);
    worker.convert(codeUnits, start, end);
    // close() flushes, which reports or replaces an unfinished sequence.
    worker.close();
    return output.toString();
  }

  /// Starts a chunked conversion that writes decoded text to [sink].
  ///
  /// A [sink] that is not already a [StringConversionSink] is wrapped with
  /// [StringConversionSink.from] first.
  ByteConversionSink startChunkedConversion(Sink<String> sink) {
    if (sink is StringConversionSink) {
      return sink.asUtf8Sink(_allowMalformed);
    }
    return new StringConversionSink.from(sink).asUtf8Sink(_allowMalformed);
  }

  /// Delegates to the superclass implementation.
  Stream<String> bind(Stream<List<int>> stream) {
    return super.bind(stream);
  }

  external Converter<List<int>, dynamic> fuse(Converter<String, dynamic> next);
}
// Largest values representable with one, two, three and four UTF-8 bytes.
const int _ONE_BYTE_LIMIT = 0x7f;
const int _TWO_BYTE_LIMIT = 0x7ff;
const int _THREE_BYTE_LIMIT = 0xffff;
const int _FOUR_BYTE_LIMIT = 0x10ffff;

// Bit masks and range starts used to classify UTF-16 surrogates.
const int _SURROGATE_MASK = 0xF800;
const int _SURROGATE_TAG_MASK = 0xFC00;
const int _SURROGATE_VALUE_MASK = 0x3FF;
const int _LEAD_SURROGATE_MIN = 0xD800;
const int _TAIL_SURROGATE_MIN = 0xDC00;

/// Whether [codeUnit] is a lead or tail surrogate (0xD800..0xDFFF for 16-bit
/// values).
bool _isSurrogate(int codeUnit) {
  final int tag = codeUnit & _SURROGATE_MASK;
  return tag == _LEAD_SURROGATE_MIN;
}

/// Whether [codeUnit] is a lead surrogate (0xD800..0xDBFF for 16-bit values).
bool _isLeadSurrogate(int codeUnit) {
  final int tag = codeUnit & _SURROGATE_TAG_MASK;
  return tag == _LEAD_SURROGATE_MIN;
}

/// Whether [codeUnit] is a tail surrogate (0xDC00..0xDFFF for 16-bit values).
bool _isTailSurrogate(int codeUnit) {
  final int tag = codeUnit & _SURROGATE_TAG_MASK;
  return tag == _TAIL_SURROGATE_MIN;
}

/// Combines a [lead] and [tail] surrogate into a single code point.
int _combineSurrogatePair(int lead, int tail) {
  final int highBits = (lead & _SURROGATE_VALUE_MASK) << 10;
  final int lowBits = tail & _SURROGATE_VALUE_MASK;
  // `+` binds tighter than `|`, so the sum is formed first and the low ten
  // bits are OR-ed in afterwards.
  return (0x10000 + highBits) | lowBits;
}
/// Incremental UTF-8 decoder that writes decoded characters to a
/// [StringSink].
///
/// A multi-byte sequence may be split across calls to [convert]; the
/// unfinished part is kept in [_value], [_expectedUnits] and [_extraUnits].
class _Utf8Decoder {
  /// When true, malformed input is replaced by U+FFFD instead of throwing a
  /// [FormatException].
  final bool _allowMalformed;

  /// Receives the decoded output.
  final StringSink _stringSink;

  /// True until the first character has been processed. While it is set, a
  /// decoded U+FEFF (byte order mark) is dropped instead of being written.
  bool _isFirstCharacter = true;

  /// Bits collected so far for an unfinished multi-byte sequence.
  int _value = 0;

  /// Number of continuation units still missing from the unfinished sequence.
  int _expectedUnits = 0;

  /// Total number of continuation units of the unfinished sequence.
  int _extraUnits = 0;

  _Utf8Decoder(this._stringSink, this._allowMalformed);

  /// Whether a multi-byte sequence is waiting for more continuation units.
  bool get hasPartialInput => _expectedUnits > 0;

  /// `_LIMITS[n - 1]` is the largest value that fits in a sequence with
  /// `n - 1` continuation units; a decoded value with `n` continuation units
  /// at or below it is an overlong encoding.
  static const List<int> _LIMITS = const <int>[
    _ONE_BYTE_LIMIT,
    _TWO_BYTE_LIMIT,
    _THREE_BYTE_LIMIT,
    _FOUR_BYTE_LIMIT
  ];

  /// Finishes decoding; identical to [flush].
  void close() {
    flush();
  }

  /// Handles an unfinished multi-byte sequence left at the end of the input.
  ///
  /// Throws a [FormatException] unless malformed input is allowed, in which
  /// case U+FFFD is written and the pending state is cleared.
  void flush() {
    if (hasPartialInput) {
      if (!_allowMalformed) {
        throw new FormatException("Unfinished UTF-8 octet sequence");
      }
      _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
      _value = 0;
      _expectedUnits = 0;
      _extraUnits = 0;
    }
  }

  /// Decodes `codeUnits[startIndex..endIndex)` and writes the result to the
  /// sink, resuming any sequence left unfinished by the previous call.
  ///
  /// Only values in 0..255 are treated as UTF-8 code units; anything else is
  /// malformed input. Malformed input throws a [FormatException] unless
  /// malformed input is allowed, in which case U+FFFD is written instead.
  void convert(List<int> codeUnits, int startIndex, int endIndex) {
    // Work on locals and clear the fields; they are written back at the end
    // only if a sequence is still unfinished.
    int value = _value;
    int expectedUnits = _expectedUnits;
    int extraUnits = _extraUnits;
    _value = 0;
    _expectedUnits = 0;
    _extraUnits = 0;

    // Length of the run of units in 0x00..0x7F starting at [from].
    int scanOneByteCharacters(List<int> units, int from) {
      final to = endIndex;
      final mask = _ONE_BYTE_LIMIT;
      for (var i = from; i < to; i++) {
        final unit = units[i];
        // Fails for values above 0x7F and for negative values alike.
        if ((unit & mask) != unit) return i - from;
      }
      return to - from;
    }

    // Writes the one-byte characters in [from, to) in a single call.
    void addSingleBytes(int from, int to) {
      assert(from >= startIndex && from <= endIndex);
      assert(to >= startIndex && to <= endIndex);
      _stringSink.write(new String.fromCharCodes(codeUnits, from, to));
    }

    int i = startIndex;
    loop:
    while (true) {
      multibyte:
      if (expectedUnits > 0) {
        do {
          if (i == endIndex) {
            // Input exhausted mid-sequence; the state is saved below.
            break loop;
          }
          int unit = codeUnits[i];
          // A continuation unit must lie in 0x80..0xBF. The range test (as
          // opposed to masking with 0xC0) also rejects negative values and
          // values above 0xFF, which masking would silently truncate.
          if (unit < 0x80 || unit > 0xBF) {
            expectedUnits = 0;
            if (!_allowMalformed) {
              throw new FormatException(
                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
            }
            _isFirstCharacter = false;
            _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
            // The offending unit is not consumed; it is re-examined below as
            // the possible start of a new character.
            break multibyte;
          } else {
            value = (value << 6) | (unit & 0x3f);
            expectedUnits--;
            i++;
          }
        } while (expectedUnits > 0);
        if (value <= _LIMITS[extraUnits - 1]) {
          // Overlong: the value would have fitted in a shorter sequence.
          if (!_allowMalformed) {
            throw new FormatException(
                "Overlong encoding of 0x${value.toRadixString(16)}");
          }
          expectedUnits = extraUnits = 0;
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
        }
        if (value > _FOUR_BYTE_LIMIT) {
          if (!_allowMalformed) {
            throw new FormatException("Character outside valid Unicode range: "
                "0x${value.toRadixString(16)}");
          }
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
        }
        if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {
          _stringSink.writeCharCode(value);
        }
        _isFirstCharacter = false;
      }

      while (i < endIndex) {
        int oneBytes = scanOneByteCharacters(codeUnits, i);
        if (oneBytes > 0) {
          _isFirstCharacter = false;
          addSingleBytes(i, i + oneBytes);
          i += oneBytes;
          if (i == endIndex) break;
        }
        int unit = codeUnits[i++];
        if (unit < 0) {
          if (!_allowMalformed) {
            throw new FormatException(
                "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");
          }
          // Output has been produced, so a BOM decoded after this point is
          // no longer the first character and must not be dropped.
          _isFirstCharacter = false;
          _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
        } else {
          assert(unit > _ONE_BYTE_LIMIT);
          // Only real bytes can be lead bytes; without this guard a value
          // such as 0x1C3 would pass the mask tests below as if it were 0xC3.
          final bool isByte = unit <= 0xFF;
          if (isByte && (unit & 0xE0) == 0xC0) {
            value = unit & 0x1F;
            expectedUnits = extraUnits = 1;
            continue loop;
          }
          if (isByte && (unit & 0xF0) == 0xE0) {
            value = unit & 0x0F;
            expectedUnits = extraUnits = 2;
            continue loop;
          }
          // Lead bytes 0xF5..0xF7 are excluded here and handled as bad input.
          if (isByte && (unit & 0xF8) == 0xF0 && unit < 0xF5) {
            value = unit & 0x07;
            expectedUnits = extraUnits = 3;
            continue loop;
          }
          if (!_allowMalformed) {
            throw new FormatException(
                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
          }
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
          expectedUnits = extraUnits = 0;
          _isFirstCharacter = false;
          _stringSink.writeCharCode(value);
        }
      }
      break loop;
    }
    if (expectedUnits > 0) {
      // Remember the unfinished sequence for the next call.
      _value = value;
      _expectedUnits = expectedUnits;
      _extraUnits = extraUnits;
    }
  }
}
OLD | NEW |