OLD | NEW |
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.io; | 5 part of dart.io; |
6 | 6 |
| 7 const SYSTEM_ENCODING = const SystemEncoding(); |
| 8 |
7 /** | 9 /** |
8 * String encodings. | 10 * The system encoding is the current code page on Windows and UTF-8 on |
| 11 * Linux and Mac. |
9 */ | 12 */ |
10 class Encoding { | 13 class SystemEncoding extends Encoding { |
11 static const Encoding UTF_8 = const Encoding._internal("utf-8"); | 14 const SystemEncoding(); |
12 static const Encoding ISO_8859_1 = const Encoding._internal("iso-8859-1"); | |
13 static const Encoding ASCII = const Encoding._internal("us-ascii"); | |
14 | 15 |
15 /** | 16 List<int> encode(String input) => encoder.convert(input); |
16 * SYSTEM encoding is the current code page on Windows and UTF-8 on | 17 String decode(List<int> encoded) => decoder.convert(encoded); |
17 * Linux and Mac. | |
18 */ | |
19 static const Encoding SYSTEM = const Encoding._internal("system"); | |
20 | 18 |
21 // All aliasses (in lowercase) of supported encoding from | 19 Converter<String, List<int>> get encoder { |
22 // http://www.iana.org/assignments/character-sets/character-sets.xml. | 20 if (Platform.operatingSystem == "windows") { |
23 static Map<String, Encoding> _nameToEncoding = <String, Encoding> { | 21 return const _WindowsCodePageEncoder(); |
24 // ISO_8859-1:1987. | 22 } else { |
25 "iso_8859-1:1987": ISO_8859_1, | 23 return const Utf8Encoder(); |
26 "iso-ir-100": ISO_8859_1, | 24 } |
27 "iso_8859-1": ISO_8859_1, | 25 } |
28 "iso-8859-1": ISO_8859_1, | |
29 "latin1": ISO_8859_1, | |
30 "l1": ISO_8859_1, | |
31 "ibm819": ISO_8859_1, | |
32 "cp819": ISO_8859_1, | |
33 "csisolatin1": ISO_8859_1, | |
34 | 26 |
35 // US-ASCII. | 27 Converter<List<int>, String> get decoder { |
36 "iso-ir-6": ASCII, | 28 if (Platform.operatingSystem == "windows") { |
37 "ansi_x3.4-1968": ASCII, | 29 return const _WindowsCodePageDecoder(); |
38 "ansi_x3.4-1986": ASCII, | 30 } else { |
39 "iso_646.irv:1991": ASCII, | 31 return const Utf8Decoder(); |
40 "iso646-us": ASCII, | 32 } |
41 "us-ascii": ASCII, | 33 } |
42 "us": ASCII, | 34 } |
43 "ibm367": ASCII, | |
44 "cp367": ASCII, | |
45 "csascii": ASCII, | |
46 "ascii": ASCII, // This is not in the IANA official names. | |
47 | 35 |
48 // UTF-8. | 36 class _WindowsCodePageEncoder extends Converter<String, List<int>> { |
49 "csutf8": UTF_8, | |
50 "utf-8": UTF_8 | |
51 }; | |
52 | 37 |
53 /** | 38 const _WindowsCodePageEncoder(); |
54 * Gets an [Encoding] object from the name of the character set | 39 |
55 * name. The names used are the IANA official names for the | 40 List<int> convert(String input) { |
56 * character set (see | 41 List<int> encoded = _encodeString(input); |
57 * http://www.iana.org/assignments/character-sets/character-sets.xml). | 42 if (encoded == null) { |
58 * | 43 throw new FormatException("Invalid character for encoding"); |
59 * The [name] passed is case insensitive. | 44 } |
60 * | 45 return encoded; |
61 * If character set is not supported [:null:] is returned. | |
62 */ | |
63 static Encoding fromName(String name) { | |
64 if (name == null) return null; | |
65 name = name.toLowerCase(); | |
66 return _nameToEncoding[name]; | |
67 } | 46 } |
68 | 47 |
69 /** | 48 /** |
70 * Name of the encoding. This will be the lower-case version of one of the | 49 * Starts a chunked conversion. |
71 * IANA official names for the character set (see | |
72 * http://www.iana.org/assignments/character-sets/character-sets.xml) | |
73 */ | 50 */ |
74 final String name; | 51 StringConversionSink startChunkedConversion( |
| 52 ChunkedConversionSink<List<int>> sink) { |
| 53 return new _WindowsCodePageEncoderSink(sink); |
| 54 } |
75 | 55 |
76 const Encoding._internal(String this.name); | 56 // Override the base-class' bind, to provide a better type. |
| 57 Stream<List<int>> bind(Stream<String> stream) => super.bind(stream); |
| 58 |
| 59 external static List<int> _encodeString(String string); |
77 } | 60 } |
78 | 61 |
79 const UTF_8 = Encoding.UTF_8; | 62 class _WindowsCodePageEncoderSink extends StringConversionSinkBase { |
80 const ISO_8859_1 = Encoding.ISO_8859_1; | 63 // TODO(floitsch): provide more efficient conversions when the input is |
81 const ASCII = Encoding.ASCII; | 64 // not a String. |
82 | 65 |
83 /** | 66 final ByteConversionSink _sink; |
84 * Stream transformer that can decode a stream of bytes into a stream of | |
85 * strings using [encoding]. | |
86 * | |
87 * Invalid or forbidden byte-sequences will not produce errors, but will instead | |
88 * insert [replacementChar] in the decoded strings. | |
89 */ | |
90 class StringDecoder implements StreamTransformer<List<int>, String> { | |
91 var _decoder; | |
92 | 67 |
93 static const _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT = 0xFFFD; | 68 _WindowsCodePageEncoderSink(this._sink); |
94 | 69 |
95 /** | 70 void close() { |
96 * Decodes a stream of bytes into a `String` with an optional | 71 _sink.close(); |
97 * [encoding] and [replacementChar]. | |
98 * | |
99 * The default value for [encoding] is [Encoding.UTF_8]. | |
100 * | |
101 * The default value for [replacementChar] is code point U+FFFD. | |
102 * | |
103 * Completes with the decoded `String` when the stream is done. | |
104 */ | |
105 static Future<String> decode( | |
106 Stream<List<int>> stream, | |
107 [Encoding encoding = Encoding.UTF_8, | |
108 int replacementChar = _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | |
109 if (replacementChar != null && | |
110 replacementChar != _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT) { | |
111 throw new UnsupportedError("replacement character must be null or " | |
112 "the Unicode replacement character"); | |
113 } | |
114 return stream | |
115 .transform(new StringDecoder(encoding, replacementChar)) | |
116 .fold( | |
117 new StringBuffer(), | |
118 (prev, data) => prev..write(data)) | |
119 .then((sb) => sb.toString()); | |
120 } | 72 } |
121 | 73 |
122 /** | 74 void add(String string) { |
123 * Create a new [StringDecoder] with an optional [encoding] and | 75 List<int> encoded = _WindowsCodePageEncoder._encodeString(string); |
124 * [replacementChar]. | 76 if (encoded == null) { |
125 * | 77 throw new FormatException("Invalid character for encoding"); |
126 * The default value for [encoding] is [Encoding.UTF_8]. | |
127 * | |
128 * The default value for [replacementChar] is code point U+FFFD. | |
129 */ | |
130 StringDecoder([Encoding encoding = Encoding.UTF_8, int replacementChar]) { | |
131 switch (encoding) { | |
132 case Encoding.UTF_8: | |
133 if (replacementChar != null && | |
134 replacementChar != _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT) { | |
135 throw new UnsupportedError("replacement character must be null or " | |
136 "the Unicode replacement character"); | |
137 } | |
138 _decoder = new Utf8Decoder(allowMalformed: true); | |
139 break; | |
140 case Encoding.ASCII: | |
141 if (replacementChar == null) { | |
142 replacementChar = '?'.codeUnitAt(0); | |
143 } else if (replacementChar > 127) { | |
144 throw new ArgumentError("Invalid replacement character for ASCII"); | |
145 } | |
146 _decoder = new _AsciiDecoder(replacementChar); | |
147 break; | |
148 case Encoding.ISO_8859_1: | |
149 if (replacementChar == null) { | |
150 replacementChar = '?'.codeUnitAt(0); | |
151 } else if (replacementChar > 255) { | |
152 throw new ArgumentError( | |
153 "Invalid replacement character for ISO_8859_1"); | |
154 } | |
155 _decoder = new _Latin1Decoder(replacementChar); | |
156 break; | |
157 case Encoding.SYSTEM: | |
158 if (Platform.operatingSystem == "windows") { | |
159 _decoder = new _WindowsCodePageDecoder(); | |
160 } else { | |
161 if (replacementChar != null) { | |
162 // TODO(ajohnsen): Handle replacement character. | |
163 throw new UnsupportedError( | |
164 "Replacement character is not supported for SYSTEM encoding"); | |
165 } | |
166 _decoder = new Utf8Decoder(allowMalformed: true); | |
167 } | |
168 break; | |
169 default: | |
170 throw new ArgumentError("Unsupported encoding '$encoding'"); | |
171 } | 78 } |
| 79 _sink.add(encoded); |
172 } | 80 } |
173 | 81 |
174 Stream<String> bind(Stream<List<int>> stream) => _decoder.bind(stream); | 82 void addSlice(String source, int start, int end, bool isLast) { |
175 } | 83 if (start != 0 || end != source.length) { |
176 | 84 source = source.substring(start, end); |
177 | |
178 /** | |
179 * Stream transformer that can encode a stream of strings info a stream of | |
180 * bytes using [encoding]. | |
181 * | |
182 * Strings that cannot be represented in the given encoding will result in an | |
183 * error and a close event on the stream. | |
184 */ | |
185 class StringEncoder implements StreamTransformer<String, List<int>> { | |
186 var _encoder; | |
187 | |
188 /** | |
189 * Create a new [StringDecoder] with an optional [encoding] and | |
190 * [replacementChar]. | |
191 */ | |
192 StringEncoder([Encoding encoding = Encoding.UTF_8]) { | |
193 switch (encoding) { | |
194 case Encoding.UTF_8: | |
195 _encoder = new Utf8Encoder(); | |
196 break; | |
197 case Encoding.ASCII: | |
198 _encoder = new _AsciiEncoder(); | |
199 break; | |
200 case Encoding.ISO_8859_1: | |
201 _encoder = new _Latin1Encoder(); | |
202 break; | |
203 case Encoding.SYSTEM: | |
204 if (Platform.operatingSystem == "windows") { | |
205 _encoder = new _WindowsCodePageEncoder(); | |
206 } else { | |
207 _encoder = new Utf8Encoder(); | |
208 } | |
209 break; | |
210 default: | |
211 throw new ArgumentError("Unsupported encoding '$encoding'"); | |
212 } | 85 } |
213 } | 86 add(source); |
214 | 87 if (isLast) close(); |
215 Stream<List<int>> bind(Stream<String> stream) => _encoder.bind(stream); | |
216 } | |
217 | |
218 | |
219 // Utility function to synchronously decode a list of bytes. | |
220 String _decodeString(List<int> bytes, [Encoding encoding = Encoding.UTF_8]) { | |
221 if (bytes.length == 0) return ""; | |
222 if (encoding == Encoding.UTF_8) { | |
223 return UTF8.decode(bytes, allowMalformed: true); | |
224 } | |
225 var string; | |
226 var error; | |
227 var controller = new StreamController(sync: true); | |
228 controller.stream | |
229 .transform(new StringDecoder(encoding)) | |
230 .listen((data) { | |
231 // The StringEncoder decodes every encoding (except UTF-8) in one go. | |
232 assert(string == null); | |
233 string = data; | |
234 }, onError: (e) => error = e); | |
235 controller.add(bytes); | |
236 controller.close(); | |
237 if (error != null) throw error; | |
238 assert(string != null); | |
239 return string; | |
240 } | |
241 | |
242 | |
243 // Utility function to synchronously encode a String. | |
244 // Will throw an exception if the encoding is invalid. | |
245 List<int> _encodeString(String string, [Encoding encoding = Encoding.UTF_8]) { | |
246 if (string.length == 0) return []; | |
247 if (encoding == Encoding.UTF_8) return UTF8.encode(string); | |
248 var bytes; | |
249 var controller = new StreamController(sync: true); | |
250 controller.stream | |
251 .transform(new StringEncoder(encoding)) | |
252 .listen((data) { | |
253 // The StringEncoder encodes every encoding (except UTF-8) in one go. | |
254 assert(bytes == null); | |
255 bytes = data; | |
256 }); | |
257 controller.add(string); | |
258 controller.close(); | |
259 assert(bytes != null); | |
260 return bytes; | |
261 } | |
262 | |
263 | |
264 abstract class _SingleByteDecoder | |
265 extends StreamEventTransformer<List<int>, String> { | |
266 final int _replacementChar; | |
267 | |
268 _SingleByteDecoder(this._replacementChar); | |
269 | |
270 void handleData(List<int> data, EventSink<String> sink) { | |
271 var buffer = new List<int>(data.length); | |
272 for (int i = 0; i < data.length; i++) { | |
273 int char = _decodeByte(data[i]); | |
274 if (char < 0) char = _replacementChar; | |
275 buffer[i] = char; | |
276 } | |
277 sink.add(new String.fromCharCodes(buffer)); | |
278 } | |
279 | |
280 int _decodeByte(int byte); | |
281 } | |
282 | |
283 | |
284 // Utility class for decoding ascii data delivered as a stream of | |
285 // bytes. | |
286 class _AsciiDecoder extends _SingleByteDecoder { | |
287 _AsciiDecoder(int replacementChar) : super(replacementChar); | |
288 | |
289 int _decodeByte(int byte) => ((byte & 0x7f) == byte) ? byte : -1; | |
290 } | |
291 | |
292 | |
293 // Utility class for decoding Latin-1 data delivered as a stream of | |
294 // bytes. | |
295 class _Latin1Decoder extends _SingleByteDecoder { | |
296 _Latin1Decoder(int replacementChar) : super(replacementChar); | |
297 | |
298 int _decodeByte(int byte) => ((byte & 0xFF) == byte) ? byte : -1; | |
299 } | |
300 | |
301 | |
302 abstract class _SingleByteEncoder | |
303 extends StreamEventTransformer<String, List<int>> { | |
304 void handleData(String data, EventSink<List<int>> sink) { | |
305 var bytes = _encode(data); | |
306 if (bytes == null) { | |
307 sink.addError(new FormatException("Invalid character for encoding")); | |
308 sink.close(); | |
309 } else { | |
310 sink.add(bytes); | |
311 } | |
312 } | |
313 | |
314 List<int> _encode(String string); | |
315 } | |
316 | |
317 | |
318 // Utility class for encoding a string into an ASCII byte stream. | |
319 class _AsciiEncoder extends _SingleByteEncoder { | |
320 List<int> _encode(String string) { | |
321 var bytes = string.codeUnits; | |
322 for (var byte in bytes) { | |
323 if (byte > 127) return null; | |
324 } | |
325 return bytes; | |
326 } | 88 } |
327 } | 89 } |
328 | 90 |
329 | 91 |
330 // Utility class for encoding a string into a Latin1 byte stream. | 92 class _WindowsCodePageDecoder extends Converter<List<int>, String> { |
331 class _Latin1Encoder extends _SingleByteEncoder { | 93 |
332 List<int> _encode(String string) { | 94 const _WindowsCodePageDecoder(); |
333 var bytes = string.codeUnits; | 95 |
334 for (var byte in bytes) { | 96 String convert(List<int> input) { |
335 if (byte > 255) return null; | 97 return _decodeBytes(input); |
336 } | |
337 return bytes; | |
338 } | 98 } |
339 } | |
340 | 99 |
| 100 /** |
| 101 * Starts a chunked conversion. |
| 102 */ |
| 103 ByteConversionSink startChunkedConversion( |
| 104 ChunkedConversionSink<String> sink) { |
| 105 return new _WindowsCodePageDecoderSink(sink); |
| 106 } |
341 | 107 |
342 // Utility class for encoding a string into a current windows | 108 // Override the base-class' bind, to provide a better type. |
343 // code page byte list. | 109 Stream<String> bind(Stream<List<int>> stream) => super.bind(stream); |
344 // Implemented on top of a _SingleByteEncoder, even though it's not really a | |
345 // single byte encoder, to avoid copying boilerplate. | |
346 class _WindowsCodePageEncoder extends _SingleByteEncoder { | |
347 List<int> _encode(String string) => _encodeString(string); | |
348 | |
349 external static List<int> _encodeString(String string); | |
350 } | |
351 | |
352 | |
353 // Utility class for decoding Windows current code page data delivered | |
354 // as a stream of bytes. | |
355 class _WindowsCodePageDecoder extends StreamEventTransformer<List<int>, String>
{ | |
356 void handleData(List<int> data, EventSink<String> sink) { | |
357 sink.add(_decodeBytes(data)); | |
358 } | |
359 | 110 |
360 external static String _decodeBytes(List<int> bytes); | 111 external static String _decodeBytes(List<int> bytes); |
361 } | 112 } |
| 113 |
| 114 class _WindowsCodePageDecoderSink extends ByteConversionSinkBase { |
| 115 // TODO(floitsch): provide more efficient conversions when the input is |
| 116 // a slice. |
| 117 |
| 118 final StringConversionSink _sink; |
| 119 |
| 120 _WindowsCodePageDecoderSink(this._sink); |
| 121 |
| 122 void close() { |
| 123 _sink.close(); |
| 124 } |
| 125 |
| 126 void add(List<int> bytes) { |
| 127 _sink.add(_WindowsCodePageDecoder._decodeBytes(bytes)); |
| 128 } |
| 129 } |
OLD | NEW |