OLD | NEW |
---|---|
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.io; | 5 part of dart.io; |
6 | 6 |
7 // All aliases (in lowercase) of supported encoding from | |
8 // http://www.iana.org/assignments/character-sets/character-sets.xml. | |
9 Map<String, Encoding> _nameToEncoding = <String, Encoding> { | |
10 // ISO_8859-1:1987. | |
11 "iso_8859-1:1987": LATIN1, | |
12 "iso-ir-100": LATIN1, | |
13 "iso_8859-1": LATIN1, | |
14 "iso-8859-1": LATIN1, | |
15 "latin1": LATIN1, | |
16 "l1": LATIN1, | |
17 "ibm819": LATIN1, | |
18 "cp819": LATIN1, | |
19 "csisolatin1": LATIN1, | |
20 | |
21 // US-ASCII. | |
22 "iso-ir-6": ASCII, | |
23 "ansi_x3.4-1968": ASCII, | |
24 "ansi_x3.4-1986": ASCII, | |
25 "iso_646.irv:1991": ASCII, | |
26 "iso646-us": ASCII, | |
27 "us-ascii": ASCII, | |
28 "us": ASCII, | |
29 "ibm367": ASCII, | |
30 "cp367": ASCII, | |
31 "csascii": ASCII, | |
32 "ascii": ASCII, // This is not in the IANA official names. | |
33 | |
34 // UTF-8. | |
35 "csutf8": UTF8, | |
36 "utf-8": UTF8 | |
37 }; | |
38 | |
7 /** | 39 /** |
8 * String encodings. | 40 * Gets an [Encoding] object from the name of the character set. |
41 * The names used are the IANA official names for the | |
42 * character set (see | |
43 * http://www.iana.org/assignments/character-sets/character-sets.xml). | |
44 * | |
45 * The [name] passed is case insensitive. | |
46 * | |
47 * If character set is not supported [:null:] is returned. | |
9 */ | 48 */ |
10 class Encoding { | 49 Encoding encodingFromName(String name) { |
Søren Gjesse
2013/08/26 08:03:15
Should this move to dart:convert as well? With the
floitsch
2013/08/26 09:33:40
Moved to Encoding.getByName.
No option to registe
| |
11 static const Encoding UTF_8 = const Encoding._internal("utf-8"); | |
12 static const Encoding ISO_8859_1 = const Encoding._internal("iso-8859-1"); | |
13 static const Encoding ASCII = const Encoding._internal("us-ascii"); | |
14 | |
15 /** | |
16 * SYSTEM encoding is the current code page on Windows and UTF-8 on | |
17 * Linux and Mac. | |
18 */ | |
19 static const Encoding SYSTEM = const Encoding._internal("system"); | |
20 | |
21 // All aliases (in lowercase) of supported encoding from | |
22 // http://www.iana.org/assignments/character-sets/character-sets.xml. | |
23 static Map<String, Encoding> _nameToEncoding = <String, Encoding> { | |
24 // ISO_8859-1:1987. | |
25 "iso_8859-1:1987": ISO_8859_1, | |
26 "iso-ir-100": ISO_8859_1, | |
27 "iso_8859-1": ISO_8859_1, | |
28 "iso-8859-1": ISO_8859_1, | |
29 "latin1": ISO_8859_1, | |
30 "l1": ISO_8859_1, | |
31 "ibm819": ISO_8859_1, | |
32 "cp819": ISO_8859_1, | |
33 "csisolatin1": ISO_8859_1, | |
34 | |
35 // US-ASCII. | |
36 "iso-ir-6": ASCII, | |
37 "ansi_x3.4-1968": ASCII, | |
38 "ansi_x3.4-1986": ASCII, | |
39 "iso_646.irv:1991": ASCII, | |
40 "iso646-us": ASCII, | |
41 "us-ascii": ASCII, | |
42 "us": ASCII, | |
43 "ibm367": ASCII, | |
44 "cp367": ASCII, | |
45 "csascii": ASCII, | |
46 "ascii": ASCII, // This is not in the IANA official names. | |
47 | |
48 // UTF-8. | |
49 "csutf8": UTF_8, | |
50 "utf-8": UTF_8 | |
51 }; | |
52 | |
53 /** | |
54 * Gets an [Encoding] object from the name of the character set. | |
55 * The names used are the IANA official names for the | |
56 * character set (see | |
57 * http://www.iana.org/assignments/character-sets/character-sets.xml). | |
58 * | |
59 * The [name] passed is case insensitive. | |
60 * | |
61 * If character set is not supported [:null:] is returned. | |
62 */ | |
63 static Encoding fromName(String name) { | |
64 if (name == null) return null; | 50 if (name == null) return null; |
65 name = name.toLowerCase(); | 51 name = name.toLowerCase(); |
66 return _nameToEncoding[name]; | 52 return _nameToEncoding[name]; |
53 } | |
54 | |
55 const SYSTEM_ENCODING = const SystemEncoding(); | |
56 | |
57 /** | |
58 * The system encoding is the current code page on Windows and UTF-8 on | |
59 * Linux and Mac. | |
60 */ | |
61 class SystemEncoding extends Encoding { | |
62 const SystemEncoding(); | |
63 | |
64 List<int> encode(String input) => encoder.convert(input); | |
65 String decode(List<int> encoded) => decoder.convert(encoded); | |
66 | |
67 Converter<String, List<int>> get encoder { | |
68 if (Platform.operatingSystem == "windows") { | |
69 return const _WindowsCodePageEncoder(); | |
70 } else { | |
71 return const Utf8Encoder(); | |
72 } | |
73 } | |
74 | |
75 Converter<List<int>, String> get decoder { | |
76 if (Platform.operatingSystem == "windows") { | |
77 return const _WindowsCodePageDecoder(); | |
78 } else { | |
79 return const Utf8Decoder(); | |
80 } | |
81 } | |
82 } | |
83 | |
84 class _WindowsCodePageEncoder extends Converter<String, List<int>> { | |
85 | |
86 const _WindowsCodePageEncoder(); | |
87 | |
88 List<int> convert(String input) { | |
89 List<int> encoded = _encodeString(input); | |
90 if (encoded == null) { | |
91 throw new FormatException("Invalid character for encoding"); | |
92 } | |
93 return encoded; | |
67 } | 94 } |
68 | 95 |
69 /** | 96 /** |
70 * Name of the encoding. This will be the lower-case version of one of the | 97 * Starts a chunked conversion. |
71 * IANA official names for the character set (see | |
72 * http://www.iana.org/assignments/character-sets/character-sets.xml) | |
73 */ | 98 */ |
74 final String name; | 99 StringConversionSink startChunkedConversion( |
100 ChunkedConversionSink<List<int>> sink) { | |
101 return new _WindowsCodePageEncoderSink(sink); | |
102 } | |
75 | 103 |
76 const Encoding._internal(String this.name); | 104 // Override the base-class' bind, to provide a better type. |
105 Stream<List<int>> bind(Stream<String> stream) => super.bind(stream); | |
106 | |
107 external static List<int> _encodeString(String string); | |
77 } | 108 } |
78 | 109 |
79 const UTF_8 = Encoding.UTF_8; | 110 class _WindowsCodePageEncoderSink extends StringConversionSinkBase { |
80 const ISO_8859_1 = Encoding.ISO_8859_1; | 111 // TODO(floitsch): provide more efficient conversions when the input is |
81 const ASCII = Encoding.ASCII; | 112 // not a String. |
82 | 113 |
83 /** | 114 final ByteConversionSink _sink; |
84 * Stream transformer that can decode a stream of bytes into a stream of | |
85 * strings using [encoding]. | |
86 * | |
87 * Invalid or forbidden byte-sequences will not produce errors, but will instead | |
88 * insert [replacementChar] in the decoded strings. | |
89 */ | |
90 class StringDecoder implements StreamTransformer<List<int>, String> { | |
91 var _decoder; | |
92 | 115 |
93 static const _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT = 0xFFFD; | 116 _WindowsCodePageEncoderSink(this._sink); |
94 | 117 |
95 /** | 118 void close() { |
96 * Decodes a stream of bytes into a `String` with an optional | 119 _sink.close(); |
97 * [encoding] and [replacementChar]. | |
98 * | |
99 * The default value for [encoding] is [Encoding.UTF_8]. | |
100 * | |
101 * The default value for [replacementChar] is code point U+FFFD. | |
102 * | |
103 * Completes with the decoded `String` when the stream is done. | |
104 */ | |
105 static Future<String> decode( | |
106 Stream<List<int>> stream, | |
107 [Encoding encoding = Encoding.UTF_8, | |
108 int replacementChar = _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | |
109 if (replacementChar != null && | |
110 replacementChar != _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT) { | |
111 throw new UnsupportedError("replacement character must be null or " | |
112 "the Unicode replacement character"); | |
113 } | |
114 return stream | |
115 .transform(new StringDecoder(encoding, replacementChar)) | |
116 .fold( | |
117 new StringBuffer(), | |
118 (prev, data) => prev..write(data)) | |
119 .then((sb) => sb.toString()); | |
120 } | 120 } |
121 | 121 |
122 /** | 122 void add(String string) { |
123 * Create a new [StringDecoder] with an optional [encoding] and | 123 List<int> encoded = _WindowsCodePageByteEncoder._encodeString(string); |
124 * [replacementChar]. | 124 if (encoded == null) { |
125 * | 125 throw new FormatException("Invalid character for encoding"); |
126 * The default value for [encoding] is [Encoding.UTF_8]. | |
127 * | |
128 * The default value for [replacementChar] is code point U+FFFD. | |
129 */ | |
130 StringDecoder([Encoding encoding = Encoding.UTF_8, int replacementChar]) { | |
131 switch (encoding) { | |
132 case Encoding.UTF_8: | |
133 if (replacementChar != null && | |
134 replacementChar != _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT) { | |
135 throw new UnsupportedError("replacement character must be null or " | |
136 "the Unicode replacement character"); | |
137 } | |
138 _decoder = new Utf8Decoder(allowMalformed: true); | |
139 break; | |
140 case Encoding.ASCII: | |
141 if (replacementChar == null) { | |
142 replacementChar = '?'.codeUnitAt(0); | |
143 } else if (replacementChar > 127) { | |
144 throw new ArgumentError("Invalid replacement character for ASCII"); | |
145 } | |
146 _decoder = new _AsciiDecoder(replacementChar); | |
147 break; | |
148 case Encoding.ISO_8859_1: | |
149 if (replacementChar == null) { | |
150 replacementChar = '?'.codeUnitAt(0); | |
151 } else if (replacementChar > 255) { | |
152 throw new ArgumentError( | |
153 "Invalid replacement character for ISO_8859_1"); | |
154 } | |
155 _decoder = new _Latin1Decoder(replacementChar); | |
156 break; | |
157 case Encoding.SYSTEM: | |
158 if (Platform.operatingSystem == "windows") { | |
159 _decoder = new _WindowsCodePageDecoder(); | |
160 } else { | |
161 if (replacementChar != null) { | |
162 // TODO(ajohnsen): Handle replacement character. | |
163 throw new UnsupportedError( | |
164 "Replacement character is not supported for SYSTEM encoding"); | |
165 } | |
166 _decoder = new Utf8Decoder(allowMalformed: true); | |
167 } | |
168 break; | |
169 default: | |
170 throw new ArgumentError("Unsupported encoding '$encoding'"); | |
171 } | 126 } |
127 _sink.add(encoded); | |
172 } | 128 } |
173 | 129 |
174 Stream<String> bind(Stream<List<int>> stream) => _decoder.bind(stream); | 130 void addSlice(String source, int start, int end, bool isLast) { |
175 } | 131 if (start != 0 || end != source.length) { |
176 | 132 source = source.substring(start, end); |
177 | |
178 /** | |
179 * Stream transformer that can encode a stream of strings into a stream of | |
180 * bytes using [encoding]. | |
181 * | |
182 * Strings that cannot be represented in the given encoding will result in an | |
183 * error and a close event on the stream. | |
184 */ | |
185 class StringEncoder implements StreamTransformer<String, List<int>> { | |
186 var _encoder; | |
187 | |
188 /** | |
189 * Create a new [StringEncoder] with an optional [encoding]. The default | |
190 * value for [encoding] is [Encoding.UTF_8]. | |
191 */ | |
192 StringEncoder([Encoding encoding = Encoding.UTF_8]) { | |
193 switch (encoding) { | |
194 case Encoding.UTF_8: | |
195 _encoder = new Utf8Encoder(); | |
196 break; | |
197 case Encoding.ASCII: | |
198 _encoder = new _AsciiEncoder(); | |
199 break; | |
200 case Encoding.ISO_8859_1: | |
201 _encoder = new _Latin1Encoder(); | |
202 break; | |
203 case Encoding.SYSTEM: | |
204 if (Platform.operatingSystem == "windows") { | |
205 _encoder = new _WindowsCodePageEncoder(); | |
206 } else { | |
207 _encoder = new Utf8Encoder(); | |
208 } | |
209 break; | |
210 default: | |
211 throw new ArgumentError("Unsupported encoding '$encoding'"); | |
212 } | 133 } |
213 } | 134 add(source); |
214 | 135 if (isLast) close(); |
215 Stream<List<int>> bind(Stream<String> stream) => _encoder.bind(stream); | |
216 } | |
217 | |
218 | |
219 // Utility function to synchronously decode a list of bytes. | |
220 String _decodeString(List<int> bytes, [Encoding encoding = Encoding.UTF_8]) { | |
221 if (bytes.length == 0) return ""; | |
222 if (encoding == Encoding.UTF_8) { | |
223 return UTF8.decode(bytes, allowMalformed: true); | |
224 } | |
225 var string; | |
226 var error; | |
227 var controller = new StreamController(sync: true); | |
228 controller.stream | |
229 .transform(new StringDecoder(encoding)) | |
230 .listen((data) { | |
231 // The StringDecoder decodes every encoding (except UTF-8) in one go. | |
232 assert(string == null); | |
233 string = data; | |
234 }, onError: (e) => error = e); | |
235 controller.add(bytes); | |
236 controller.close(); | |
237 if (error != null) throw error; | |
238 assert(string != null); | |
239 return string; | |
240 } | |
241 | |
242 | |
243 // Utility function to synchronously encode a String. | |
244 // Will throw an exception if the encoding is invalid. | |
245 List<int> _encodeString(String string, [Encoding encoding = Encoding.UTF_8]) { | |
246 if (string.length == 0) return []; | |
247 if (encoding == Encoding.UTF_8) return UTF8.encode(string); | |
248 var bytes; | |
249 var controller = new StreamController(sync: true); | |
250 controller.stream | |
251 .transform(new StringEncoder(encoding)) | |
252 .listen((data) { | |
253 // The StringEncoder encodes every encoding (except UTF-8) in one go. | |
254 assert(bytes == null); | |
255 bytes = data; | |
256 }); | |
257 controller.add(string); | |
258 controller.close(); | |
259 assert(bytes != null); | |
260 return bytes; | |
261 } | |
262 | |
263 | |
264 abstract class _SingleByteDecoder | |
265 extends StreamEventTransformer<List<int>, String> { | |
266 final int _replacementChar; | |
267 | |
268 _SingleByteDecoder(this._replacementChar); | |
269 | |
270 void handleData(List<int> data, EventSink<String> sink) { | |
271 var buffer = new List<int>(data.length); | |
272 for (int i = 0; i < data.length; i++) { | |
273 int char = _decodeByte(data[i]); | |
274 if (char < 0) char = _replacementChar; | |
275 buffer[i] = char; | |
276 } | |
277 sink.add(new String.fromCharCodes(buffer)); | |
278 } | |
279 | |
280 int _decodeByte(int byte); | |
281 } | |
282 | |
283 | |
284 // Utility class for decoding ascii data delivered as a stream of | |
285 // bytes. | |
286 class _AsciiDecoder extends _SingleByteDecoder { | |
287 _AsciiDecoder(int replacementChar) : super(replacementChar); | |
288 | |
289 int _decodeByte(int byte) => ((byte & 0x7f) == byte) ? byte : -1; | |
290 } | |
291 | |
292 | |
293 // Utility class for decoding Latin-1 data delivered as a stream of | |
294 // bytes. | |
295 class _Latin1Decoder extends _SingleByteDecoder { | |
296 _Latin1Decoder(int replacementChar) : super(replacementChar); | |
297 | |
298 int _decodeByte(int byte) => ((byte & 0xFF) == byte) ? byte : -1; | |
299 } | |
300 | |
301 | |
302 abstract class _SingleByteEncoder | |
303 extends StreamEventTransformer<String, List<int>> { | |
304 void handleData(String data, EventSink<List<int>> sink) { | |
305 var bytes = _encode(data); | |
306 if (bytes == null) { | |
307 sink.addError(new FormatException("Invalid character for encoding")); | |
308 sink.close(); | |
309 } else { | |
310 sink.add(bytes); | |
311 } | |
312 } | |
313 | |
314 List<int> _encode(String string); | |
315 } | |
316 | |
317 | |
318 // Utility class for encoding a string into an ASCII byte stream. | |
319 class _AsciiEncoder extends _SingleByteEncoder { | |
320 List<int> _encode(String string) { | |
321 var bytes = string.codeUnits; | |
322 for (var byte in bytes) { | |
323 if (byte > 127) return null; | |
324 } | |
325 return bytes; | |
326 } | 136 } |
327 } | 137 } |
328 | 138 |
329 | 139 |
330 // Utility class for encoding a string into a Latin1 byte stream. | 140 class _WindowsCodePageDecoder extends Converter<List<int>, String> { |
331 class _Latin1Encoder extends _SingleByteEncoder { | 141 |
332 List<int> _encode(String string) { | 142 const _WindowsCodePageDecoder(); |
333 var bytes = string.codeUnits; | 143 |
334 for (var byte in bytes) { | 144 String convert(List<int> input) { |
335 if (byte > 255) return null; | 145 return _decodeBytes(input); |
336 } | |
337 return bytes; | |
338 } | 146 } |
339 } | |
340 | 147 |
148 /** | |
149 * Starts a chunked conversion. | |
150 */ | |
151 ByteConversionSink startChunkedConversion( | |
152 ChunkedConversionSink<String> sink) { | |
153 return new _WindowsCodePageDecoderSink(sink); | |
154 } | |
341 | 155 |
342 // Utility class for encoding a string into a current windows | 156 // Override the base-class' bind, to provide a better type. |
343 // code page byte list. | 157 Stream<String> bind(Stream<List<int>> stream) => super.bind(stream); |
344 // Implemented on top of a _SingleByteEncoder, even though it's not really a | |
345 // single byte encoder, to avoid copying boilerplate. | |
346 class _WindowsCodePageEncoder extends _SingleByteEncoder { | |
347 List<int> _encode(String string) => _encodeString(string); | |
348 | |
349 external static List<int> _encodeString(String string); | |
350 } | |
351 | |
352 | |
353 // Utility class for decoding Windows current code page data delivered | |
354 // as a stream of bytes. | |
355 class _WindowsCodePageDecoder extends StreamEventTransformer<List<int>, String> { | |
356 void handleData(List<int> data, EventSink<String> sink) { | |
357 sink.add(_decodeBytes(data)); | |
358 } | |
359 | 158 |
360 external static String _decodeBytes(List<int> bytes); | 159 external static String _decodeBytes(List<int> bytes); |
361 } | 160 } |
161 | |
162 class _WindowsCodePageDecoderSink extends ByteConversionSinkBase { | |
163 // TODO(floitsch): provide more efficient conversions when the input is | |
164 // a slice. | |
165 | |
166 final StringConversionSink _sink; | |
167 | |
168 _WindowsCodePageDecoderSink(this._sink); | |
169 | |
170 void close() { | |
171 _sink.close(); | |
172 } | |
173 | |
174 void add(List<int> bytes) { | |
175 _sink.add(_WindowsCodePageDecoder._decodeBytes(bytes)); | |
176 } | |
177 } | |
OLD | NEW |