Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(588)

Side by Side Diff: sdk/lib/io/string_transformer.dart

Issue 22872012: Remove Encoding-enum from dart:io and add interface in dart:convert. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Fix ddbg. Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of dart.io; 5 part of dart.io;
6 6
7 // All aliases (in lowercase) of supported encoding from
8 // http://www.iana.org/assignments/character-sets/character-sets.xml.
9 Map<String, Encoding> _nameToEncoding = <String, Encoding> {
10 // ISO_8859-1:1987.
11 "iso_8859-1:1987": LATIN1,
12 "iso-ir-100": LATIN1,
13 "iso_8859-1": LATIN1,
14 "iso-8859-1": LATIN1,
15 "latin1": LATIN1,
16 "l1": LATIN1,
17 "ibm819": LATIN1,
18 "cp819": LATIN1,
19 "csisolatin1": LATIN1,
20
21 // US-ASCII.
22 "iso-ir-6": ASCII,
23 "ansi_x3.4-1968": ASCII,
24 "ansi_x3.4-1986": ASCII,
25 "iso_646.irv:1991": ASCII,
26 "iso646-us": ASCII,
27 "us-ascii": ASCII,
28 "us": ASCII,
29 "ibm367": ASCII,
30 "cp367": ASCII,
31 "csascii": ASCII,
32 "ascii": ASCII, // This is not in the IANA official names.
33
34 // UTF-8.
35 "csutf8": UTF8,
36 "utf-8": UTF8
37 };
38
7 /** 39 /**
8 * String encodings. 40 * Gets an [Encoding] object from the character set
41 * name. The names used are the IANA official names for the
42 * character set (see
43 * http://www.iana.org/assignments/character-sets/character-sets.xml).
44 *
45 * The [name] passed is case insensitive.
46 *
47 * If character set is not supported [:null:] is returned.
9 */ 48 */
10 class Encoding { 49 Encoding encodingFromName(String name) {
Søren Gjesse 2013/08/26 08:03:15 Should this move to dart:convert as well? With the
floitsch 2013/08/26 09:33:40 Moved to Encoding.getByName. No option to registe
11 static const Encoding UTF_8 = const Encoding._internal("utf-8");
12 static const Encoding ISO_8859_1 = const Encoding._internal("iso-8859-1");
13 static const Encoding ASCII = const Encoding._internal("us-ascii");
14
15 /**
16 * SYSTEM encoding is the current code page on Windows and UTF-8 on
17 * Linux and Mac.
18 */
19 static const Encoding SYSTEM = const Encoding._internal("system");
20
21 // All aliases (in lowercase) of supported encoding from
22 // http://www.iana.org/assignments/character-sets/character-sets.xml.
23 static Map<String, Encoding> _nameToEncoding = <String, Encoding> {
24 // ISO_8859-1:1987.
25 "iso_8859-1:1987": ISO_8859_1,
26 "iso-ir-100": ISO_8859_1,
27 "iso_8859-1": ISO_8859_1,
28 "iso-8859-1": ISO_8859_1,
29 "latin1": ISO_8859_1,
30 "l1": ISO_8859_1,
31 "ibm819": ISO_8859_1,
32 "cp819": ISO_8859_1,
33 "csisolatin1": ISO_8859_1,
34
35 // US-ASCII.
36 "iso-ir-6": ASCII,
37 "ansi_x3.4-1968": ASCII,
38 "ansi_x3.4-1986": ASCII,
39 "iso_646.irv:1991": ASCII,
40 "iso646-us": ASCII,
41 "us-ascii": ASCII,
42 "us": ASCII,
43 "ibm367": ASCII,
44 "cp367": ASCII,
45 "csascii": ASCII,
46 "ascii": ASCII, // This is not in the IANA official names.
47
48 // UTF-8.
49 "csutf8": UTF_8,
50 "utf-8": UTF_8
51 };
52
53 /**
54 * Gets an [Encoding] object from the character set
55 * name. The names used are the IANA official names for the
56 * character set (see
57 * http://www.iana.org/assignments/character-sets/character-sets.xml).
58 *
59 * The [name] passed is case insensitive.
60 *
61 * If character set is not supported [:null:] is returned.
62 */
63 static Encoding fromName(String name) {
64 if (name == null) return null; 50 if (name == null) return null;
65 name = name.toLowerCase(); 51 name = name.toLowerCase();
66 return _nameToEncoding[name]; 52 return _nameToEncoding[name];
53 }
54
55 const SYSTEM_ENCODING = const SystemEncoding();
56
57 /**
58 * The system encoding is the current code page on Windows and UTF-8 on
59 * Linux and Mac.
60 */
61 class SystemEncoding extends Encoding {
62 const SystemEncoding();
63
64 List<int> encode(String input) => encoder.convert(input);
65 String decode(List<int> encoded) => decoder.convert(encoded);
66
67 Converter<String, List<int>> get encoder {
68 if (Platform.operatingSystem == "windows") {
69 return const _WindowsCodePageEncoder();
70 } else {
71 return const Utf8Encoder();
72 }
73 }
74
75 Converter<List<int>, String> get decoder {
76 if (Platform.operatingSystem == "windows") {
77 return const _WindowsCodePageDecoder();
78 } else {
79 return const Utf8Decoder();
80 }
81 }
82 }
83
84 class _WindowsCodePageEncoder extends Converter<String, List<int>> {
85
86 const _WindowsCodePageEncoder();
87
88 List<int> convert(String input) {
89 List<int> encoded = _encodeString(input);
90 if (encoded == null) {
91 throw new FormatException("Invalid character for encoding");
92 }
93 return encoded;
67 } 94 }
68 95
69 /** 96 /**
70 * Name of the encoding. This will be the lower-case version of one of the 97 * Starts a chunked conversion.
71 * IANA official names for the character set (see
72 * http://www.iana.org/assignments/character-sets/character-sets.xml)
73 */ 98 */
74 final String name; 99 StringConversionSink startChunkedConversion(
100 ChunkedConversionSink<List<int>> sink) {
101 return new _WindowsCodePageEncoderSink(sink);
102 }
75 103
76 const Encoding._internal(String this.name); 104 // Override the base-class' bind, to provide a better type.
105 Stream<List<int>> bind(Stream<String> stream) => super.bind(stream);
106
107 external static List<int> _encodeString(String string);
77 } 108 }
78 109
79 const UTF_8 = Encoding.UTF_8; 110 class _WindowsCodePageEncoderSink extends StringConversionSinkBase {
80 const ISO_8859_1 = Encoding.ISO_8859_1; 111 // TODO(floitsch): provide more efficient conversions when the input is
81 const ASCII = Encoding.ASCII; 112 // not a String.
82 113
83 /** 114 final ByteConversionSink _sink;
84 * Stream transformer that can decode a stream of bytes into a stream of
85 * strings using [encoding].
86 *
87 * Invalid or forbidden byte-sequences will not produce errors, but will instead
88 * insert [replacementChar] in the decoded strings.
89 */
90 class StringDecoder implements StreamTransformer<List<int>, String> {
91 var _decoder;
92 115
93 static const _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT = 0xFFFD; 116 _WindowsCodePageEncoderSink(this._sink);
94 117
95 /** 118 void close() {
96 * Decodes a stream of bytes into a `String` with an optional 119 _sink.close();
97 * [encoding] and [replacementChar].
98 *
99 * The default value for [encoding] is [Encoding.UTF_8].
100 *
101 * The default value for [replacementChar] is code point U+FFFD.
102 *
103 * Completes with the decoded `String` when the stream is done.
104 */
105 static Future<String> decode(
106 Stream<List<int>> stream,
107 [Encoding encoding = Encoding.UTF_8,
108 int replacementChar = _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
109 if (replacementChar != null &&
110 replacementChar != _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT) {
111 throw new UnsupportedError("replacement character must be null or "
112 "the Unicode replacement character");
113 }
114 return stream
115 .transform(new StringDecoder(encoding, replacementChar))
116 .fold(
117 new StringBuffer(),
118 (prev, data) => prev..write(data))
119 .then((sb) => sb.toString());
120 } 120 }
121 121
122 /** 122 void add(String string) {
123 * Create a new [StringDecoder] with an optional [encoding] and 123 List<int> encoded = _WindowsCodePageByteEncoder._encodeString(string);
124 * [replacementChar]. 124 if (encoded == null) {
125 * 125 throw new FormatException("Invalid character for encoding");
126 * The default value for [encoding] is [Encoding.UTF_8].
127 *
128 * The default value for [replacementChar] is code point U+FFFD.
129 */
130 StringDecoder([Encoding encoding = Encoding.UTF_8, int replacementChar]) {
131 switch (encoding) {
132 case Encoding.UTF_8:
133 if (replacementChar != null &&
134 replacementChar != _UNICODE_REPLACEMENT_CHARACTER_CODEPOINT) {
135 throw new UnsupportedError("replacement character must be null or "
136 "the Unicode replacement character");
137 }
138 _decoder = new Utf8Decoder(allowMalformed: true);
139 break;
140 case Encoding.ASCII:
141 if (replacementChar == null) {
142 replacementChar = '?'.codeUnitAt(0);
143 } else if (replacementChar > 127) {
144 throw new ArgumentError("Invalid replacement character for ASCII");
145 }
146 _decoder = new _AsciiDecoder(replacementChar);
147 break;
148 case Encoding.ISO_8859_1:
149 if (replacementChar == null) {
150 replacementChar = '?'.codeUnitAt(0);
151 } else if (replacementChar > 255) {
152 throw new ArgumentError(
153 "Invalid replacement character for ISO_8859_1");
154 }
155 _decoder = new _Latin1Decoder(replacementChar);
156 break;
157 case Encoding.SYSTEM:
158 if (Platform.operatingSystem == "windows") {
159 _decoder = new _WindowsCodePageDecoder();
160 } else {
161 if (replacementChar != null) {
162 // TODO(ajohnsen): Handle replacement character.
163 throw new UnsupportedError(
164 "Replacement character is not supported for SYSTEM encoding");
165 }
166 _decoder = new Utf8Decoder(allowMalformed: true);
167 }
168 break;
169 default:
170 throw new ArgumentError("Unsupported encoding '$encoding'");
171 } 126 }
127 _sink.add(encoded);
172 } 128 }
173 129
174 Stream<String> bind(Stream<List<int>> stream) => _decoder.bind(stream); 130 void addSlice(String source, int start, int end, bool isLast) {
175 } 131 if (start != 0 || end != source.length) {
176 132 source = source.substring(start, end);
177
178 /**
179 * Stream transformer that can encode a stream of strings into a stream of
180 * bytes using [encoding].
181 *
182 * Strings that cannot be represented in the given encoding will result in an
183 * error and a close event on the stream.
184 */
185 class StringEncoder implements StreamTransformer<String, List<int>> {
186 var _encoder;
187
188 /**
189 * Create a new [StringEncoder] with an optional
190 * [encoding].
191 */
192 StringEncoder([Encoding encoding = Encoding.UTF_8]) {
193 switch (encoding) {
194 case Encoding.UTF_8:
195 _encoder = new Utf8Encoder();
196 break;
197 case Encoding.ASCII:
198 _encoder = new _AsciiEncoder();
199 break;
200 case Encoding.ISO_8859_1:
201 _encoder = new _Latin1Encoder();
202 break;
203 case Encoding.SYSTEM:
204 if (Platform.operatingSystem == "windows") {
205 _encoder = new _WindowsCodePageEncoder();
206 } else {
207 _encoder = new Utf8Encoder();
208 }
209 break;
210 default:
211 throw new ArgumentError("Unsupported encoding '$encoding'");
212 } 133 }
213 } 134 add(source);
214 135 if (isLast) close();
215 Stream<List<int>> bind(Stream<String> stream) => _encoder.bind(stream);
216 }
217
218
219 // Utility function to synchronously decode a list of bytes.
220 String _decodeString(List<int> bytes, [Encoding encoding = Encoding.UTF_8]) {
221 if (bytes.length == 0) return "";
222 if (encoding == Encoding.UTF_8) {
223 return UTF8.decode(bytes, allowMalformed: true);
224 }
225 var string;
226 var error;
227 var controller = new StreamController(sync: true);
228 controller.stream
229 .transform(new StringDecoder(encoding))
230 .listen((data) {
231 // The StringDecoder decodes every encoding (except UTF-8) in one go.
232 assert(string == null);
233 string = data;
234 }, onError: (e) => error = e);
235 controller.add(bytes);
236 controller.close();
237 if (error != null) throw error;
238 assert(string != null);
239 return string;
240 }
241
242
243 // Utility function to synchronously encode a String.
244 // Will throw an exception if the encoding is invalid.
245 List<int> _encodeString(String string, [Encoding encoding = Encoding.UTF_8]) {
246 if (string.length == 0) return [];
247 if (encoding == Encoding.UTF_8) return UTF8.encode(string);
248 var bytes;
249 var controller = new StreamController(sync: true);
250 controller.stream
251 .transform(new StringEncoder(encoding))
252 .listen((data) {
253 // The StringEncoder encodes every encoding (except UTF-8) in one go.
254 assert(bytes == null);
255 bytes = data;
256 });
257 controller.add(string);
258 controller.close();
259 assert(bytes != null);
260 return bytes;
261 }
262
263
264 abstract class _SingleByteDecoder
265 extends StreamEventTransformer<List<int>, String> {
266 final int _replacementChar;
267
268 _SingleByteDecoder(this._replacementChar);
269
270 void handleData(List<int> data, EventSink<String> sink) {
271 var buffer = new List<int>(data.length);
272 for (int i = 0; i < data.length; i++) {
273 int char = _decodeByte(data[i]);
274 if (char < 0) char = _replacementChar;
275 buffer[i] = char;
276 }
277 sink.add(new String.fromCharCodes(buffer));
278 }
279
280 int _decodeByte(int byte);
281 }
282
283
284 // Utility class for decoding ascii data delivered as a stream of
285 // bytes.
286 class _AsciiDecoder extends _SingleByteDecoder {
287 _AsciiDecoder(int replacementChar) : super(replacementChar);
288
289 int _decodeByte(int byte) => ((byte & 0x7f) == byte) ? byte : -1;
290 }
291
292
293 // Utility class for decoding Latin-1 data delivered as a stream of
294 // bytes.
295 class _Latin1Decoder extends _SingleByteDecoder {
296 _Latin1Decoder(int replacementChar) : super(replacementChar);
297
298 int _decodeByte(int byte) => ((byte & 0xFF) == byte) ? byte : -1;
299 }
300
301
302 abstract class _SingleByteEncoder
303 extends StreamEventTransformer<String, List<int>> {
304 void handleData(String data, EventSink<List<int>> sink) {
305 var bytes = _encode(data);
306 if (bytes == null) {
307 sink.addError(new FormatException("Invalid character for encoding"));
308 sink.close();
309 } else {
310 sink.add(bytes);
311 }
312 }
313
314 List<int> _encode(String string);
315 }
316
317
318 // Utility class for encoding a string into an ASCII byte stream.
319 class _AsciiEncoder extends _SingleByteEncoder {
320 List<int> _encode(String string) {
321 var bytes = string.codeUnits;
322 for (var byte in bytes) {
323 if (byte > 127) return null;
324 }
325 return bytes;
326 } 136 }
327 } 137 }
328 138
329 139
330 // Utility class for encoding a string into a Latin1 byte stream. 140 class _WindowsCodePageDecoder extends Converter<List<int>, String> {
331 class _Latin1Encoder extends _SingleByteEncoder { 141
332 List<int> _encode(String string) { 142 const _WindowsCodePageDecoder();
333 var bytes = string.codeUnits; 143
334 for (var byte in bytes) { 144 String convert(List<int> input) {
335 if (byte > 255) return null; 145 return _decodeBytes(input);
336 }
337 return bytes;
338 } 146 }
339 }
340 147
148 /**
149 * Starts a chunked conversion.
150 */
151 ByteConversionSink startChunkedConversion(
152 ChunkedConversionSink<String> sink) {
153 return new _WindowsCodePageDecoderSink(sink);
154 }
341 155
342 // Utility class for encoding a string into a current windows 156 // Override the base-class' bind, to provide a better type.
343 // code page byte list. 157 Stream<String> bind(Stream<List<int>> stream) => super.bind(stream);
344 // Implemented on top of a _SingleByteEncoder, even though it's not really a
345 // single byte encoder, to avoid copying boilerplate.
346 class _WindowsCodePageEncoder extends _SingleByteEncoder {
347 List<int> _encode(String string) => _encodeString(string);
348
349 external static List<int> _encodeString(String string);
350 }
351
352
353 // Utility class for decoding Windows current code page data delivered
354 // as a stream of bytes.
355 class _WindowsCodePageDecoder extends StreamEventTransformer<List<int>, String> {
356 void handleData(List<int> data, EventSink<String> sink) {
357 sink.add(_decodeBytes(data));
358 }
359 158
360 external static String _decodeBytes(List<int> bytes); 159 external static String _decodeBytes(List<int> bytes);
361 } 160 }
161
162 class _WindowsCodePageDecoderSink extends ByteConversionSinkBase {
163 // TODO(floitsch): provide more efficient conversions when the input is
164 // a slice.
165
166 final StringConversionSink _sink;
167
168 _WindowsCodePageDecoderSink(this._sink);
169
170 void close() {
171 _sink.close();
172 }
173
174 void add(List<int> bytes) {
175 _sink.add(_WindowsCodePageDecoder._decodeBytes(bytes));
176 }
177 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698