OLD | NEW |
| (Empty) |
1 /// Decodes bytes using the correct name. See [decodeBytes]. | |
2 library char_encodings; | |
3 | |
4 import 'dart:collection'; | |
5 import 'package:utf/utf.dart'; | |
6 | |
// TODO(jmesserly): this function is conspicuously absent from dart:utf.
/// Returns true if the [bytes] starts with a UTF-8 byte order mark.
///
/// Only the range beginning at [offset] is examined. When [length] is given
/// the range ends at `offset + length`; otherwise it runs to the end of
/// [bytes]. A range shorter than three bytes never contains a mark.
///
/// Since UTF-8 doesn't have byte order, "byte order mark" is somewhat of a
/// misnomer, but the marker is used in HTML to detect UTF-8 encoded content.
bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  final end = length == null ? bytes.length : offset + length;

  // The mark is three bytes long, so a shorter range cannot hold it.
  if (end - offset < 3) return false;

  return bytes[offset] == 0xEF &&
      bytes[offset + 1] == 0xBB &&
      bytes[offset + 2] == 0xBF;
}
18 | |
// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
// file, but dart:utf does not expose stream-based decoders yet.
/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
/// the codepoints. Supports the major unicode encodings as well as ascii and
/// windows-1252 encodings.
///
/// Decoding starts at [offset] and covers [length] bytes. When [length] is
/// omitted, everything from [offset] to the end of [bytes] is decoded.
/// [replacementCodepoint] is forwarded to every decoder except the ascii one.
///
/// Throws a [FormatException] if [encoding] is `'ascii'` and the selected
/// range contains a byte greater than 127, and an [ArgumentError] if
/// [encoding] is not one of the supported names.
Iterable<int> decodeBytes(String encoding, List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  final replace = replacementCodepoint;

  // When no length is given, decode the bytes that remain after [offset].
  // Defaulting to bytes.length would overshoot the list for any non-zero
  // offset and make the range checks below throw.
  int count = length == null ? bytes.length - offset : length;

  switch (encoding) {
    case 'ascii':
      bytes = bytes.sublist(offset, offset + count);
      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
      for (int byte in bytes) {
        if (byte > 127) {
          // TODO(jmesserly): ideally this would be DecoderException, like the
          // one thrown in runtime/bin/string_stream.dart, but we don't want to
          // depend on dart:io.
          throw new FormatException("Illegal ASCII character $byte");
        }
      }
      return bytes;

    case 'windows-1252':
    case 'cp1252':
      // Hand over the caller's [length] untouched (possibly null) so that the
      // windows-1252 decoder applies its own default for the range.
      return decodeWindows1252AsIterable(bytes, offset, length, replace);

    case 'utf-8':
      // NOTE: to match the behavior of the other decode functions, we eat the
      // utf-8 BOM here.
      if (hasUtf8Bom(bytes, offset, count)) {
        offset += 3;
        count -= 3;
      }
      return decodeUtf8AsIterable(bytes, offset, count, replace);

    case 'utf-16':
      return decodeUtf16AsIterable(bytes, offset, count, replace);
    case 'utf-16-be':
      return decodeUtf16beAsIterable(bytes, offset, count, true, replace);
    case 'utf-16-le':
      return decodeUtf16leAsIterable(bytes, offset, count, true, replace);

    case 'utf-32':
      return decodeUtf32AsIterable(bytes, offset, count, replace);
    case 'utf-32-be':
      return decodeUtf32beAsIterable(bytes, offset, count, true, replace);
    case 'utf-32-le':
      return decodeUtf32leAsIterable(bytes, offset, count, true, replace);

    default:
      throw new ArgumentError('Encoding $encoding not supported');
  }
}
74 | |
// TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed.
/// Returns the code points for the [input].
///
/// Each UTF-16 code unit is copied to the result as-is, except that a lead
/// surrogate (0xD800-0xDBFF) immediately followed by a trail surrogate
/// (0xDC00-0xDFFF) is combined into the single code point the pair encodes.
/// Surrogates that are not part of such a pair are passed through unchanged.
List<int> toCodepoints(String input) {
  final codepoints = <int>[];
  final count = input.length;
  var index = 0;
  while (index < count) {
    // Consume one unit; [index] now points at the unit that follows it.
    var unit = input.codeUnitAt(index++);
    final isLead = unit >= 0xD800 && unit <= 0xDBFF;
    if (isLead && index < count) {
      final trail = input.codeUnitAt(index);
      if (trail >= 0xDC00 && trail <= 0xDFFF) {
        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
        // The trail surrogate belongs to this code point; skip past it.
        index++;
      }
    }
    codepoints.add(unit);
  }
  return codepoints;
}
96 | |
/// Decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) bytes as
/// an iterable, so the consumer only converts as much of the input as it
/// actually iterates over.
///
/// All arguments are passed unchanged to [IterableWindows1252Decoder]. Set
/// [replacementCodepoint] to null to have an [ArgumentError] thrown for a bad
/// byte value rather than having it replaced.
IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
    [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  final decoded = new IterableWindows1252Decoder(
      bytes, offset, length, replacementCodepoint);
  return decoded;
}
107 | |
/// Return type of [decodeWindows1252AsIterable] and variants.
///
/// This iterable only stores the decoding arguments. Every access to
/// [iterator] creates a fresh [Windows1252Decoder] over them, and that
/// iterator translates bytes only as its user requests them. (Note: results
/// are not cached.)
class IterableWindows1252Decoder extends IterableBase<int> {
  final List<int> bytes;
  final int offset;
  final int length;
  final int replacementCodepoint;

  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Windows1252Decoder get iterator {
    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
  }
}
124 | |
/// Provides an iterator of Unicode codepoints from windows-1252 encoded bytes.
///
/// The optional constructor arguments select the index of the first byte to
/// decode (`offset`), the number of bytes to decode from there (`length`,
/// defaulting to the rest of the list) and the code point substituted for
/// bytes that have no windows-1252 mapping. The decoded range never extends
/// past the end of the byte list. Set [replacementCodepoint] to null to throw
/// an [ArgumentError] rather than replace the bad value.
class Windows1252Decoder implements Iterator<int> {
  final int replacementCodepoint;
  final List<int> _bytes;

  /// Index of the first byte to decode.
  final int _start;

  /// Index one past the last byte to decode; never beyond the end of [_bytes].
  final int _end;

  /// Index of the current byte; one before the requested offset until
  /// [moveNext] is called for the first time.
  int _offset;

  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : _bytes = bytes,
        // A negative offset can never address a byte, so the range starts at 0.
        _start = offset < 0 ? 0 : offset,
        _end = _endIndex(bytes, offset, length),
        _offset = offset - 1;

  /// Computes the exclusive end index for `length` bytes starting at `offset`,
  /// clamped to the size of `bytes`. A null `length` selects the rest of the
  /// list.
  static int _endIndex(List<int> bytes, int offset, int length) {
    if (length == null) return bytes.length;
    final end = offset + length;
    return end < bytes.length ? end : bytes.length;
  }

  // Comparing against _start (rather than 0) keeps the byte just before a
  // non-zero offset from being exposed before the first moveNext() call.
  bool get _inRange => _offset >= _start && _offset < _end;

  /// The code point for the byte at the current position, or null when the
  /// iterator is positioned outside the range being decoded.
  int get current => _inRange ? _mapChar(_bytes[_offset]) : null;

  /// Advances to the next byte and returns whether it lies inside the range.
  bool moveNext() {
    _offset++;
    return _inRange;
  }

  /// Maps a single windows-1252 byte value to its Unicode code point.
  ///
  /// Values without a special mapping below are returned unchanged. The five
  /// values listed last have no mapping here: they yield
  /// [replacementCodepoint], or throw an [ArgumentError] when it is null.
  int _mapChar(int char) {
    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
    // replacementCharacters from constants.dart
    switch (char) {
      case 0x80:
        return 0x20AC; // EURO SIGN
      case 0x82:
        return 0x201A; // SINGLE LOW-9 QUOTATION MARK
      case 0x83:
        return 0x0192; // LATIN SMALL LETTER F WITH HOOK
      case 0x84:
        return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
      case 0x85:
        return 0x2026; // HORIZONTAL ELLIPSIS
      case 0x86:
        return 0x2020; // DAGGER
      case 0x87:
        return 0x2021; // DOUBLE DAGGER
      case 0x88:
        return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
      case 0x89:
        return 0x2030; // PER MILLE SIGN
      case 0x8A:
        return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
      case 0x8B:
        return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      case 0x8C:
        return 0x0152; // LATIN CAPITAL LIGATURE OE
      case 0x8E:
        return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
      case 0x91:
        return 0x2018; // LEFT SINGLE QUOTATION MARK
      case 0x92:
        return 0x2019; // RIGHT SINGLE QUOTATION MARK
      case 0x93:
        return 0x201C; // LEFT DOUBLE QUOTATION MARK
      case 0x94:
        return 0x201D; // RIGHT DOUBLE QUOTATION MARK
      case 0x95:
        return 0x2022; // BULLET
      case 0x96:
        return 0x2013; // EN DASH
      case 0x97:
        return 0x2014; // EM DASH
      case 0x98:
        return 0x02DC; // SMALL TILDE
      case 0x99:
        return 0x2122; // TRADE MARK SIGN
      case 0x9A:
        return 0x0161; // LATIN SMALL LETTER S WITH CARON
      case 0x9B:
        return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      case 0x9C:
        return 0x0153; // LATIN SMALL LIGATURE OE
      case 0x9E:
        return 0x017E; // LATIN SMALL LETTER Z WITH CARON
      case 0x9F:
        return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS

      case 0x81:
      case 0x8D:
      case 0x8F:
      case 0x90:
      case 0x9D:
        if (replacementCodepoint == null) {
          throw new ArgumentError(
              "Invalid windows-1252 code point $char at $_offset");
        }
        return replacementCodepoint;
    }
    return char;
  }
}
OLD | NEW |