OLD | NEW |
1 /** Decodes bytes using the correct name. See [decodeBytes]. */ | 1 /// Decodes bytes using the correct name. See [decodeBytes]. |
2 library char_encodings; | 2 library char_encodings; |
3 | 3 |
4 import 'dart:collection'; | 4 import 'dart:collection'; |
5 import 'package:utf/utf.dart'; | 5 import 'package:utf/utf.dart'; |
6 | 6 |
7 // TODO(jmesserly): this function is conspicuously absent from dart:utf. | 7 // TODO(jmesserly): this function is conspicuously absent from dart:utf. |
/// Returns true if [bytes] starts with a UTF-8 byte order mark (the three
/// bytes `0xEF 0xBB 0xBF`).
///
/// Since UTF-8 doesn't have byte order, "byte order mark" is somewhat of a
/// misnomer, but the marker is used in HTML to detect the UTF-8 encoding.
///
/// Only the window starting at [offset] and spanning [length] bytes is
/// inspected; when [length] is omitted the window runs to the end of [bytes].
/// A window that extends past the end of [bytes] is clamped to the buffer, so
/// a truncated buffer yields `false` rather than an out-of-range read.
bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  // Exclusive end index of the window to inspect.
  int end = length != null ? offset + length : bytes.length;
  // Never look past the real end of the buffer, whatever the caller claimed.
  if (end > bytes.length) end = bytes.length;
  // The mark is three bytes long, so the window must hold at least three.
  return (offset + 3) <= end &&
      bytes[offset] == 0xEF &&
      bytes[offset + 1] == 0xBB &&
      bytes[offset + 2] == 0xBF;
}
20 | 18 |
21 // TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire | 19 // TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire |
22 // file, but dart:utf does not expose stream-based decoders yet. | 20 // file, but dart:utf does not expose stream-based decoders yet. |
23 /** | 21 /// Decodes the [bytes] with the provided [encoding] and returns an iterable for |
24 * Decodes the [bytes] with the provided [encoding] and returns an iterable for | 22 /// the codepoints. Supports the major unicode encodings as well as ascii and |
25 * the codepoints. Supports the major unicode encodings as well as ascii and | 23 /// windows-1252 encodings. |
26 * windows-1252 encodings. | |
27 */ | |
28 Iterable<int> decodeBytes(String encoding, List<int> bytes, | 24 Iterable<int> decodeBytes(String encoding, List<int> bytes, |
29 [int offset = 0, int length, | 25 [int offset = 0, int length, |
30 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 26 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
31 if (length == null) length = bytes.length; | 27 if (length == null) length = bytes.length; |
32 final replace = replacementCodepoint; | 28 final replace = replacementCodepoint; |
33 switch (encoding) { | 29 switch (encoding) { |
34 case 'ascii': | 30 case 'ascii': |
35 bytes = bytes.sublist(offset, offset + length); | 31 bytes = bytes.sublist(offset, offset + length); |
36 // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart | 32 // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart |
37 for (int byte in bytes) { | 33 for (int byte in bytes) { |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
71 case 'utf-32-le': | 67 case 'utf-32-le': |
72 return decodeUtf32leAsIterable(bytes, offset, length, true, replace); | 68 return decodeUtf32leAsIterable(bytes, offset, length, true, replace); |
73 | 69 |
74 default: | 70 default: |
75 throw new ArgumentError('Encoding $encoding not supported'); | 71 throw new ArgumentError('Encoding $encoding not supported'); |
76 } | 72 } |
77 } | 73 } |
78 | 74 |
79 | 75 |
/// Returns the Unicode code points of [input].
///
/// This works like [String.codeUnits], except that each valid UTF-16
/// surrogate pair (a lead unit in `0xD800..0xDBFF` followed by a trail unit
/// in `0xDC00..0xDFFF`) is combined into a single code point above `0xFFFF`.
/// A surrogate that is not part of a valid pair is passed through unchanged
/// as its own value.
List<int> toCodepoints(String input) {
  // [String.runes] performs exactly this surrogate-pair decoding, including
  // passing lone surrogates through, so no hand-written loop is needed.
  return input.runes.toList();
}
103 | 97 |
104 | 98 |
/// Wraps [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) encoded
/// [bytes] in an [IterableWindows1252Decoder].
///
/// Because the result is an iterable, the consumer need only convert as much
/// of the input as it actually reads. [offset], [length] and
/// [replacementCodepoint] are forwarded to the decoder unchanged. Set
/// [replacementCodepoint] to null to have the decoder throw an
/// [ArgumentError] rather than replace a bad value.
IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
        [int offset = 0,
        int length,
        int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableWindows1252Decoder(
        bytes, offset, length, replacementCodepoint);
117 | 109 |
118 | 110 |
/// The iterable returned by [decodeWindows1252AsIterable].
///
/// It only stores the decoding arguments. Every read of [iterator] creates a
/// fresh [Windows1252Decoder] over the same arguments, so bytes are
/// translated only as the consumer advances the iterator, and results are
/// not cached between iterations.
class IterableWindows1252Decoder extends IterableBase<int> {
  /// The windows-1252 encoded input.
  final List<int> bytes;

  /// Index into [bytes] handed to each decoder as its starting offset.
  final int offset;

  /// Number of bytes handed to each decoder; null when not specified.
  final int length;

  /// Code point handed to each decoder as the substitute for invalid bytes.
  final int replacementCodepoint;

  IterableWindows1252Decoder(this.bytes,
      [this.offset = 0,
      this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  /// A new decoder over the stored arguments.
  Windows1252Decoder get iterator {
    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
  }
}
137 | 127 |
138 | 128 |
139 /** | 129 /// Provides an iterator of Unicode codepoints from windows-1252 encoded bytes. |
140 * Provides an iterator of Unicode codepoints from windows-1252 encoded bytes. | 130 /// The parameters can set an offset into a list of bytes (as int), limit the |
141 * The parameters can set an offset into a list of bytes (as int), limit the | 131 /// length of the values to be decoded, and override the default Unicode |
142 * length of the values to be decoded, and override the default Unicode | 132 /// replacement character. Set the replacementCodepoint to null to throw an |
143 * replacement character. Set the replacementCodepoint to null to throw an | 133 /// ArgumentError rather than replace the bad value. The return value |
144 * ArgumentError rather than replace the bad value. The return value | 134 /// from this method can be used as an Iterable (e.g. in a for-loop). |
145 * from this method can be used as an Iterable (e.g. in a for-loop). | |
146 */ | |
147 class Windows1252Decoder implements Iterator<int> { | 135 class Windows1252Decoder implements Iterator<int> { |
148 final int replacementCodepoint; | 136 final int replacementCodepoint; |
149 final List<int> _bytes; | 137 final List<int> _bytes; |
150 int _offset; | 138 int _offset; |
151 final int _length; | 139 final int _length; |
152 | 140 |
153 Windows1252Decoder(List<int> bytes, [int offset = 0, int length, | 141 Windows1252Decoder(List<int> bytes, [int offset = 0, int length, |
154 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) | 142 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) |
155 : _bytes = bytes, | 143 : _bytes = bytes, |
156 _offset = offset - 1, | 144 _offset = offset - 1, |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
203 case 0x9D: | 191 case 0x9D: |
204 if (replacementCodepoint == null) { | 192 if (replacementCodepoint == null) { |
205 throw new ArgumentError( | 193 throw new ArgumentError( |
206 "Invalid windows-1252 code point $char at $_offset"); | 194 "Invalid windows-1252 code point $char at $_offset"); |
207 } | 195 } |
208 return replacementCodepoint; | 196 return replacementCodepoint; |
209 } | 197 } |
210 return char; | 198 return char; |
211 } | 199 } |
212 } | 200 } |
OLD | NEW |