OLD | NEW |
(Empty) | |
| 1 /// Decodes bytes using the correct name. See [decodeBytes]. |
| 2 library char_encodings; |
| 3 |
| 4 import 'dart:collection'; |
| 5 import 'package:utf/utf.dart'; |
| 6 |
| 7 // TODO(jmesserly): this function is conspicuously absent from dart:utf. |
/// Returns true if [bytes] begins with the UTF-8 byte order mark
/// (`EF BB BF`).
///
/// Only the range that starts at [offset] and spans [length] bytes is
/// examined; when [length] is omitted the range runs to the end of [bytes].
/// UTF-8 has no byte order, so "byte order mark" is somewhat of a misnomer,
/// but the sequence is used in HTML to detect UTF-8 encoded content.
bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  final limit = length == null ? bytes.length : offset + length;

  // The mark is three bytes long, so a shorter range can never contain it.
  if (offset + 3 > limit) return false;

  return bytes[offset] == 0xEF &&
      bytes[offset + 1] == 0xBB &&
      bytes[offset + 2] == 0xBF;
}
| 18 |
| 19 // TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire |
| 20 // file, but dart:utf does not expose stream-based decoders yet. |
/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
/// the codepoints. Supports the major unicode encodings as well as ascii and
/// windows-1252 encodings.
///
/// Decoding covers [length] bytes starting at [offset]. When [length] is
/// omitted, everything from [offset] to the end of [bytes] is decoded.
/// [replacementCodepoint] is passed through to the underlying decoder; it is
/// not used for the 'ascii' encoding.
///
/// Throws a [FormatException] if [encoding] is 'ascii' and a byte in the range
/// is greater than 127, and an [ArgumentError] if [encoding] is not one of the
/// supported names.
Iterable<int> decodeBytes(String encoding, List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Keep the value the caller actually supplied: the windows-1252 decoder
  // applies its own default when it is handed a null length.
  final requestedLength = length;

  // Default to the bytes that remain after [offset]. Using `bytes.length`
  // here would make `offset + length` run past the end of the list whenever
  // a non-zero offset is given.
  if (length == null) length = bytes.length - offset;

  final replace = replacementCodepoint;
  switch (encoding) {
    case 'ascii':
      bytes = bytes.sublist(offset, offset + length);
      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
      for (int byte in bytes) {
        if (byte > 127) {
          // TODO(jmesserly): ideally this would be DecoderException, like the
          // one thrown in runtime/bin/string_stream.dart, but we don't want to
          // depend on dart:io.
          throw new FormatException("Illegal ASCII character $byte");
        }
      }
      return bytes;

    case 'windows-1252':
    case 'cp1252':
      return decodeWindows1252AsIterable(
          bytes, offset, requestedLength, replace);

    case 'utf-8':
      // NOTE: to match the behavior of the other decode functions, we eat the
      // utf-8 BOM here.
      if (hasUtf8Bom(bytes, offset, length)) {
        offset += 3;
        length -= 3;
      }
      return decodeUtf8AsIterable(bytes, offset, length, replace);

    case 'utf-16':
      return decodeUtf16AsIterable(bytes, offset, length, replace);
    case 'utf-16-be':
      return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
    case 'utf-16-le':
      return decodeUtf16leAsIterable(bytes, offset, length, true, replace);

    case 'utf-32':
      return decodeUtf32AsIterable(bytes, offset, length, replace);
    case 'utf-32-be':
      return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
    case 'utf-32-le':
      return decodeUtf32leAsIterable(bytes, offset, length, true, replace);

    default:
      throw new ArgumentError('Encoding $encoding not supported');
  }
}
| 74 |
| 75 // TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed. |
/// Returns the Unicode code points of [input].
///
/// This works like reading the string's UTF-16 code units, except that a high
/// surrogate immediately followed by a low surrogate is combined into the
/// single code point the pair encodes. A surrogate that is not part of such a
/// pair is returned unchanged.
List<int> toCodepoints(String input) {
  final codepoints = <int>[];
  final count = input.length;
  var index = 0;
  while (index < count) {
    var unit = input.codeUnitAt(index++);
    final isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
    if (isHighSurrogate && index < count) {
      final trail = input.codeUnitAt(index);
      if (trail >= 0xDC00 && trail <= 0xDFFF) {
        // Merge the pair and step over the low surrogate just consumed.
        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
        index++;
      }
    }
    codepoints.add(unit);
  }
  return codepoints;
}
| 96 |
/// Decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) [bytes]
/// as an iterable of codepoints.
///
/// Bytes are translated only as the result is iterated, so the consumer can
/// convert just as much of the input as it needs. [offset], [length] and
/// [replacementCodepoint] are handed to the [IterableWindows1252Decoder]
/// unchanged. Set [replacementCodepoint] to null to throw an [ArgumentError]
/// rather than replace a bad value.
IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
        [int offset = 0, int length,
        int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableWindows1252Decoder(
        bytes, offset, length, replacementCodepoint);
| 107 |
/// The iterable returned by [decodeWindows1252AsIterable] and variants.
///
/// It only stores the decoding parameters. Every read of [iterator] creates a
/// new [Windows1252Decoder] over the same bytes, and that iterator translates
/// bytes only as its user requests them. (Note: results are not cached.)
class IterableWindows1252Decoder extends IterableBase<int> {
  /// The windows-1252 encoded input.
  final List<int> bytes;

  /// The offset into [bytes] handed to each decoder.
  final int offset;

  /// The length handed to each decoder; may be null.
  final int length;

  /// The codepoint handed to each decoder to stand in for invalid bytes.
  final int replacementCodepoint;

  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Windows1252Decoder get iterator {
    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
  }
}
| 124 |
/// Provides an iterator of Unicode codepoints from windows-1252 encoded bytes.
///
/// The constructor parameters set an offset into a list of bytes (as int),
/// limit the number of bytes to be decoded, and override the default Unicode
/// replacement character. Decoding covers `length` bytes starting at `offset`;
/// when `length` is omitted it runs to the end of the list, and a range that
/// would extend past the end of the list is cut off there.
///
/// Bytes that windows-1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90 and 0x9D)
/// decode to [replacementCodepoint]. Set [replacementCodepoint] to null to
/// have [current] throw an [ArgumentError] rather than replace the bad value.
class Windows1252Decoder implements Iterator<int> {
  /// The codepoint substituted for undefined bytes, or null to throw instead.
  final int replacementCodepoint;

  final List<int> _bytes;

  /// Index of the current byte; one before the start until [moveNext] runs.
  int _offset;

  /// Index one past the last byte that will be decoded.
  final int _end;

  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : _bytes = bytes,
        _offset = offset - 1,
        // `length` counts bytes from `offset`, so the exclusive end index is
        // `offset + length`. It is clamped to the list so that iteration
        // never indexes past the last byte.
        _end = (length == null || offset + length > bytes.length)
            ? bytes.length
            : offset + length;

  bool get _inRange => _offset >= 0 && _offset < _end;

  /// The codepoint for the current byte, or null when the iterator is not
  /// positioned on a byte inside the decoded range.
  int get current => _inRange ? _mapChar(_bytes[_offset]) : null;

  /// Advances to the next byte and returns whether it is inside the range.
  bool moveNext() {
    _offset++;
    return _inRange;
  }

  /// Maps a single windows-1252 byte value to its Unicode codepoint.
  ///
  /// Values without a special case below are returned unchanged.
  int _mapChar(int char) {
    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
    // replacementCharacters from constants.dart
    switch (char) {
      case 0x80:
        return 0x20AC; // EURO SIGN
      case 0x82:
        return 0x201A; // SINGLE LOW-9 QUOTATION MARK
      case 0x83:
        return 0x0192; // LATIN SMALL LETTER F WITH HOOK
      case 0x84:
        return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
      case 0x85:
        return 0x2026; // HORIZONTAL ELLIPSIS
      case 0x86:
        return 0x2020; // DAGGER
      case 0x87:
        return 0x2021; // DOUBLE DAGGER
      case 0x88:
        return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
      case 0x89:
        return 0x2030; // PER MILLE SIGN
      case 0x8A:
        return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
      case 0x8B:
        return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      case 0x8C:
        return 0x0152; // LATIN CAPITAL LIGATURE OE
      case 0x8E:
        return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
      case 0x91:
        return 0x2018; // LEFT SINGLE QUOTATION MARK
      case 0x92:
        return 0x2019; // RIGHT SINGLE QUOTATION MARK
      case 0x93:
        return 0x201C; // LEFT DOUBLE QUOTATION MARK
      case 0x94:
        return 0x201D; // RIGHT DOUBLE QUOTATION MARK
      case 0x95:
        return 0x2022; // BULLET
      case 0x96:
        return 0x2013; // EN DASH
      case 0x97:
        return 0x2014; // EM DASH
      case 0x98:
        return 0x02DC; // SMALL TILDE
      case 0x99:
        return 0x2122; // TRADE MARK SIGN
      case 0x9A:
        return 0x0161; // LATIN SMALL LETTER S WITH CARON
      case 0x9B:
        return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      case 0x9C:
        return 0x0153; // LATIN SMALL LIGATURE OE
      case 0x9E:
        return 0x017E; // LATIN SMALL LETTER Z WITH CARON
      case 0x9F:
        return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS

      // These byte values are undefined in windows-1252.
      case 0x81:
      case 0x8D:
      case 0x8F:
      case 0x90:
      case 0x9D:
        if (replacementCodepoint == null) {
          throw new ArgumentError(
              "Invalid windows-1252 code point $char at $_offset");
        }
        return replacementCodepoint;
    }
    return char;
  }
}
OLD | NEW |