| OLD | NEW | 
|---|
|  | (Empty) | 
| 1 /// Decodes bytes using the correct name. See [decodeBytes]. |  | 
| 2 library char_encodings; |  | 
| 3 |  | 
| 4 import 'dart:collection'; |  | 
| 5 import 'package:utf/utf.dart'; |  | 
| 6 |  | 
// TODO(jmesserly): this function is conspicuously absent from dart:utf.
/// Returns true if [bytes] contains a UTF-8 byte order mark at [offset].
///
/// UTF-8 has no byte order, so "byte order mark" is somewhat of a misnomer,
/// but the three-byte sequence `EF BB BF` is used in HTML to detect the UTF-8
/// encoding.
///
/// Only the [length] bytes starting at [offset] are examined; when [length]
/// is omitted the range runs to the end of [bytes]. A range that extends past
/// the end of [bytes] is clamped to the buffer, and a negative [offset] yields
/// false, so this never raises a [RangeError].
bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  int end = length != null ? offset + length : bytes.length;
  // Callers may pass a length that overshoots the buffer; never read past it.
  if (end > bytes.length) end = bytes.length;
  return offset >= 0 &&
      (offset + 3) <= end &&
      bytes[offset] == 0xEF &&
      bytes[offset + 1] == 0xBB &&
      bytes[offset + 2] == 0xBF;
}
| 18 |  | 
// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
// file, but dart:utf does not expose stream-based decoders yet.
/// Decodes the [bytes] with the provided [encoding] and returns an iterable
/// for the codepoints.
///
/// Supports the major unicode encodings as well as the ascii and windows-1252
/// encodings. The [encoding] name is matched exactly against: `ascii`,
/// `windows-1252`, `cp1252`, `utf-8`, `utf-16`, `utf-16-be`, `utf-16-le`,
/// `utf-32`, `utf-32-be` and `utf-32-le`.
///
/// Decoding starts at [offset] and covers [length] bytes; when [length] is
/// omitted it covers everything from [offset] to the end of [bytes].
/// [replacementCodepoint] is forwarded to the underlying decoders.
///
/// Throws a [FormatException] if [encoding] is `ascii` and a byte above 127 is
/// found, and an [ArgumentError] if [encoding] is not supported.
Iterable<int> decodeBytes(String encoding, List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Number of bytes to decode. Without an explicit [length] the range runs
  // from [offset] to the end of the buffer; using `bytes.length` here would
  // overshoot the buffer whenever [offset] is non-zero.
  int count = length != null ? length : bytes.length - offset;
  final replace = replacementCodepoint;
  switch (encoding) {
    case 'ascii':
      bytes = bytes.sublist(offset, offset + count);
      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
      for (int byte in bytes) {
        if (byte > 127) {
          // TODO(jmesserly): ideally this would be DecoderException, like the
          // one thrown in runtime/bin/string_stream.dart, but we don't want to
          // depend on dart:io.
          throw new FormatException("Illegal ASCII character $byte");
        }
      }
      return bytes;

    case 'windows-1252':
    case 'cp1252':
      // Forward the caller's own [length] (possibly null) so that the
      // windows-1252 decoder applies its own default for the end of the range.
      return decodeWindows1252AsIterable(bytes, offset, length, replace);

    case 'utf-8':
      // NOTE: to match the behavior of the other decode functions, we eat the
      // utf-8 BOM here.
      if (hasUtf8Bom(bytes, offset, count)) {
        offset += 3;
        count -= 3;
      }
      return decodeUtf8AsIterable(bytes, offset, count, replace);

    case 'utf-16':
      return decodeUtf16AsIterable(bytes, offset, count, replace);
    case 'utf-16-be':
      return decodeUtf16beAsIterable(bytes, offset, count, true, replace);
    case 'utf-16-le':
      return decodeUtf16leAsIterable(bytes, offset, count, true, replace);

    case 'utf-32':
      return decodeUtf32AsIterable(bytes, offset, count, replace);
    case 'utf-32-be':
      return decodeUtf32beAsIterable(bytes, offset, count, true, replace);
    case 'utf-32-le':
      return decodeUtf32leAsIterable(bytes, offset, count, true, replace);

    default:
      throw new ArgumentError('Encoding $encoding not supported');
  }
}
| 74 |  | 
// TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed.
/// Converts [input] into a list of Unicode code points.
///
/// Behaves like reading the string's UTF-16 code units one by one, except
/// that a high surrogate immediately followed by a low surrogate is combined
/// into the single supplementary code point the pair encodes. Surrogates that
/// are not part of such a pair are emitted unchanged.
List<int> toCodepoints(String input) {
  final codepoints = <int>[];
  final count = input.length;
  var index = 0;
  while (index < count) {
    final lead = input.codeUnitAt(index++);
    final isHighSurrogate = lead >= 0xD800 && lead <= 0xDBFF;
    if (isHighSurrogate && index < count) {
      final trail = input.codeUnitAt(index);
      if (trail >= 0xDC00 && trail <= 0xDFFF) {
        // Valid pair: consume the low surrogate too and emit one code point.
        codepoints.add(0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00));
        index++;
        continue;
      }
    }
    codepoints.add(lead);
  }
  return codepoints;
}
| 96 |  | 
/// Lazily decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252)
/// [bytes] into Unicode code points.
///
/// The result is an iterable, so the consumer only converts as much of the
/// input as it actually iterates over. [offset] and [length] are handed to the
/// decoder unchanged. Pass null as [replacementCodepoint] to have an
/// [ArgumentError] thrown for a byte with no windows-1252 mapping instead of
/// substituting a replacement.
IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
        [int offset = 0, int length,
        int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableWindows1252Decoder(
        bytes, offset, length, replacementCodepoint);
| 107 |  | 
/// Return type of [decodeWindows1252AsIterable] and variants.
///
/// Each access to [iterator] creates a fresh [Windows1252Decoder] over the
/// same bytes; that iterator only translates bytes as they are requested.
/// (Note: results are not cached.)
class IterableWindows1252Decoder extends IterableBase<int> {
  /// The windows-1252 encoded input.
  final List<int> bytes;

  /// Position in [bytes] handed to each new decoder.
  final int offset;

  /// Length handed to each new decoder; may be null.
  final int length;

  /// Code point handed to each new decoder for unmappable bytes.
  final int replacementCodepoint;

  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Windows1252Decoder get iterator {
    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
  }
}
| 124 |  | 
/// Provides an iterator of Unicode codepoints from windows-1252 encoded bytes.
///
/// The constructor parameters set an offset into a list of bytes (as int),
/// limit the number of bytes to be decoded, and override the default Unicode
/// replacement character. Set [replacementCodepoint] to null to throw an
/// [ArgumentError] rather than replace a byte that has no windows-1252
/// mapping.
///
/// Decoding covers `length` bytes starting at `offset`; when `length` is
/// omitted it runs to the end of the input. A range that extends past the end
/// of the input is clamped to the input.
class Windows1252Decoder implements Iterator<int> {
  /// Code point returned for unmappable bytes, or null to throw instead.
  final int replacementCodepoint;
  final List<int> _bytes;

  /// Index of the first byte to decode.
  final int _start;

  /// Index one past the last byte to decode.
  final int _end;

  /// Index of the current byte; `_start - 1` before the first [moveNext].
  int _offset;

  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : _bytes = bytes,
        _start = offset,
        _end = _endOf(bytes, offset, length),
        _offset = offset - 1;

  /// Computes the exclusive end index of the decoded range.
  ///
  /// [length] counts bytes from [offset]; the result never exceeds
  /// `bytes.length` so iteration cannot index past the buffer.
  static int _endOf(List<int> bytes, int offset, int length) {
    if (length == null) return bytes.length;
    final end = offset + length;
    return end < bytes.length ? end : bytes.length;
  }

  bool get _inRange => _offset >= _start && _offset < _end;

  /// The code point for the current byte, or null when the iterator is
  /// positioned before the first or after the last byte of the range.
  int get current => _inRange ? _mapChar(_bytes[_offset]) : null;

  bool moveNext() {
    _offset++;
    return _inRange;
  }

  /// Maps a single windows-1252 byte to its Unicode code point.
  ///
  /// Bytes outside 0x80-0x9F are returned unchanged. The five bytes in that
  /// range without a mapping produce [replacementCodepoint], or throw an
  /// [ArgumentError] when it is null.
  int _mapChar(int char) {
    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
    // replacementCharacters from constants.dart
    switch (char) {
      case 0x80:
        return 0x20AC; // EURO SIGN
      case 0x82:
        return 0x201A; // SINGLE LOW-9 QUOTATION MARK
      case 0x83:
        return 0x0192; // LATIN SMALL LETTER F WITH HOOK
      case 0x84:
        return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
      case 0x85:
        return 0x2026; // HORIZONTAL ELLIPSIS
      case 0x86:
        return 0x2020; // DAGGER
      case 0x87:
        return 0x2021; // DOUBLE DAGGER
      case 0x88:
        return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
      case 0x89:
        return 0x2030; // PER MILLE SIGN
      case 0x8A:
        return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
      case 0x8B:
        return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      case 0x8C:
        return 0x0152; // LATIN CAPITAL LIGATURE OE
      case 0x8E:
        return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
      case 0x91:
        return 0x2018; // LEFT SINGLE QUOTATION MARK
      case 0x92:
        return 0x2019; // RIGHT SINGLE QUOTATION MARK
      case 0x93:
        return 0x201C; // LEFT DOUBLE QUOTATION MARK
      case 0x94:
        return 0x201D; // RIGHT DOUBLE QUOTATION MARK
      case 0x95:
        return 0x2022; // BULLET
      case 0x96:
        return 0x2013; // EN DASH
      case 0x97:
        return 0x2014; // EM DASH
      case 0x98:
        return 0x02DC; // SMALL TILDE
      case 0x99:
        return 0x2122; // TRADE MARK SIGN
      case 0x9A:
        return 0x0161; // LATIN SMALL LETTER S WITH CARON
      case 0x9B:
        return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      case 0x9C:
        return 0x0153; // LATIN SMALL LIGATURE OE
      case 0x9E:
        return 0x017E; // LATIN SMALL LETTER Z WITH CARON
      case 0x9F:
        return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS

      // These five bytes have no windows-1252 mapping.
      case 0x81:
      case 0x8D:
      case 0x8F:
      case 0x90:
      case 0x9D:
        if (replacementCodepoint == null) {
          throw new ArgumentError(
              "Invalid windows-1252 code point $char at $_offset");
        }
        return replacementCodepoint;
    }
    return char;
  }
}
| OLD | NEW | 
|---|