Index: pkg/third_party/html5lib/lib/src/char_encodings.dart |
diff --git a/pkg/third_party/html5lib/lib/src/char_encodings.dart b/pkg/third_party/html5lib/lib/src/char_encodings.dart |
new file mode 100644 |
index 0000000000000000000000000000000000000000..610a8da9c1a6022ed1771b6ab40de91028fc3565 |
--- /dev/null |
+++ b/pkg/third_party/html5lib/lib/src/char_encodings.dart |
@@ -0,0 +1,212 @@ |
+/** Decodes bytes according to a named encoding. See [decodeBytes]. */ |
+library char_encodings; |
+ |
+import 'dart:collection'; |
+import 'dart:utf'; |
+ |
+// TODO(jmesserly): this function is conspicuously absent from dart:utf. |
+/** |
+ * Returns true if [bytes] has the UTF-8 byte order mark (`EF BB BF`) at |
+ * [offset]; [length] bounds how many bytes may be examined. UTF-8 has no |
+ * byte order, so "BOM" is a misnomer, but HTML uses it to detect UTF-8. |
+ */ |
+bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) { |
+  int end = length != null ? offset + length : bytes.length; |
+  // Clamp so an over-long [length] yields false, not an index error. |
+  if (end > bytes.length) end = bytes.length; |
+  return (offset + 3) <= end && bytes[offset] == 0xEF && |
+      bytes[offset + 1] == 0xBB && bytes[offset + 2] == 0xBF; |
+} |
+ |
+// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire |
+// file, but dart:utf does not expose stream-based decoders yet. |
+/** |
+ * Decodes the [bytes] with the provided [encoding] and returns an iterable for |
+ * the codepoints. Supports the major unicode encodings as well as ascii and |
+ * windows-1252 encodings. Decoding covers [length] bytes starting at [offset]; |
+ * a null [length] means every byte from [offset] to the end of [bytes]. |
+ * Throws an [ArgumentError] if [encoding] is not supported, and a |
+ * [FormatException] if 'ascii' input contains a byte above 127. |
+ */ |
+Iterable<int> decodeBytes(String encoding, List<int> bytes, |
+    [int offset = 0, int length, |
+    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+  // [length] counts bytes from [offset], so the default has to leave out the |
+  // skipped prefix. The decoders apply that default themselves when handed a |
+  // null; [count] is only for the arithmetic done in this function. |
+  final int count = length == null ? bytes.length - offset : length; |
+  final replace = replacementCodepoint; |
+  switch (encoding) { |
+    case 'ascii': |
+      bytes = bytes.sublist(offset, offset + count); |
+      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart |
+      for (int byte in bytes) { |
+        if (byte > 127) { |
+          // TODO(jmesserly): ideally this would be DecoderException, like the |
+          // one thrown in runtime/bin/string_stream.dart, but we don't want to |
+          // depend on dart:io. |
+          throw new FormatException("Illegal ASCII character $byte"); |
+        } |
+      } |
+      return bytes; |
+    case 'windows-1252': |
+    case 'cp1252': |
+      return decodeWindows1252AsIterable(bytes, offset, length, replace); |
+    case 'utf-8': |
+      // NOTE: to match the behavior of the other decode functions, we eat the |
+      // utf-8 BOM here. |
+      if (hasUtf8Bom(bytes, offset, length)) { |
+        return decodeUtf8AsIterable(bytes, offset + 3, count - 3, replace); |
+      } |
+      return decodeUtf8AsIterable(bytes, offset, length, replace); |
+    case 'utf-16': |
+      return decodeUtf16AsIterable(bytes, offset, length, replace); |
+    case 'utf-16-be': |
+      return decodeUtf16beAsIterable(bytes, offset, length, true, replace); |
+    case 'utf-16-le': |
+      return decodeUtf16leAsIterable(bytes, offset, length, true, replace); |
+    case 'utf-32': |
+      return decodeUtf32AsIterable(bytes, offset, length, replace); |
+    case 'utf-32-be': |
+      return decodeUtf32beAsIterable(bytes, offset, length, true, replace); |
+    case 'utf-32-le': |
+      return decodeUtf32leAsIterable(bytes, offset, length, true, replace); |
+    default: |
+      throw new ArgumentError('Encoding $encoding not supported'); |
+  } |
+} |
+ |
+ |
+// TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed. |
+/** |
+ * Converts [input] into a list of Unicode code points, combining each valid |
+ * UTF-16 surrogate pair into one value. Lone surrogates are kept as-is. |
+ */ |
+List<int> toCodepoints(String input) { |
+  final codepoints = <int>[]; |
+  int index = 0; |
+  while (index < input.length) { |
+    int unit = input.codeUnitAt(index++); |
+    // A lead surrogate only merges when a trail surrogate follows it. |
+    final bool isLead = unit >= 0xD800 && unit <= 0xDBFF; |
+    if (isLead && index < input.length) { |
+      final trail = input.codeUnitAt(index); |
+      if (trail >= 0xDC00 && trail <= 0xDFFF) { |
+        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00); |
+        index++; |
+      } |
+    } |
+    codepoints.add(unit); |
+  } |
+  return codepoints; |
+} |
+ |
+ |
+/** |
+ * Lazily decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) |
+ * [bytes], so the consumer converts only as much of the input as it needs. |
+ * With a null [replacementCodepoint], bad bytes throw an [ArgumentError]. |
+ */ |
+IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes, |
+    [int offset = 0, int length, |
+    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
+  final decoder = new IterableWindows1252Decoder( |
+      bytes, offset, length, replacementCodepoint); |
+  return decoder; |
+} |
+ |
+ |
+/** |
+ * Lazy [Iterable] view of windows-1252 [bytes] as Unicode codepoints, as |
+ * returned by [decodeWindows1252AsIterable]. Every [iterator] is a fresh |
+ * [Windows1252Decoder] that translates bytes on demand; nothing is cached. |
+ */ |
+class IterableWindows1252Decoder extends IterableBase<int> { |
+  final List<int> bytes; |
+  final int offset; |
+  final int length; |
+  final int replacementCodepoint; |
+ |
+  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length, |
+      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); |
+ |
+  Windows1252Decoder get iterator { |
+    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint); |
+  } |
+} |
+ |
+ |
+/** |
+ * Provides an iterator of Unicode codepoints from windows-1252 encoded bytes. |
+ * Decoding starts at [offset] and covers [length] bytes; a null [length] means |
+ * through the end of the list. Bytes that windows-1252 leaves undefined yield |
+ * [replacementCodepoint], or throw an [ArgumentError] when it is null. Wrap in |
+ * an [IterableWindows1252Decoder] to use this in a for-loop. |
+ */ |
+class Windows1252Decoder implements Iterator<int> { |
+  final int replacementCodepoint; |
+  final List<int> _bytes; |
+  final int _start; |
+  final int _end; |
+  int _offset; |
+ |
+  Windows1252Decoder(List<int> bytes, [int offset = 0, int length, |
+      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) |
+      : _bytes = bytes, |
+        _start = offset, |
+        // [length] is a count from [offset]; store an end index, capped at the |
+        // buffer so an over-long range cannot index past the last byte. |
+        _end = (length == null || offset + length > bytes.length) |
+            ? bytes.length : offset + length, |
+        _offset = offset - 1; |
+ |
+  bool get _inRange => _offset >= _start && _offset < _end; |
+  int get current => _inRange ? _mapChar(_bytes[_offset]) : null; |
+ |
+  bool moveNext() { |
+    _offset++; |
+    return _inRange; |
+  } |
+ |
+  int _mapChar(int char) { |
+    // TODO(jmesserly): this is duplicating entitiesWindows1252 and |
+    // replacementCharacters from constants.dart |
+    switch (char) { |
+      case 0x80: return 0x20AC; // EURO SIGN |
+      case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK |
+      case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK |
+      case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK |
+      case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS |
+      case 0x86: return 0x2020; // DAGGER |
+      case 0x87: return 0x2021; // DOUBLE DAGGER |
+      case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT |
+      case 0x89: return 0x2030; // PER MILLE SIGN |
+      case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON |
+      case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
+      case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE |
+      case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON |
+      case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK |
+      case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK |
+      case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK |
+      case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK |
+      case 0x95: return 0x2022; // BULLET |
+      case 0x96: return 0x2013; // EN DASH |
+      case 0x97: return 0x2014; // EM DASH |
+      case 0x98: return 0x02DC; // SMALL TILDE |
+      case 0x99: return 0x2122; // TRADE MARK SIGN |
+      case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON |
+      case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
+      case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE |
+      case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON |
+      case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS |
+      // The remaining bytes in 0x80-0x9F have no windows-1252 character. |
+      case 0x81: case 0x8D: case 0x8F: case 0x90: case 0x9D: |
+        if (replacementCodepoint == null) { |
+          throw new ArgumentError( |
+              "Invalid windows-1252 code point $char at $_offset"); |
+        } |
+        return replacementCodepoint; |
+    } |
+    return char; |
+  } |
+} |