Index: pkg/third_party/html5lib/lib/src/char_encodings.dart |
diff --git a/pkg/third_party/html5lib/lib/src/char_encodings.dart b/pkg/third_party/html5lib/lib/src/char_encodings.dart |
deleted file mode 100644 |
index 9c04999b9d88bedf6f0a08ea56daf773f054589c..0000000000000000000000000000000000000000 |
--- a/pkg/third_party/html5lib/lib/src/char_encodings.dart |
+++ /dev/null |
@@ -1,200 +0,0 @@ |
-/// Decodes bytes according to a named character encoding. See [decodeBytes]. |
-library char_encodings; |
- |
-import 'dart:collection'; |
-import 'package:utf/utf.dart'; |
- |
-// TODO(jmesserly): this function is conspicuously absent from dart:utf. |
-/// Returns true if [bytes] has a UTF-8 byte order mark (EF BB BF) at [offset]. |
-/// UTF-8 has no byte order, so "BOM" is a misnomer, but HTML uses the mark to |
-/// detect the UTF-8 encoding. [length] caps the range; null means "to the end". |
-bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) { |
-  // Clamp to the list so an overstated [length] cannot index past the end. |
-  int end = length != null ? offset + length : bytes.length; |
-  if (end > bytes.length) end = bytes.length; |
-  return (offset + 3) <= end && bytes[offset] == 0xEF && |
-      bytes[offset + 1] == 0xBB && bytes[offset + 2] == 0xBF; |
-} |
- |
-// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire |
-// file, but dart:utf does not expose stream-based decoders yet. |
-/// Decodes [bytes] as [encoding] (the major unicode encodings, ascii or |
-/// windows-1252) into an iterable of codepoints. A null [length] means "from |
-/// [offset] to the end". Throws [FormatException] for a non-ascii byte in |
-/// 'ascii' input and [ArgumentError] for an unsupported [encoding]. |
-Iterable<int> decodeBytes(String encoding, List<int> bytes, |
-    [int offset = 0, int length, |
-    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
-  // The windows-1252 decoder applies its own null-length default, so it is |
-  // handed the caller's value; everyone else gets "the rest after [offset]". |
-  final callerLength = length; |
-  if (length == null) length = bytes.length - offset; |
-  final replace = replacementCodepoint; |
-  switch (encoding) { |
-    case 'ascii': |
-      bytes = bytes.sublist(offset, offset + length); |
-      // TODO(jmesserly): taken from runtime/bin/string_stream.dart; ideally |
-      // this would throw its DecoderException, but we can't depend on dart:io. |
-      for (int byte in bytes) { |
-        if (byte > 127) { |
-          throw new FormatException("Illegal ASCII character $byte"); |
-        } |
-      } |
-      return bytes; |
- |
-    case 'windows-1252': |
-    case 'cp1252': |
-      return decodeWindows1252AsIterable(bytes, offset, callerLength, replace); |
- |
-    case 'utf-8': |
-      // NOTE: eat the utf-8 BOM here to match the other decode functions. |
-      if (hasUtf8Bom(bytes, offset, length)) { |
-        offset += 3; |
-        length -= 3; |
-      } |
-      return decodeUtf8AsIterable(bytes, offset, length, replace); |
- |
-    case 'utf-16': |
-      return decodeUtf16AsIterable(bytes, offset, length, replace); |
-    case 'utf-16-be': |
-      return decodeUtf16beAsIterable(bytes, offset, length, true, replace); |
-    case 'utf-16-le': |
-      return decodeUtf16leAsIterable(bytes, offset, length, true, replace); |
-    case 'utf-32': |
-      return decodeUtf32AsIterable(bytes, offset, length, replace); |
-    case 'utf-32-be': |
-      return decodeUtf32beAsIterable(bytes, offset, length, true, replace); |
-    case 'utf-32-le': |
-      return decodeUtf32leAsIterable(bytes, offset, length, true, replace); |
- |
-    default: |
-      throw new ArgumentError('Encoding $encoding not supported'); |
-  } |
-} |
- |
- |
-// TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed. |
-/// Converts [input] to a list of Unicode code points, much like |
-/// [String.charCodes] except that UTF-16 surrogate pairs are combined. |
-List<int> toCodepoints(String input) { |
-  final codepoints = <int>[]; |
-  var index = 0; |
-  while (index < input.length) { |
-    var unit = input.codeUnitAt(index++); |
-    // A lead surrogate immediately followed by a trail surrogate forms one |
-    // code point; an unpaired surrogate is passed through unchanged. |
-    if (unit >= 0xD800 && unit <= 0xDBFF && index < input.length) { |
-      final trail = input.codeUnitAt(index); |
-      if (trail >= 0xDC00 && trail <= 0xDFFF) { |
-        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00); |
-        index++; |
-      } |
-    } |
-    codepoints.add(unit); |
-  } |
-  return codepoints; |
-} |
- |
- |
-/// Wraps [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) [bytes] in |
-/// an iterable of codepoints that decodes on demand, so the consumer converts |
-/// only as much of the input as it reads. Pass null for |
-/// [replacementCodepoint] to get an [ArgumentError] on an invalid byte |
-/// instead of a replacement value. |
-IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes, |
-    [int offset = 0, int length, |
-    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) => |
-        new IterableWindows1252Decoder( |
-            bytes, offset, length, replacementCodepoint); |
- |
- |
-/// The iterable handed out by [decodeWindows1252AsIterable]. It only stores |
-/// the decoding arguments; each [iterator] is a new [Windows1252Decoder] that |
-/// translates bytes as they are requested. (Note: results are not cached.) |
-class IterableWindows1252Decoder extends IterableBase<int> { |
-  final List<int> bytes; |
-  final int offset, length; |
-  final int replacementCodepoint; |
- |
-  IterableWindows1252Decoder(this.bytes, |
-      [this.offset = 0, this.length, |
-      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); |
- |
-  Windows1252Decoder get iterator { |
-    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint); |
-  } |
-} |
- |
- |
-/// Iterates over the Unicode codepoints of windows-1252 encoded bytes. Decoding |
-/// starts at `offset` and covers `length` bytes, or everything up to the end of |
-/// the list when `length` is null; a range that overruns the list is clamped. |
-/// Bytes with no windows-1252 mapping yield [replacementCodepoint], or throw |
-/// an [ArgumentError] from [current] when [replacementCodepoint] is null. |
-/// Results are not cached: every read of [current] maps the byte again. |
-class Windows1252Decoder implements Iterator<int> { |
-  final int replacementCodepoint; |
-  final List<int> _bytes; |
-  int _offset; |
-  /// Exclusive end index into [_bytes]: start offset plus length, clamped. |
-  final int _end; |
- |
-  Windows1252Decoder(List<int> bytes, [int offset = 0, int length, |
-      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) |
-      : _bytes = bytes, |
-        // One before the first byte, so the first moveNext() lands on it. |
-        _offset = offset - 1, |
-        _end = (length == null || offset + length > bytes.length) |
-            ? bytes.length : offset + length; |
- |
-  bool get _inRange => _offset >= 0 && _offset < _end; |
-  int get current => _inRange ? _mapChar(_bytes[_offset]) : null; |
- |
-  bool moveNext() { |
-    _offset++; |
-    return _inRange; |
-  } |
- |
-  int _mapChar(int char) { |
-    // TODO(jmesserly): this is duplicating entitiesWindows1252 and |
-    // replacementCharacters from constants.dart |
-    switch (char) { |
-      case 0x80: return 0x20AC; // EURO SIGN |
-      case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK |
-      case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK |
-      case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK |
-      case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS |
-      case 0x86: return 0x2020; // DAGGER |
-      case 0x87: return 0x2021; // DOUBLE DAGGER |
-      case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT |
-      case 0x89: return 0x2030; // PER MILLE SIGN |
-      case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON |
-      case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
-      case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE |
-      case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON |
-      case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK |
-      case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK |
-      case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK |
-      case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK |
-      case 0x95: return 0x2022; // BULLET |
-      case 0x96: return 0x2013; // EN DASH |
-      case 0x97: return 0x2014; // EM DASH |
-      case 0x98: return 0x02DC; // SMALL TILDE |
-      case 0x99: return 0x2122; // TRADE MARK SIGN |
-      case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON |
-      case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
-      case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE |
-      case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON |
-      case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS |
- |
-      // These five bytes are treated as invalid windows-1252 code points. |
-      case 0x81: case 0x8D: case 0x8F: case 0x90: case 0x9D: |
-        if (replacementCodepoint == null) { |
-          throw new ArgumentError( |
-              "Invalid windows-1252 code point $char at $_offset"); |
-        } |
-        return replacementCodepoint; |
-    } |
-    return char; |
-  } |
-} |