| Index: third_party/pkg/html5lib/lib/src/char_encodings.dart
|
| diff --git a/third_party/pkg/html5lib/lib/src/char_encodings.dart b/third_party/pkg/html5lib/lib/src/char_encodings.dart
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..610a8da9c1a6022ed1771b6ab40de91028fc3565
|
| --- /dev/null
|
| +++ b/third_party/pkg/html5lib/lib/src/char_encodings.dart
|
| @@ -0,0 +1,212 @@
|
| +/** Decodes bytes using the correct name. See [decodeBytes]. */
|
| +library char_encodings;
|
| +
|
| +import 'dart:collection';
|
| +import 'dart:utf';
|
| +
|
| +// TODO(jmesserly): this function is conspicuously absent from dart:utf.
|
| +/**
|
| + * Reports whether [bytes] begins with the three-byte UTF-8 byte order mark
|
| + * (0xEF 0xBB 0xBF). UTF-8 has no byte order, so the name is somewhat of a
|
| + * misnomer, but HTML uses this signature to detect the UTF-8 encoding.
|
| + *
|
| + * The check looks at [length] bytes starting at [offset]; when [length] is
|
| + * omitted the window extends to the end of [bytes].
|
| + */
|
| +bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
|
| +  final end = length == null ? bytes.length : offset + length;
|
| +  // A window shorter than three bytes cannot hold the signature.
|
| +  if (end - offset < 3) return false;
|
| +  return bytes[offset] == 0xEF &&
|
| +      bytes[offset + 1] == 0xBB &&
|
| +      bytes[offset + 2] == 0xBF;
|
| +}
|
| +
|
| +// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
|
| +// file, but dart:utf does not expose stream-based decoders yet.
|
| +/**
|
| + * Decodes the [bytes] with the provided [encoding] and returns an iterable for
|
| + * the codepoints. Supports the major unicode encodings as well as ascii and
|
| + * windows-1252 encodings.
|
| + *
|
| + * Decoding covers [length] bytes starting at [offset]; when [length] is
|
| + * omitted it defaults to the bytes remaining after [offset].
|
| + * [replacementCodepoint] is forwarded to the windows-1252 and UTF decoders;
|
| + * the 'ascii' case does not use it.
|
| + *
|
| + * Throws a [FormatException] if 'ascii' input contains a byte above 127, and
|
| + * an [ArgumentError] if [encoding] is not one of the supported names.
|
| + */
|
| +Iterable<int> decodeBytes(String encoding, List<int> bytes,
|
| +    [int offset = 0, int length,
|
| +    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
|
| +  // Default to what is left after [offset]. Defaulting to bytes.length would
|
| +  // make `offset + length` overshoot the list whenever offset is non-zero.
|
| +  if (length == null) length = bytes.length - offset;
|
| +  final replace = replacementCodepoint;
|
| +  switch (encoding) {
|
| +    case 'ascii':
|
| +      bytes = bytes.sublist(offset, offset + length);
|
| +      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
|
| +      for (int byte in bytes) {
|
| +        if (byte > 127) {
|
| +          // TODO(jmesserly): ideally this would be DecoderException, like the
|
| +          // one thrown in runtime/bin/string_stream.dart, but we don't want to
|
| +          // depend on dart:io.
|
| +          throw new FormatException("Illegal ASCII character $byte");
|
| +        }
|
| +      }
|
| +      return bytes;
|
| +
|
| +    case 'windows-1252':
|
| +    case 'cp1252':
|
| +      // Slice first, as the ASCII case does, so the decoder is always given a
|
| +      // zero-based range of exactly [length] bytes.
|
| +      if (offset != 0) bytes = bytes.sublist(offset, offset + length);
|
| +      return decodeWindows1252AsIterable(bytes, 0, length, replace);
|
| +
|
| +    case 'utf-8':
|
| +      // NOTE: to match the behavior of the other decode functions, we eat the
|
| +      // utf-8 BOM here.
|
| +      if (hasUtf8Bom(bytes, offset, length)) {
|
| +        offset += 3;
|
| +        length -= 3;
|
| +      }
|
| +      return decodeUtf8AsIterable(bytes, offset, length, replace);
|
| +
|
| +    case 'utf-16':
|
| +      return decodeUtf16AsIterable(bytes, offset, length, replace);
|
| +    case 'utf-16-be':
|
| +      return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
|
| +    case 'utf-16-le':
|
| +      return decodeUtf16leAsIterable(bytes, offset, length, true, replace);
|
| +
|
| +    case 'utf-32':
|
| +      return decodeUtf32AsIterable(bytes, offset, length, replace);
|
| +    case 'utf-32-be':
|
| +      return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
|
| +    case 'utf-32-le':
|
| +      return decodeUtf32leAsIterable(bytes, offset, length, true, replace);
|
| +
|
| +    default:
|
| +      throw new ArgumentError('Encoding $encoding not supported');
|
| +  }
|
| +}
|
| +
|
| +
|
| +// TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed.
|
| +/**
|
| + * Converts [input] to a list of Unicode code points. Unlike a plain walk over
|
| + * the string's UTF-16 code units, a high surrogate (0xD800-0xDBFF) that is
|
| + * immediately followed by a low surrogate (0xDC00-0xDFFF) is combined into
|
| + * the single code point the pair encodes. Unpaired surrogates are passed
|
| + * through unchanged.
|
| + */
|
| +List<int> toCodepoints(String input) {
|
| +  final codepoints = <int>[];
|
| +  var index = 0;
|
| +  while (index < input.length) {
|
| +    var unit = input.codeUnitAt(index++);
|
| +    final isLead = unit >= 0xD800 && unit <= 0xDBFF;
|
| +    // [index] now points at the unit after the one just read.
|
| +    if (isLead && index < input.length) {
|
| +      final trail = input.codeUnitAt(index);
|
| +      if (trail >= 0xDC00 && trail <= 0xDFFF) {
|
| +        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
|
| +        // Consume the trail surrogate as part of this code point.
|
| +        index++;
|
| +      }
|
| +    }
|
| +    codepoints.add(unit);
|
| +  }
|
| +  return codepoints;
|
| +}
|
| +
|
| +
|
| +/**
|
| + * Lazily decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252)
|
| + * [bytes], returning an iterable so the consumer converts only as much of the
|
| + * input as it actually reads. [offset], [length] and [replacementCodepoint]
|
| + * are forwarded unchanged to [IterableWindows1252Decoder]. Pass null as
|
| + * [replacementCodepoint] to get an [ArgumentError] for an invalid byte
|
| + * instead of a replacement.
|
| + */
|
| +IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
|
| +    [int offset = 0, int length,
|
| +    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
|
| +    new IterableWindows1252Decoder(bytes, offset, length, replacementCodepoint);
|
| +
|
| +
|
| +/**
|
| + * Iterable view over windows-1252 encoded bytes, as returned by
|
| + * [decodeWindows1252AsIterable]. Every read of [iterator] creates a new
|
| + * [Windows1252Decoder] from the same arguments, so bytes are translated only
|
| + * as that iterator advances and no results are cached between iterations.
|
| + */
|
| +class IterableWindows1252Decoder extends IterableBase<int> {
|
| +  /** The encoded input handed to every decoder. */
|
| +  final List<int> bytes;
|
| +
|
| +  /** Offset argument handed to every decoder. */
|
| +  final int offset;
|
| +
|
| +  /** Length argument handed to every decoder; null when not supplied. */
|
| +  final int length;
|
| +
|
| +  /** Replacement argument handed to every decoder. */
|
| +  final int replacementCodepoint;
|
| +
|
| +  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
|
| +      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);
|
| +
|
| +  Windows1252Decoder get iterator {
|
| +    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
|
| +  }
|
| +}
|
| +
|
| +
|
| +/**
|
| + * Iterator producing Unicode codepoints from windows-1252 encoded bytes.
|
| + *
|
| + * Decoding starts at the given offset and covers the given length in bytes;
|
| + * when the length is omitted it runs to the end of the byte list. The range
|
| + * is clamped so that it never extends past the end of the list.
|
| + *
|
| + * Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D are treated as invalid: they yield
|
| + * [replacementCodepoint], or throw an [ArgumentError] when
|
| + * [replacementCodepoint] is null. [current] is null before the first
|
| + * [moveNext] and once the range is exhausted.
|
| + */
|
| +class Windows1252Decoder implements Iterator<int> {
|
| +  final int replacementCodepoint;
|
| +  final List<int> _bytes;
|
| +
|
| +  /** Index of the first byte to decode. */
|
| +  final int _start;
|
| +
|
| +  /** Index one past the last byte to decode; never above the list length. */
|
| +  final int _end;
|
| +
|
| +  /** Index of the current byte; one before [_start] until [moveNext] runs. */
|
| +  int _offset;
|
| +
|
| +  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
|
| +      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
|
| +      : _bytes = bytes,
|
| +        _start = offset,
|
| +        _end = _endOfRange(bytes, offset, length),
|
| +        _offset = offset - 1;
|
| +
|
| +  /**
|
| +   * Computes the exclusive end index for a range of [length] bytes starting
|
| +   * at [offset]. A null [length] means "to the end of [bytes]", and the
|
| +   * result is capped at the list length.
|
| +   */
|
| +  static int _endOfRange(List<int> bytes, int offset, int length) {
|
| +    final end = length == null ? bytes.length : offset + length;
|
| +    return end < bytes.length ? end : bytes.length;
|
| +  }
|
| +
|
| +  // [_offset] is an absolute index into [_bytes], so it must be compared
|
| +  // against absolute bounds rather than against the requested length.
|
| +  bool get _inRange => _offset >= _start && _offset < _end;
|
| +  int get current => _inRange ? _mapChar(_bytes[_offset]) : null;
|
| +
|
| +  bool moveNext() {
|
| +    _offset++;
|
| +    return _inRange;
|
| +  }
|
| +
|
| +  /**
|
| +   * Maps one windows-1252 byte to its Unicode codepoint. Values without a
|
| +   * case below are returned unchanged.
|
| +   */
|
| +  int _mapChar(int char) {
|
| +    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
|
| +    // replacementCharacters from constants.dart
|
| +    switch (char) {
|
| +      case 0x80: return 0x20AC; // EURO SIGN
|
| +      case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK
|
| +      case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK
|
| +      case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
|
| +      case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS
|
| +      case 0x86: return 0x2020; // DAGGER
|
| +      case 0x87: return 0x2021; // DOUBLE DAGGER
|
| +      case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
|
| +      case 0x89: return 0x2030; // PER MILLE SIGN
|
| +      case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
|
| +      case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
| +      case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE
|
| +      case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
|
| +      case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK
|
| +      case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK
|
| +      case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK
|
| +      case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK
|
| +      case 0x95: return 0x2022; // BULLET
|
| +      case 0x96: return 0x2013; // EN DASH
|
| +      case 0x97: return 0x2014; // EM DASH
|
| +      case 0x98: return 0x02DC; // SMALL TILDE
|
| +      case 0x99: return 0x2122; // TRADE MARK SIGN
|
| +      case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON
|
| +      case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
| +      case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE
|
| +      case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON
|
| +      case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS
|
| +
|
| +      // Bytes this decoder treats as having no windows-1252 mapping.
|
| +      case 0x81:
|
| +      case 0x8D:
|
| +      case 0x8F:
|
| +      case 0x90:
|
| +      case 0x9D:
|
| +        if (replacementCodepoint == null) {
|
| +          throw new ArgumentError(
|
| +              "Invalid windows-1252 code point $char at $_offset");
|
| +        }
|
| +        return replacementCodepoint;
|
| +    }
|
| +    return char;
|
| +  }
|
| +}
|
|
|