| Index: third_party/pkg/html5lib/lib/src/encoding_parser.dart
|
| diff --git a/third_party/pkg/html5lib/lib/src/encoding_parser.dart b/third_party/pkg/html5lib/lib/src/encoding_parser.dart
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..7b0edb8adfb3965be8886418f7d90911cd598841
|
| --- /dev/null
|
| +++ b/third_party/pkg/html5lib/lib/src/encoding_parser.dart
|
| @@ -0,0 +1,385 @@
|
| +library encoding_parser;
|
| +
|
| +import 'dart:collection';
|
| +import 'constants.dart';
|
| +import 'inputstream.dart';
|
| +import 'utils.dart';
|
| +
|
| +// TODO(jmesserly): I converted StopIteration to StateError("No more elements").
|
| +// Seems strange to throw this from outside of an iterator though.
|
| +/**
|
| + * String-like object with an associated position and various extra methods.
|
| + *
|
| + * The position starts at -1, i.e. before the first character. Once it has
|
| + * reached or passed the end of the string, reading [position] (and every
|
| + * member that goes through it) throws a [StateError]; that error stands in
|
| + * for the StopIteration of the Python code this was converted from.
|
| + */
|
| +class EncodingBytes extends IterableBase<String> {
|
| +  final String _bytes;
|
| +  int _position = -1;
|
| +
|
| +  EncodingBytes(this._bytes);
|
| +
|
| +  /**
|
| +   * Iterates over the individual characters. This walks a separate list and
|
| +   * neither reads nor moves [position].
|
| +   */
|
| +  Iterator<String> get iterator => _bytes.split('').iterator;
|
| +
|
| +  /** The number of characters in the underlying string. */
|
| +  int get length => _bytes.length;
|
| +
|
| +  /**
|
| +   * Moves the position forward by one and returns the character there.
|
| +   * Throws a [StateError] when that steps past the last character.
|
| +   */
|
| +  String next() {
|
| +    var p = _position = _position + 1;
|
| +    if (p >= length) {
|
| +      throw new StateError("No more elements");
|
| +    } else if (p < 0) {
|
| +      throw new RangeError(p);
|
| +    }
|
| +    return _bytes[p];
|
| +  }
|
| +
|
| +  /**
|
| +   * Moves the position back by one and returns the character there.
|
| +   * Throws a [StateError] if the position is already at or past the end and
|
| +   * a [RangeError] if it is still before the first character.
|
| +   */
|
| +  String previous() {
|
| +    var p = _position;
|
| +    if (p >= length) {
|
| +      throw new StateError("No more elements");
|
| +    } else if (p < 0) {
|
| +      throw new RangeError(p);
|
| +    }
|
| +    _position = p = p - 1;
|
| +    return _bytes[p];
|
| +  }
|
| +
|
| +  /**
|
| +   * Sets the position to [value]. The check is made against the position
|
| +   * held before the assignment, so [value] itself is not validated here.
|
| +   */
|
| +  set position(int value) {
|
| +    if (_position >= length) {
|
| +      throw new StateError("No more elements");
|
| +    }
|
| +    _position = value;
|
| +  }
|
| +
|
| +  /**
|
| +   * The current position, reported as 0 while it is still before the first
|
| +   * character. Throws a [StateError] once the position is at or past the end.
|
| +   */
|
| +  int get position {
|
| +    if (_position >= length) {
|
| +      throw new StateError("No more elements");
|
| +    }
|
| +    if (_position >= 0) {
|
| +      return _position;
|
| +    } else {
|
| +      return 0;
|
| +    }
|
| +  }
|
| +
|
| +  /** The character at [position]. */
|
| +  String get currentByte => _bytes[position];
|
| +
|
| +  /**
|
| +   * Skip past a list of characters. Defaults to skipping [isWhitespace].
|
| +   *
|
| +   * Returns the first character that is not skipped and leaves the position
|
| +   * on it. If only skippable characters remain, moves the position to the
|
| +   * end and returns null.
|
| +   */
|
| +  String skipChars([CharPreciate skipChars]) {
|
| +    if (skipChars == null) skipChars = isWhitespace;
|
| +    var p = position;  // use property for the error-checking
|
| +    while (p < length) {
|
| +      var c = _bytes[p];
|
| +      if (!skipChars(c)) {
|
| +        _position = p;
|
| +        return c;
|
| +      }
|
| +      p += 1;
|
| +    }
|
| +    _position = p;
|
| +    return null;
|
| +  }
|
| +
|
| +  /**
|
| +   * Skip forward to the first character accepted by [untilChars].
|
| +   *
|
| +   * Returns that character and leaves the position on it. If no such
|
| +   * character remains, moves the position to the end and returns null, the
|
| +   * same way [skipChars] does; later position reads then throw [StateError].
|
| +   */
|
| +  String skipUntil(CharPreciate untilChars) {
|
| +    var p = position;
|
| +    while (p < length) {
|
| +      var c = _bytes[p];
|
| +      if (untilChars(c)) {
|
| +        _position = p;
|
| +        return c;
|
| +      }
|
| +      p += 1;
|
| +    }
|
| +    // Nothing matched: record that the data is exhausted, so callers that
|
| +    // rely on the StateError from the position getter see it.
|
| +    _position = p;
|
| +    return null;
|
| +  }
|
| +
|
| +  /**
|
| +   * Look for a sequence of bytes at the current position. If the bytes
|
| +   * are found return true and advance the position to the byte after the
|
| +   * match. Otherwise return false and leave the position alone.
|
| +   */
|
| +  bool matchBytes(String bytes) {
|
| +    var p = position;
|
| +    if (_bytes.length < p + bytes.length) {
|
| +      return false;
|
| +    }
|
| +    var data = _bytes.substring(p, p + bytes.length);
|
| +    if (data == bytes) {
|
| +      position += bytes.length;
|
| +      return true;
|
| +    }
|
| +    return false;
|
| +  }
|
| +
|
| +  /**
|
| +   * Look for the next sequence of bytes matching a given sequence. If
|
| +   * a match is found advance the position to the last byte of the match
|
| +   * and return true. Throws a [StateError] if there is no further match.
|
| +   */
|
| +  bool jumpTo(String bytes) {
|
| +    var newPosition = _bytes.indexOf(bytes, position);
|
| +    if (newPosition >= 0) {
|
| +      _position = newPosition + bytes.length - 1;
|
| +      return true;
|
| +    } else {
|
| +      throw new StateError("No more elements");
|
| +    }
|
| +  }
|
| +
|
| +  /**
|
| +   * Returns the characters from [start] (inclusive) up to [end] (exclusive).
|
| +   * [end] defaults to [length]; a negative [end] counts back from the end.
|
| +   */
|
| +  String slice(int start, [int end]) {
|
| +    if (end == null) end = length;
|
| +    if (end < 0) end += length;
|
| +    // String.substring takes an end index, not a length.
|
| +    return _bytes.substring(start, end);
|
| +  }
|
| +}
|
| +
|
| +/**
|
| + * Mini parser for detecting character encoding from meta elements.
|
| + *
|
| + * The input is lower-cased up front, and the handlers signal "ran out of
|
| + * data" by letting the [StateError] thrown by [EncodingBytes] propagate.
|
| + */
|
| +class EncodingParser {
|
| +  final EncodingBytes data;
|
| +  String encoding;
|
| +
|
| +  /** [bytes] - the data to work on for encoding detection. */
|
| +  EncodingParser(List<int> bytes)
|
| +      // Note: this is intentionally interpreting bytes as codepoints.
|
| +      : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());
|
| +
|
| +  /**
|
| +   * Scans [data] for a meta element that declares an encoding and returns
|
| +   * the value stored in [encoding]: the codec name found, or whatever
|
| +   * [encoding] held before (null by default) if the data runs out first.
|
| +   */
|
| +  String getEncoding() {
|
| +    // Order matters: longer prefixes must be tried before the bare "<".
|
| +    final methodDispatch = [
|
| +        ["<!--", handleComment],
|
| +        ["<meta", handleMeta],
|
| +        ["</", handlePossibleEndTag],
|
| +        ["<!", handleOther],
|
| +        ["<?", handleOther],
|
| +        ["<", handlePossibleStartTag]];
|
| +
|
| +    try {
|
| +      while (true) {
|
| +        // Step onto the next byte. This throws StateError once the data is
|
| +        // exhausted, which ends the scan in the catch clause below. A for-in
|
| +        // over [data] cannot be used here because its iterator does not
|
| +        // move the position that matchBytes and the handlers work from.
|
| +        data.next();
|
| +        var keepParsing = true;
|
| +        for (var dispatch in methodDispatch) {
|
| +          if (data.matchBytes(dispatch[0])) {
|
| +            try {
|
| +              keepParsing = dispatch[1]();
|
| +            } on StateError catch (_) {
|
| +              // The handler ran off the end of the data: stop scanning.
|
| +              keepParsing = false;
|
| +            }
|
| +            break;
|
| +          }
|
| +        }
|
| +        if (!keepParsing) {
|
| +          break;
|
| +        }
|
| +      }
|
| +    } on StateError catch (_) {
|
| +      // Catch this here to match behavior of Python's StopIteration
|
| +    }
|
| +    return encoding;
|
| +  }
|
| +
|
| +  /** Skip over comments. */
|
| +  bool handleComment() => data.jumpTo("-->");
|
| +
|
| +  /**
|
| +   * Reads the attributes of a meta element. Returns false once an attribute
|
| +   * yields a name accepted by [codecName] (stored in [encoding]), and true
|
| +   * if scanning should continue.
|
| +   */
|
| +  bool handleMeta() {
|
| +    if (!isWhitespace(data.currentByte)) {
|
| +      // if we have <meta not followed by a space so just keep going
|
| +      return true;
|
| +    }
|
| +    // We have a valid meta element we want to search for attributes
|
| +    while (true) {
|
| +      // Try to find the next attribute after the current position
|
| +      var attr = getAttribute();
|
| +      if (attr == null) return true;
|
| +
|
| +      if (attr[0] == "charset") {
|
| +        var tentativeEncoding = attr[1];
|
| +        var codec = codecName(tentativeEncoding);
|
| +        if (codec != null) {
|
| +          encoding = codec;
|
| +          return false;
|
| +        }
|
| +      } else if (attr[0] == "content") {
|
| +        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
|
| +        var tentativeEncoding = contentParser.parse();
|
| +        var codec = codecName(tentativeEncoding);
|
| +        if (codec != null) {
|
| +          encoding = codec;
|
| +          return false;
|
| +        }
|
| +      }
|
| +    }
|
| +  }
|
| +
|
| +  /** Handles the bytes after a "<" that may open a start tag. */
|
| +  bool handlePossibleStartTag() => handlePossibleTag(false);
|
| +
|
| +  /** Handles the bytes after a "</" that may open an end tag. */
|
| +  bool handlePossibleEndTag() {
|
| +    data.next();
|
| +    return handlePossibleTag(true);
|
| +  }
|
| +
|
| +  /**
|
| +   * Skips over a tag and its attributes. Always returns true, i.e. asks the
|
| +   * caller to keep scanning.
|
| +   */
|
| +  bool handlePossibleTag(bool endTag) {
|
| +    if (!isLetter(data.currentByte)) {
|
| +      //If the next byte is not an ascii letter either ignore this
|
| +      //fragment (possible start tag case) or treat it according to
|
| +      //handleOther
|
| +      if (endTag) {
|
| +        data.previous();
|
| +        handleOther();
|
| +      }
|
| +      return true;
|
| +    }
|
| +
|
| +    var c = data.skipUntil(isSpaceOrAngleBracket);
|
| +    if (c == "<") {
|
| +      // return to the first step in the overall "two step" algorithm
|
| +      // reprocessing the < byte
|
| +      data.previous();
|
| +    } else {
|
| +      //Read all attributes
|
| +      var attr = getAttribute();
|
| +      while (attr != null) {
|
| +        attr = getAttribute();
|
| +      }
|
| +    }
|
| +    return true;
|
| +  }
|
| +
|
| +  /** Skips to the next ">". */
|
| +  bool handleOther() => data.jumpTo(">");
|
| +
|
| +  /**
|
| +   * Return a name,value pair for the next attribute in the stream,
|
| +   * if one is found, or null
|
| +   */
|
| +  List<String> getAttribute() {
|
| +    // Step 1 (skip chars)
|
| +    var c = data.skipChars((x) => x == "/" || isWhitespace(x));
|
| +    // Step 2
|
| +    if (c == ">" || c == null) {
|
| +      return null;
|
| +    }
|
| +    // Step 3
|
| +    var attrName = [];
|
| +    var attrValue = [];
|
| +    // Step 4 attribute name
|
| +    while (true) {
|
| +      if (c == null) {
|
| +        return null;
|
| +      } else if (c == "=" && attrName.length > 0) {
|
| +        break;
|
| +      } else if (isWhitespace(c)) {
|
| +        // Step 6: skip the whitespace and keep [c] on the first byte after
|
| +        // it, so that step 7 tests that very byte for "=". Advancing once
|
| +        // more here would step over the "=" of `name = value`.
|
| +        c = data.skipChars();
|
| +        break;
|
| +      } else if (c == "/" || c == ">") {
|
| +        return [attrName.join(), ""];
|
| +      } else if (isLetter(c)) {
|
| +        attrName.add(c.toLowerCase());
|
| +      } else {
|
| +        attrName.add(c);
|
| +      }
|
| +      // Step 5
|
| +      c = data.next();
|
| +    }
|
| +    // Step 7
|
| +    if (c != "=") {
|
| +      data.previous();
|
| +      return [attrName.join(), ""];
|
| +    }
|
| +    // Step 8
|
| +    data.next();
|
| +    // Step 9
|
| +    c = data.skipChars();
|
| +    // Step 10
|
| +    if (c == "'" || c == '"') {
|
| +      // 10.1
|
| +      var quoteChar = c;
|
| +      while (true) {
|
| +        // 10.2
|
| +        c = data.next();
|
| +        if (c == quoteChar) {
|
| +          // 10.3
|
| +          data.next();
|
| +          return [attrName.join(), attrValue.join()];
|
| +        } else if (isLetter(c)) {
|
| +          // 10.4
|
| +          attrValue.add(c.toLowerCase());
|
| +        } else {
|
| +          // 10.5
|
| +          attrValue.add(c);
|
| +        }
|
| +      }
|
| +    } else if (c == ">") {
|
| +      return [attrName.join(), ""];
|
| +    } else if (c == null) {
|
| +      return null;
|
| +    } else if (isLetter(c)) {
|
| +      attrValue.add(c.toLowerCase());
|
| +    } else {
|
| +      attrValue.add(c);
|
| +    }
|
| +    // Step 11
|
| +    while (true) {
|
| +      c = data.next();
|
| +      if (isSpaceOrAngleBracket(c)) {
|
| +        return [attrName.join(), attrValue.join()];
|
| +      } else if (c == null) {
|
| +        return null;
|
| +      } else if (isLetter(c)) {
|
| +        attrValue.add(c.toLowerCase());
|
| +      } else {
|
| +        attrValue.add(c);
|
| +      }
|
| +    }
|
| +  }
|
| +}
|
| +
|
| +
|
| +/**
|
| + * Looks for a "charset" declaration inside the text held by [data] and
|
| + * pulls out the value that follows its "=" sign.
|
| + */
|
| +class ContentAttrParser {
|
| +  final EncodingBytes data;
|
| +
|
| +  ContentAttrParser(this.data);
|
| +
|
| +  /**
|
| +   * Returns the slice of [data] that follows "charset" and "=", taken either
|
| +   * between matching quote marks or up to the next whitespace. Returns null
|
| +   * when there is no "=" or when [data] throws a [StateError] on the way.
|
| +   */
|
| +  String parse() {
|
| +    try {
|
| +      // Find "charset", then step onto the byte after it and skip
|
| +      // whitespace.
|
| +      data.jumpTo("charset");
|
| +      data.position += 1;
|
| +      data.skipChars();
|
| +
|
| +      // If there is no = sign keep looking for attrs
|
| +      if (data.currentByte != "=") return null;
|
| +
|
| +      data.position += 1;
|
| +      data.skipChars();
|
| +
|
| +      var opening = data.currentByte;
|
| +      if (opening == '"' || opening == "'") {
|
| +        // Look for an encoding between matching quote marks
|
| +        data.position += 1;
|
| +        var quotedStart = data.position;
|
| +        return data.jumpTo(opening)
|
| +            ? data.slice(quotedStart, data.position)
|
| +            : null;
|
| +      }
|
| +
|
| +      // Unquoted value
|
| +      var unquotedStart = data.position;
|
| +      try {
|
| +        data.skipUntil(isWhitespace);
|
| +        return data.slice(unquotedStart, data.position);
|
| +      } on StateError catch (_) {
|
| +        //Return the whole remaining value
|
| +        return data.slice(unquotedStart);
|
| +      }
|
| +    } on StateError catch (_) {
|
| +      return null;
|
| +    }
|
| +  }
|
| +}
|
| +
|
| +
|
| +/** Whether [char] is ">", "<" or accepted by [isWhitespace]. */
|
| +bool isSpaceOrAngleBracket(String char) {
|
| +  if (char == ">" || char == "<") return true;
|
| +  return isWhitespace(char);
|
| +}
|
| +
|
| +typedef bool CharPreciate(String char); // Test applied to one character at a time, as taken by skipChars and skipUntil.
|
|
|