pkg/third_party/html5lib/lib/src/encoding_parser.dart - Issue 22375011: move html5lib code into dart svn repo

Unified Diff: pkg/third_party/html5lib/lib/src/encoding_parser.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: change location of html5lib to pkg/third_party/html5lib Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« pkg/third_party/html5lib/html5lib.status ('K') | « pkg/third_party/html5lib/lib/src/constants.dart ('k') | pkg/third_party/html5lib/lib/src/inputstream.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: pkg/third_party/html5lib/lib/src/encoding_parser.dart

diff --git a/pkg/third_party/html5lib/lib/src/encoding_parser.dart b/pkg/third_party/html5lib/lib/src/encoding_parser.dart

new file mode 100644

index 0000000000000000000000000000000000000000..7b0edb8adfb3965be8886418f7d90911cd598841

--- /dev/null

+++ b/pkg/third_party/html5lib/lib/src/encoding_parser.dart

@@ -0,0 +1,385 @@

+library encoding_parser;

+import 'dart:collection';

+import 'constants.dart';

+import 'inputstream.dart';

+import 'utils.dart';

+// TODO(jmesserly): I converted StopIteration to StateError("No more elements").

+// Seems strange to throw this from outside of an iterator though.

+/**

+ * String-like object with an associated position and various extra methods

+ * If the position is ever greater than the string length then an exception is

+ * raised.

+ */

+class EncodingBytes extends IterableBase<String> {

+ final String _bytes;

+ int _position = -1;

+ EncodingBytes(this._bytes);

+ Iterator<String> get iterator => _bytes.split('').iterator;

+ int get length => _bytes.length;

+ String next() {

+ var p = _position = _position + 1;

+ if (p >= length) {

+ throw new StateError("No more elements");

+ } else if (p < 0) {

+ throw new RangeError(p);

+ }

+ return _bytes[p];

+ }

+ String previous() {

+ var p = _position;

+ if (p >= length) {

+ throw new StateError("No more elements");

+ } else if (p < 0) {

+ throw new RangeError(p);

+ }

+ _position = p = p - 1;

+ return _bytes[p];

+ }

+ set position(int value) {

+ if (_position >= length) {

+ throw new StateError("No more elements");

+ }

+ _position = value;

+ }

+ int get position {

+ if (_position >= length) {

+ throw new StateError("No more elements");

+ }

+ if (_position >= 0) {

+ return _position;

+ } else {

+ return 0;

+ }

+ String get currentByte => _bytes[position];

+ /** Skip past a list of characters. Defaults to skipping [isWhitespace]. */

+ String skipChars([CharPreciate skipChars]) {

+ if (skipChars == null) skipChars = isWhitespace;

+ var p = position; // use property for the error-checking

+ while (p < length) {

+ var c = _bytes[p];

+ if (!skipChars(c)) {

+ _position = p;

+ return c;

+ }

+ p += 1;

+ }

+ _position = p;

+ return null;

+ }

+ String skipUntil(CharPreciate untilChars) {

+ var p = position;

+ while (p < length) {

+ var c = _bytes[p];

+ if (untilChars(c)) {

+ _position = p;

+ return c;

+ }

+ p += 1;

+ }

+ return null;

+ }

+ /**

+ * Look for a sequence of bytes at the start of a string. If the bytes

+ * are found return true and advance the position to the byte after the

+ * match. Otherwise return false and leave the position alone.

+ */

+ bool matchBytes(String bytes) {

+ var p = position;

+ if (_bytes.length < p + bytes.length) {

+ return false;

+ }

+ var data = _bytes.substring(p, p + bytes.length);

+ if (data == bytes) {

+ position += bytes.length;

+ return true;

+ }

+ return false;

+ }

+ /**

+ * Look for the next sequence of bytes matching a given sequence. If

+ * a match is found advance the position to the last byte of the match

+ */

+ bool jumpTo(String bytes) {

+ var newPosition = _bytes.indexOf(bytes, position);

+ if (newPosition >= 0) {

+ _position = newPosition + bytes.length - 1;

+ return true;

+ } else {

+ throw new StateError("No more elements");

+ }

+ String slice(int start, [int end]) {

+ if (end == null) end = length;

+ if (end < 0) end += length;

+ return _bytes.substring(start, end - start);

+ }

+/** Mini parser for detecting character encoding from meta elements. */

+class EncodingParser {

+ final EncodingBytes data;

+ String encoding;

+ /** [bytes] - the data to work on for encoding detection. */

+ EncodingParser(List<int> bytes)

+ // Note: this is intentionally interpreting bytes as codepoints.

+ : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

+ String getEncoding() {

+ final methodDispatch = [

+ ["<!--", handleComment],

+ ["<meta", handleMeta],

+ ["</", handlePossibleEndTag],

+ ["<!", handleOther],

+ ["<?", handleOther],

+ ["<", handlePossibleStartTag]];

+ try {

+ for (var byte in data) {

+ var keepParsing = true;

+ for (var dispatch in methodDispatch) {

+ if (data.matchBytes(dispatch[0])) {

+ try {

+ keepParsing = dispatch[1]();

+ break;

+ } on StateError catch (e) {

+ keepParsing = false;

+ break;

+ }

+ if (!keepParsing) {

+ break;

+ }

+ } on StateError catch (e) {

+ // Catch this here to match behavior of Python's StopIteration

+ }

+ return encoding;

+ }

+ /** Skip over comments. */

+ bool handleComment() => data.jumpTo("-->");

+ bool handleMeta() {

+ if (!isWhitespace(data.currentByte)) {

+ // if we have <meta not followed by a space so just keep going

+ return true;

+ }

+ // We have a valid meta element we want to search for attributes

+ while (true) {

+ // Try to find the next attribute after the current position

+ var attr = getAttribute();

+ if (attr == null) return true;

+ if (attr[0] == "charset") {

+ var tentativeEncoding = attr[1];

+ var codec = codecName(tentativeEncoding);

+ if (codec != null) {

+ encoding = codec;

+ return false;

+ }

+ } else if (attr[0] == "content") {

+ var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));

+ var tentativeEncoding = contentParser.parse();

+ var codec = codecName(tentativeEncoding);

+ if (codec != null) {

+ encoding = codec;

+ return false;

+ }

+ bool handlePossibleStartTag() => handlePossibleTag(false);

+ bool handlePossibleEndTag() {

+ data.next();

+ return handlePossibleTag(true);

+ }

+ bool handlePossibleTag(bool endTag) {

+ if (!isLetter(data.currentByte)) {

+ //If the next byte is not an ascii letter either ignore this

+ //fragment (possible start tag case) or treat it according to

+ //handleOther

+ if (endTag) {

+ data.previous();

+ handleOther();

+ }

+ return true;

+ }

+ var c = data.skipUntil(isSpaceOrAngleBracket);

+ if (c == "<") {

+ // return to the first step in the overall "two step" algorithm

+ // reprocessing the < byte

+ data.previous();

+ } else {

+ //Read all attributes

+ var attr = getAttribute();

+ while (attr != null) {

+ attr = getAttribute();

+ }

+ return true;

+ }

+ bool handleOther() => data.jumpTo(">");

+ /**

+ * Return a name,value pair for the next attribute in the stream,

+ * if one is found, or null

+ */

+ List<String> getAttribute() {

+ // Step 1 (skip chars)

+ var c = data.skipChars((x) => x == "/" || isWhitespace(x));

+ // Step 2

+ if (c == ">" || c == null) {

+ return null;

+ }

+ // Step 3

+ var attrName = [];

+ var attrValue = [];

+ // Step 4 attribute name

+ while (true) {

+ if (c == null) {

+ return null;

+ } else if (c == "=" && attrName.length > 0) {

+ break;

+ } else if (isWhitespace(c)) {

+ // Step 6!

+ c = data.skipChars();

+ c = data.next();

+ break;

+ } else if (c == "/" || c == ">") {

+ return [attrName.join(), ""];

+ } else if (isLetter(c)) {

+ attrName.add(c.toLowerCase());

+ } else {

+ attrName.add(c);

+ }

+ // Step 5

+ c = data.next();

+ }

+ // Step 7

+ if (c != "=") {

+ data.previous();

+ return [attrName.join(), ""];

+ }

+ // Step 8

+ data.next();

+ // Step 9

+ c = data.skipChars();

+ // Step 10

+ if (c == "'" || c == '"') {

+ // 10.1

+ var quoteChar = c;

+ while (true) {

+ // 10.2

+ c = data.next();

+ if (c == quoteChar) {

+ // 10.3

+ data.next();

+ return [attrName.join(), attrValue.join()];

+ } else if (isLetter(c)) {

+ // 10.4

+ attrValue.add(c.toLowerCase());

+ } else {

+ // 10.5

+ attrValue.add(c);

+ }

+ } else if (c == ">") {

+ return [attrName.join(), ""];

+ } else if (c == null) {

+ return null;

+ } else if (isLetter(c)) {

+ attrValue.add(c.toLowerCase());

+ } else {

+ attrValue.add(c);

+ }

+ // Step 11

+ while (true) {

+ c = data.next();

+ if (isSpaceOrAngleBracket(c)) {

+ return [attrName.join(), attrValue.join()];

+ } else if (c == null) {

+ return null;

+ } else if (isLetter(c)) {

+ attrValue.add(c.toLowerCase());

+ } else {

+ attrValue.add(c);

+ }

+class ContentAttrParser {

+ final EncodingBytes data;

+ ContentAttrParser(this.data);

+ String parse() {

+ try {

+ // Check if the attr name is charset

+ // otherwise return

+ data.jumpTo("charset");

+ data.position += 1;

+ data.skipChars();

+ if (data.currentByte != "=") {

+ // If there is no = sign keep looking for attrs

+ return null;

+ }

+ data.position += 1;

+ data.skipChars();

+ // Look for an encoding between matching quote marks

+ if (data.currentByte == '"' || data.currentByte == "'") {

+ var quoteMark = data.currentByte;

+ data.position += 1;

+ var oldPosition = data.position;

+ if (data.jumpTo(quoteMark)) {

+ return data.slice(oldPosition, data.position);

+ } else {

+ return null;

+ }

+ } else {

+ // Unquoted value

+ var oldPosition = data.position;

+ try {

+ data.skipUntil(isWhitespace);

+ return data.slice(oldPosition, data.position);

+ } on StateError catch (e) {

+ //Return the whole remaining value

+ return data.slice(oldPosition);

+ }

+ } on StateError catch (e) {

+ return null;

+ }

+bool isSpaceOrAngleBracket(String char) {

+ return char == ">" || char == "<" || isWhitespace(char);

+typedef bool CharPreciate(String char);