pkg/third_party/html5lib/lib/src/encoding_parser.dart - Issue 22375011: move html5lib code into dart svn repo

Side by Side Diff: pkg/third_party/html5lib/lib/src/encoding_parser.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: change location of html5lib to pkg/third_party/html5lib Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 library encoding_parser;

	2

	3 import 'dart:collection';

	4 import 'constants.dart';

	5 import 'inputstream.dart';

	6 import 'utils.dart';

	7

	8 // TODO(jmesserly): I converted StopIteration to StateError("No more elements").

	9 // Seems strange to throw this from outside of an iterator though.

	10 /**

	11 * String-like object with an associated position and various extra methods

	12 * If the position is ever greater than the string length then an exception is

	13 * raised.

	14 */

	15 class EncodingBytes extends IterableBase<String> {

	16 final String _bytes;

	17 int _position = -1;

	18

	19 EncodingBytes(this._bytes);

	20

	21 Iterator<String> get iterator => _bytes.split('').iterator;

	22

	23 int get length => _bytes.length;

	24

	25 String next() {

	26 var p = _position = _position + 1;

	27 if (p >= length) {

	28 throw new StateError("No more elements");

	29 } else if (p < 0) {

	30 throw new RangeError(p);

	31 }

	32 return _bytes[p];

	33 }

	34

	35 String previous() {

	36 var p = _position;

	37 if (p >= length) {

	38 throw new StateError("No more elements");

	39 } else if (p < 0) {

	40 throw new RangeError(p);

	41 }

	42 _position = p = p - 1;

	43 return _bytes[p];

	44 }

	45

	46 set position(int value) {

	47 if (_position >= length) {

	48 throw new StateError("No more elements");

	49 }

	50 _position = value;

	51 }

	52

	53 int get position {

	54 if (_position >= length) {

	55 throw new StateError("No more elements");

	56 }

	57 if (_position >= 0) {

	58 return _position;

	59 } else {

	60 return 0;

	61 }

	62 }

	63

	64 String get currentByte => _bytes[position];

	65

	66 /** Skip past a list of characters. Defaults to skipping [isWhitespace]. */

	67 String skipChars([CharPreciate skipChars]) {

	68 if (skipChars == null) skipChars = isWhitespace;

	69 var p = position; // use property for the error-checking

	70 while (p < length) {

	71 var c = _bytes[p];

	72 if (!skipChars(c)) {

	73 _position = p;

	74 return c;

	75 }

	76 p += 1;

	77 }

	78 _position = p;

	79 return null;

	80 }

	81

	82 String skipUntil(CharPreciate untilChars) {

	83 var p = position;

	84 while (p < length) {

	85 var c = _bytes[p];

	86 if (untilChars(c)) {

	87 _position = p;

	88 return c;

	89 }

	90 p += 1;

	91 }

	92 return null;

	93 }

	94

	95 /**

	96 * Look for a sequence of bytes at the start of a string. If the bytes

	97 * are found return true and advance the position to the byte after the

	98 * match. Otherwise return false and leave the position alone.

	99 */

	100 bool matchBytes(String bytes) {

	101 var p = position;

	102 if (_bytes.length < p + bytes.length) {

	103 return false;

	104 }

	105 var data = _bytes.substring(p, p + bytes.length);

	106 if (data == bytes) {

	107 position += bytes.length;

	108 return true;

	109 }

	110 return false;

	111 }

	112

	113 /**

	114 * Look for the next sequence of bytes matching a given sequence. If

	115 * a match is found advance the position to the last byte of the match

	116 */

	117 bool jumpTo(String bytes) {

	118 var newPosition = _bytes.indexOf(bytes, position);

	119 if (newPosition >= 0) {

	120 _position = newPosition + bytes.length - 1;

	121 return true;

	122 } else {

	123 throw new StateError("No more elements");

	124 }

	125 }

	126

	127 String slice(int start, [int end]) {

	128 if (end == null) end = length;

	129 if (end < 0) end += length;

	130 return _bytes.substring(start, end - start);

	131 }

	132 }

	133

	134 /** Mini parser for detecting character encoding from meta elements. */

	135 class EncodingParser {

	136 final EncodingBytes data;

	137 String encoding;

	138

	139 /** [bytes] - the data to work on for encoding detection. */

	140 EncodingParser(List<int> bytes)

	141 // Note: this is intentionally interpreting bytes as codepoints.

	142 : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

	143

	144 String getEncoding() {

	145 final methodDispatch = [

	146 ["<!--", handleComment],

	147 ["<meta", handleMeta],

	148 ["</", handlePossibleEndTag],

	149 ["<!", handleOther],

	150 ["<?", handleOther],

	151 ["<", handlePossibleStartTag]];

	152

	153 try {

	154 for (var byte in data) {

	155 var keepParsing = true;

	156 for (var dispatch in methodDispatch) {

	157 if (data.matchBytes(dispatch[0])) {

	158 try {

	159 keepParsing = dispatch[1]();

	160 break;

	161 } on StateError catch (e) {

	162 keepParsing = false;

	163 break;

	164 }

	165 }

	166 }

	167 if (!keepParsing) {

	168 break;

	169 }

	170 }

	171 } on StateError catch (e) {

	172 // Catch this here to match behavior of Python's StopIteration

	173 }

	174 return encoding;

	175 }

	176

	177 /** Skip over comments. */

	178 bool handleComment() => data.jumpTo("-->");

	179

	180 bool handleMeta() {

	181 if (!isWhitespace(data.currentByte)) {

	182 // if we have <meta not followed by a space so just keep going

	183 return true;

	184 }

	185 // We have a valid meta element we want to search for attributes

	186 while (true) {

	187 // Try to find the next attribute after the current position

	188 var attr = getAttribute();

	189 if (attr == null) return true;

	190

	191 if (attr[0] == "charset") {

	192 var tentativeEncoding = attr[1];

	193 var codec = codecName(tentativeEncoding);

	194 if (codec != null) {

	195 encoding = codec;

	196 return false;

	197 }

	198 } else if (attr[0] == "content") {

	199 var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));

	200 var tentativeEncoding = contentParser.parse();

	201 var codec = codecName(tentativeEncoding);

	202 if (codec != null) {

	203 encoding = codec;

	204 return false;

	205 }

	206 }

	207 }

	208 }

	209

	210 bool handlePossibleStartTag() => handlePossibleTag(false);

	211

	212 bool handlePossibleEndTag() {

	213 data.next();

	214 return handlePossibleTag(true);

	215 }

	216

	217 bool handlePossibleTag(bool endTag) {

	218 if (!isLetter(data.currentByte)) {

	219 //If the next byte is not an ascii letter either ignore this

	220 //fragment (possible start tag case) or treat it according to

	221 //handleOther

	222 if (endTag) {

	223 data.previous();

	224 handleOther();

	225 }

	226 return true;

	227 }

	228

	229 var c = data.skipUntil(isSpaceOrAngleBracket);

	230 if (c == "<") {

	231 // return to the first step in the overall "two step" algorithm

	232 // reprocessing the < byte

	233 data.previous();

	234 } else {

	235 //Read all attributes

	236 var attr = getAttribute();

	237 while (attr != null) {

	238 attr = getAttribute();

	239 }

	240 }

	241 return true;

	242 }

	243

	244 bool handleOther() => data.jumpTo(">");

	245

	246 /**

	247 * Return a name,value pair for the next attribute in the stream,

	248 * if one is found, or null

	249 */

	250 List<String> getAttribute() {

	251 // Step 1 (skip chars)

	252 var c = data.skipChars((x) => x == "/" \|\| isWhitespace(x));

	253 // Step 2

	254 if (c == ">" \|\| c == null) {

	255 return null;

	256 }

	257 // Step 3

	258 var attrName = [];

	259 var attrValue = [];

	260 // Step 4 attribute name

	261 while (true) {

	262 if (c == null) {

	263 return null;

	264 } else if (c == "=" && attrName.length > 0) {

	265 break;

	266 } else if (isWhitespace(c)) {

	267 // Step 6!

	268 c = data.skipChars();

	269 c = data.next();

	270 break;

	271 } else if (c == "/" \|\| c == ">") {

	272 return [attrName.join(), ""];

	273 } else if (isLetter(c)) {

	274 attrName.add(c.toLowerCase());

	275 } else {

	276 attrName.add(c);

	277 }

	278 // Step 5

	279 c = data.next();

	280 }

	281 // Step 7

	282 if (c != "=") {

	283 data.previous();

	284 return [attrName.join(), ""];

	285 }

	286 // Step 8

	287 data.next();

	288 // Step 9

	289 c = data.skipChars();

	290 // Step 10

	291 if (c == "'" \|\| c == '"') {

	292 // 10.1

	293 var quoteChar = c;

	294 while (true) {

	295 // 10.2

	296 c = data.next();

	297 if (c == quoteChar) {

	298 // 10.3

	299 data.next();

	300 return [attrName.join(), attrValue.join()];

	301 } else if (isLetter(c)) {

	302 // 10.4

	303 attrValue.add(c.toLowerCase());

	304 } else {

	305 // 10.5

	306 attrValue.add(c);

	307 }

	308 }

	309 } else if (c == ">") {

	310 return [attrName.join(), ""];

	311 } else if (c == null) {

	312 return null;

	313 } else if (isLetter(c)) {

	314 attrValue.add(c.toLowerCase());

	315 } else {

	316 attrValue.add(c);

	317 }

	318 // Step 11

	319 while (true) {

	320 c = data.next();

	321 if (isSpaceOrAngleBracket(c)) {

	322 return [attrName.join(), attrValue.join()];

	323 } else if (c == null) {

	324 return null;

	325 } else if (isLetter(c)) {

	326 attrValue.add(c.toLowerCase());

	327 } else {

	328 attrValue.add(c);

	329 }

	330 }

	331 }

	332 }

	333

	334

	335 class ContentAttrParser {

	336 final EncodingBytes data;

	337

	338 ContentAttrParser(this.data);

	339

	340 String parse() {

	341 try {

	342 // Check if the attr name is charset

	343 // otherwise return

	344 data.jumpTo("charset");

	345 data.position += 1;

	346 data.skipChars();

	347 if (data.currentByte != "=") {

	348 // If there is no = sign keep looking for attrs

	349 return null;

	350 }

	351 data.position += 1;

	352 data.skipChars();

	353 // Look for an encoding between matching quote marks

	354 if (data.currentByte == '"' \|\| data.currentByte == "'") {

	355 var quoteMark = data.currentByte;

	356 data.position += 1;

	357 var oldPosition = data.position;

	358 if (data.jumpTo(quoteMark)) {

	359 return data.slice(oldPosition, data.position);

	360 } else {

	361 return null;

	362 }

	363 } else {

	364 // Unquoted value

	365 var oldPosition = data.position;

	366 try {

	367 data.skipUntil(isWhitespace);

	368 return data.slice(oldPosition, data.position);

	369 } on StateError catch (e) {

	370 //Return the whole remaining value

	371 return data.slice(oldPosition);

	372 }

	373 }

	374 } on StateError catch (e) {

	375 return null;

	376 }

	377 }

	378 }

	379

	380

	381 bool isSpaceOrAngleBracket(String char) {

	382 return char == ">" \|\| char == "<" \|\| isWhitespace(char);

	383 }

	384

	/**
	 * Signature of a test applied to a single-character string; it returns
	 * true when the character is accepted. The name is a misspelling of
	 * "predicate" that is kept because other code refers to the type by it.
	 */
	385 typedef bool CharPreciate(String char);

OLD	NEW