observatory_pub_packages/html5lib/src/encoding_parser.dart - Issue 816693004: Add observatory_pub_packages snapshot to third_party

Side by Side Diff: observatory_pub_packages/html5lib/src/encoding_parser.dart

Issue 816693004: Add observatory_pub_packages snapshot to third_party (Closed) Base URL: http://dart.googlecode.com/svn/third_party/

Patch Set: Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 library encoding_parser;

	2

	3 import 'dart:collection';

	4 import 'constants.dart';

	5 import 'inputstream.dart';

	6

	7 // TODO(jmesserly): I converted StopIteration to StateError("No more elements").

	8 // Seems strange to throw this from outside of an iterator though.

	9 /// String-like object with an associated position and various extra methods

	10 /// If the position is ever greater than the string length then an exception is

	11 /// raised.

	12 class EncodingBytes extends IterableBase<String> {

	13 final String _bytes;

	14 int _position = -1;

	15

	16 EncodingBytes(this._bytes);

	17

	18 Iterator<String> get iterator => _bytes.split('').iterator;

	19

	20 int get length => _bytes.length;

	21

	22 String next() {

	23 var p = _position = _position + 1;

	24 if (p >= length) {

	25 throw new StateError("No more elements");

	26 } else if (p < 0) {

	27 throw new RangeError(p);

	28 }

	29 return _bytes[p];

	30 }

	31

	32 String previous() {

	33 var p = _position;

	34 if (p >= length) {

	35 throw new StateError("No more elements");

	36 } else if (p < 0) {

	37 throw new RangeError(p);

	38 }

	39 _position = p = p - 1;

	40 return _bytes[p];

	41 }

	42

	43 set position(int value) {

	44 if (_position >= length) {

	45 throw new StateError("No more elements");

	46 }

	47 _position = value;

	48 }

	49

	50 int get position {

	51 if (_position >= length) {

	52 throw new StateError("No more elements");

	53 }

	54 if (_position >= 0) {

	55 return _position;

	56 } else {

	57 return 0;

	58 }

	59 }

	60

	61 String get currentByte => _bytes[position];

	62

	63 /// Skip past a list of characters. Defaults to skipping [isWhitespace].

	64 String skipChars([CharPreciate skipChars]) {

	65 if (skipChars == null) skipChars = isWhitespace;

	66 var p = position; // use property for the error-checking

	67 while (p < length) {

	68 var c = _bytes[p];

	69 if (!skipChars(c)) {

	70 _position = p;

	71 return c;

	72 }

	73 p += 1;

	74 }

	75 _position = p;

	76 return null;

	77 }

	78

	79 String skipUntil(CharPreciate untilChars) {

	80 var p = position;

	81 while (p < length) {

	82 var c = _bytes[p];

	83 if (untilChars(c)) {

	84 _position = p;

	85 return c;

	86 }

	87 p += 1;

	88 }

	89 return null;

	90 }

	91

	92 /// Look for a sequence of bytes at the start of a string. If the bytes

	93 /// are found return true and advance the position to the byte after the

	94 /// match. Otherwise return false and leave the position alone.

	95 bool matchBytes(String bytes) {

	96 var p = position;

	97 if (_bytes.length < p + bytes.length) {

	98 return false;

	99 }

	100 var data = _bytes.substring(p, p + bytes.length);

	101 if (data == bytes) {

	102 position += bytes.length;

	103 return true;

	104 }

	105 return false;

	106 }

	107

	108 /// Look for the next sequence of bytes matching a given sequence. If

	109 /// a match is found advance the position to the last byte of the match

	110 bool jumpTo(String bytes) {

	111 var newPosition = _bytes.indexOf(bytes, position);

	112 if (newPosition >= 0) {

	113 _position = newPosition + bytes.length - 1;

	114 return true;

	115 } else {

	116 throw new StateError("No more elements");

	117 }

	118 }

	119

	120 String slice(int start, [int end]) {

	121 if (end == null) end = length;

	122 if (end < 0) end += length;

	123 return _bytes.substring(start, end - start);

	124 }

	125 }

	126

	127 /// Mini parser for detecting character encoding from meta elements.

	128 class EncodingParser {

	129 final EncodingBytes data;

	130 String encoding;

	131

	132 /// [bytes] - the data to work on for encoding detection.

	133 EncodingParser(List<int> bytes)

	134 // Note: this is intentionally interpreting bytes as codepoints.

	135 : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

	136

	137 String getEncoding() {

	138 final methodDispatch = [

	139 ["<!--", handleComment],

	140 ["<meta", handleMeta],

	141 ["</", handlePossibleEndTag],

	142 ["<!", handleOther],

	143 ["<?", handleOther],

	144 ["<", handlePossibleStartTag]];

	145

	146 try {

	147 for (var byte in data) {

	148 var keepParsing = true;

	149 for (var dispatch in methodDispatch) {

	150 if (data.matchBytes(dispatch[0])) {

	151 try {

	152 keepParsing = dispatch[1]();

	153 break;

	154 } on StateError catch (e) {

	155 keepParsing = false;

	156 break;

	157 }

	158 }

	159 }

	160 if (!keepParsing) {

	161 break;

	162 }

	163 }

	164 } on StateError catch (e) {

	165 // Catch this here to match behavior of Python's StopIteration

	166 }

	167 return encoding;

	168 }

	169

	170 /// Skip over comments.

	171 bool handleComment() => data.jumpTo("-->");

	172

	173 bool handleMeta() {

	174 if (!isWhitespace(data.currentByte)) {

	175 // if we have <meta not followed by a space so just keep going

	176 return true;

	177 }

	178 // We have a valid meta element we want to search for attributes

	179 while (true) {

	180 // Try to find the next attribute after the current position

	181 var attr = getAttribute();

	182 if (attr == null) return true;

	183

	184 if (attr[0] == "charset") {

	185 var tentativeEncoding = attr[1];

	186 var codec = codecName(tentativeEncoding);

	187 if (codec != null) {

	188 encoding = codec;

	189 return false;

	190 }

	191 } else if (attr[0] == "content") {

	192 var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));

	193 var tentativeEncoding = contentParser.parse();

	194 var codec = codecName(tentativeEncoding);

	195 if (codec != null) {

	196 encoding = codec;

	197 return false;

	198 }

	199 }

	200 }

	201 return true; // unreachable

	202 }

	203

	204 bool handlePossibleStartTag() => handlePossibleTag(false);

	205

	206 bool handlePossibleEndTag() {

	207 data.next();

	208 return handlePossibleTag(true);

	209 }

	210

	211 bool handlePossibleTag(bool endTag) {

	212 if (!isLetter(data.currentByte)) {

	213 //If the next byte is not an ascii letter either ignore this

	214 //fragment (possible start tag case) or treat it according to

	215 //handleOther

	216 if (endTag) {

	217 data.previous();

	218 handleOther();

	219 }

	220 return true;

	221 }

	222

	223 var c = data.skipUntil(isSpaceOrAngleBracket);

	224 if (c == "<") {

	225 // return to the first step in the overall "two step" algorithm

	226 // reprocessing the < byte

	227 data.previous();

	228 } else {

	229 //Read all attributes

	230 var attr = getAttribute();

	231 while (attr != null) {

	232 attr = getAttribute();

	233 }

	234 }

	235 return true;

	236 }

	237

	238 bool handleOther() => data.jumpTo(">");

	239

	240 /// Return a name,value pair for the next attribute in the stream,

	241 /// if one is found, or null

	242 List<String> getAttribute() {

	243 // Step 1 (skip chars)

	244 var c = data.skipChars((x) => x == "/" \|\| isWhitespace(x));

	245 // Step 2

	246 if (c == ">" \|\| c == null) {

	247 return null;

	248 }

	249 // Step 3

	250 var attrName = [];

	251 var attrValue = [];

	252 // Step 4 attribute name

	253 while (true) {

	254 if (c == null) {

	255 return null;

	256 } else if (c == "=" && attrName.length > 0) {

	257 break;

	258 } else if (isWhitespace(c)) {

	259 // Step 6!

	260 c = data.skipChars();

	261 c = data.next();

	262 break;

	263 } else if (c == "/" \|\| c == ">") {

	264 return [attrName.join(), ""];

	265 } else if (isLetter(c)) {

	266 attrName.add(c.toLowerCase());

	267 } else {

	268 attrName.add(c);

	269 }

	270 // Step 5

	271 c = data.next();

	272 }

	273 // Step 7

	274 if (c != "=") {

	275 data.previous();

	276 return [attrName.join(), ""];

	277 }

	278 // Step 8

	279 data.next();

	280 // Step 9

	281 c = data.skipChars();

	282 // Step 10

	283 if (c == "'" \|\| c == '"') {

	284 // 10.1

	285 var quoteChar = c;

	286 while (true) {

	287 // 10.2

	288 c = data.next();

	289 if (c == quoteChar) {

	290 // 10.3

	291 data.next();

	292 return [attrName.join(), attrValue.join()];

	293 } else if (isLetter(c)) {

	294 // 10.4

	295 attrValue.add(c.toLowerCase());

	296 } else {

	297 // 10.5

	298 attrValue.add(c);

	299 }

	300 }

	301 } else if (c == ">") {

	302 return [attrName.join(), ""];

	303 } else if (c == null) {

	304 return null;

	305 } else if (isLetter(c)) {

	306 attrValue.add(c.toLowerCase());

	307 } else {

	308 attrValue.add(c);

	309 }

	310 // Step 11

	311 while (true) {

	312 c = data.next();

	313 if (isSpaceOrAngleBracket(c)) {

	314 return [attrName.join(), attrValue.join()];

	315 } else if (c == null) {

	316 return null;

	317 } else if (isLetter(c)) {

	318 attrValue.add(c.toLowerCase());

	319 } else {

	320 attrValue.add(c);

	321 }

	322 }

	323 return null; // unreachable

	324 }

	325 }

	326

	327

	328 class ContentAttrParser {

	329 final EncodingBytes data;

	330

	331 ContentAttrParser(this.data);

	332

	333 String parse() {

	334 try {

	335 // Check if the attr name is charset

	336 // otherwise return

	337 data.jumpTo("charset");

	338 data.position += 1;

	339 data.skipChars();

	340 if (data.currentByte != "=") {

	341 // If there is no = sign keep looking for attrs

	342 return null;

	343 }

	344 data.position += 1;

	345 data.skipChars();

	346 // Look for an encoding between matching quote marks

	347 if (data.currentByte == '"' \|\| data.currentByte == "'") {

	348 var quoteMark = data.currentByte;

	349 data.position += 1;

	350 var oldPosition = data.position;

	351 if (data.jumpTo(quoteMark)) {

	352 return data.slice(oldPosition, data.position);

	353 } else {

	354 return null;

	355 }

	356 } else {

	357 // Unquoted value

	358 var oldPosition = data.position;

	359 try {

	360 data.skipUntil(isWhitespace);

	361 return data.slice(oldPosition, data.position);

	362 } on StateError catch (e) {

	363 //Return the whole remaining value

	364 return data.slice(oldPosition);

	365 }

	366 }

	367 } on StateError catch (e) {

	368 return null;

	369 }

	370 }

	371 }

	372

	373

	374 bool isSpaceOrAngleBracket(String char) {

	375 return char == ">" \|\| char == "<" \|\| isWhitespace(char);

	376 }

	377

	378 typedef bool CharPreciate(String char);

OLD	NEW

« no previous file with comments | « observatory_pub_packages/html5lib/src/css_class_set.dart ('k') | observatory_pub_packages/html5lib/src/inputstream.dart » ('j') | no next file with comments »