Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(236)

Side by Side Diff: pkg/third_party/html5lib/lib/src/encoding_parser.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: change location of html5lib to pkg/third_party/html5lib Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 library encoding_parser;
2
3 import 'dart:collection';
4 import 'constants.dart';
5 import 'inputstream.dart';
6 import 'utils.dart';
7
8 // TODO(jmesserly): I converted StopIteration to StateError("No more elements").
9 // Seems strange to throw this from outside of an iterator though.
/**
 * String-like object with an associated position and various extra methods.
 * Once the position has reached or passed the end of the string, the
 * position-dependent members throw a [StateError].
 */
class EncodingBytes extends IterableBase<String> {
  /** The text being scanned. */
  final String _bytes;

  /** Index of the current character; starts before the first character. */
  int _position = -1;

  EncodingBytes(this._bytes);

  /**
   * Iterates over the individual characters of the underlying string.
   * Iterating neither reads nor changes [position].
   */
  Iterator<String> get iterator => _bytes.split('').iterator;

  /** Number of characters in the underlying string. */
  int get length => _bytes.length;

  /**
   * Advances the position by one and returns the character found there.
   * Throws a [StateError] when the new position is at or past the end.
   */
  String next() {
    var p = _position = _position + 1;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    return _bytes[p];
  }

  /**
   * Moves the position back by one and returns the character found there.
   * Throws a [StateError] if the position is already at or past the end, and
   * a [RangeError] if the position is negative.
   */
  String previous() {
    var p = _position;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    _position = p = p - 1;
    return _bytes[p];
  }

  /**
   * Sets the position to [value]. Note that the check is made against the
   * position held *before* the assignment: a [StateError] is thrown if that
   * position is already at or past the end.
   */
  set position(int value) {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    _position = value;
  }

  /**
   * The current position, reported as 0 while it is still negative.
   * Throws a [StateError] if the position is at or past the end.
   */
  int get position {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    if (_position >= 0) {
      return _position;
    } else {
      return 0;
    }
  }

  /** The character at [position]. */
  String get currentByte => _bytes[position];

  /**
   * Skip past a list of characters. Defaults to skipping [isWhitespace].
   * Returns the first character that is not skipped and leaves the position
   * on it, or returns null with the position at the end if none is left.
   */
  String skipChars([CharPreciate skipChars]) {
    if (skipChars == null) skipChars = isWhitespace;
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (!skipChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    _position = p;
    return null;
  }

  /**
   * Skips forward until a character accepted by [untilChars] is found.
   * Returns that character and leaves the position on it, or returns null
   * with the position at the end if no such character is left.
   */
  String skipUntil(CharPreciate untilChars) {
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (untilChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    // Record that the data is exhausted, exactly as skipChars does, so that
    // a later read of [position] reports the end with a StateError instead
    // of silently staying where the scan started.
    _position = p;
    return null;
  }

  /**
   * Look for a sequence of bytes at the current position. If the bytes
   * are found return true and advance the position to the byte after the
   * match. Otherwise return false and leave the position alone.
   */
  bool matchBytes(String bytes) {
    var p = position;
    if (_bytes.length < p + bytes.length) {
      return false;
    }
    var data = _bytes.substring(p, p + bytes.length);
    if (data == bytes) {
      position += bytes.length;
      return true;
    }
    return false;
  }

  /**
   * Look for the next sequence of bytes matching a given sequence. If
   * a match is found advance the position to the last byte of the match
   * and return true. Throws a [StateError] if there is no match.
   */
  bool jumpTo(String bytes) {
    var newPosition = _bytes.indexOf(bytes, position);
    if (newPosition >= 0) {
      _position = newPosition + bytes.length - 1;
      return true;
    } else {
      throw new StateError("No more elements");
    }
  }

  /**
   * Returns the characters from index [start] up to, but not including,
   * index [end]. [end] defaults to [length]; a negative [end] counts back
   * from the end of the string.
   */
  String slice(int start, [int end]) {
    if (end == null) end = length;
    if (end < 0) end += length;
    // String.substring takes an end index, not a length.
    return _bytes.substring(start, end);
  }
}
133
/**
 * Mini parser for detecting character encoding from meta elements.
 *
 * Create it with the raw bytes and call [getEncoding]; the detected codec
 * name, if any, is also stored in [encoding].
 */
class EncodingParser {
  /** The lower-cased input, scanned through a shared position. */
  final EncodingBytes data;

  /** The codec name found so far, or null if none has been found. */
  String encoding;

  /** [bytes] - the data to work on for encoding detection. */
  EncodingParser(List<int> bytes)
      // Note: this is intentionally interpreting bytes as codepoints.
      : data = new EncodingBytes(
            new String.fromCharCodes(bytes).toLowerCase());

  /**
   * Scans [data] for a meta element that declares an encoding and returns
   * the resulting codec name, or null if the data ends without one.
   */
  String getEncoding() {
    // Order matters: longer, more specific prefixes must be tried first.
    final methodDispatch = [
        ["<!--", handleComment],
        ["<meta", handleMeta],
        ["</", handlePossibleEndTag],
        ["<!", handleOther],
        ["<?", handleOther],
        ["<", handlePossibleStartTag]];

    try {
      while (true) {
        // Step to the next byte on every pass. EncodingBytes.iterator walks
        // a detached copy of the characters and never moves data.position,
        // so the scan has to be driven through next(), which throws a
        // StateError once the data is exhausted.
        data.next();
        var keepParsing = true;
        for (var dispatch in methodDispatch) {
          if (data.matchBytes(dispatch[0])) {
            try {
              keepParsing = dispatch[1]();
            } on StateError catch (_) {
              // The handler ran off the end of the data.
              keepParsing = false;
            }
            break;
          }
        }
        if (!keepParsing) {
          break;
        }
      }
    } on StateError catch (_) {
      // Catch this here to match behavior of Python's StopIteration
    }
    return encoding;
  }

  /** Skip over comments. */
  bool handleComment() => data.jumpTo("-->");

  /**
   * Handles the bytes following "<meta". Returns false once an encoding has
   * been recognised (stop parsing), true otherwise (keep parsing).
   */
  bool handleMeta() {
    if (!isWhitespace(data.currentByte)) {
      // if we have <meta not followed by a space so just keep going
      return true;
    }
    // We have a valid meta element we want to search for attributes
    while (true) {
      // Try to find the next attribute after the current position
      var attr = getAttribute();
      if (attr == null) return true;

      if (attr[0] == "charset") {
        var tentativeEncoding = attr[1];
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      } else if (attr[0] == "content") {
        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
        var tentativeEncoding = contentParser.parse();
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      }
    }
  }

  /** Handles the bytes following "<". Always returns true. */
  bool handlePossibleStartTag() => handlePossibleTag(false);

  /** Handles the bytes following "</". Always returns true. */
  bool handlePossibleEndTag() {
    data.next();
    return handlePossibleTag(true);
  }

  /**
   * Skips over a tag and all of its attributes. [endTag] says whether the
   * tag was introduced by "</". Always returns true.
   */
  bool handlePossibleTag(bool endTag) {
    if (!isLetter(data.currentByte)) {
      //If the next byte is not an ascii letter either ignore this
      //fragment (possible start tag case) or treat it according to
      //handleOther
      if (endTag) {
        data.previous();
        handleOther();
      }
      return true;
    }

    var c = data.skipUntil(isSpaceOrAngleBracket);
    if (c == "<") {
      // return to the first step in the overall "two step" algorithm
      // reprocessing the < byte
      data.previous();
    } else {
      //Read all attributes
      var attr = getAttribute();
      while (attr != null) {
        attr = getAttribute();
      }
    }
    return true;
  }

  /** Skips to the next ">". */
  bool handleOther() => data.jumpTo(">");

  /**
   * Return a name,value pair for the next attribute in the stream,
   * if one is found, or null
   */
  List<String> getAttribute() {
    // Step 1 (skip chars)
    var c = data.skipChars((x) => x == "/" || isWhitespace(x));
    // Step 2
    if (c == ">" || c == null) {
      return null;
    }
    // Step 3
    var attrName = [];
    var attrValue = [];
    // Step 4 attribute name
    while (true) {
      if (c == null) {
        return null;
      } else if (c == "=" && attrName.length > 0) {
        break;
      } else if (isWhitespace(c)) {
        // Step 6! Stay on the first byte after the whitespace so that
        // step 7 tests that byte itself; advancing once more here would
        // step over the "=" in `name = value` and lose the value.
        c = data.skipChars();
        break;
      } else if (c == "/" || c == ">") {
        return [attrName.join(), ""];
      } else if (isLetter(c)) {
        attrName.add(c.toLowerCase());
      } else {
        attrName.add(c);
      }
      // Step 5
      c = data.next();
    }
    // Step 7
    if (c != "=") {
      data.previous();
      return [attrName.join(), ""];
    }
    // Step 8
    data.next();
    // Step 9
    c = data.skipChars();
    // Step 10
    if (c == "'" || c == '"') {
      // 10.1
      var quoteChar = c;
      while (true) {
        // 10.2
        c = data.next();
        if (c == quoteChar) {
          // 10.3
          data.next();
          return [attrName.join(), attrValue.join()];
        } else if (isLetter(c)) {
          // 10.4
          attrValue.add(c.toLowerCase());
        } else {
          // 10.5
          attrValue.add(c);
        }
      }
    } else if (c == ">") {
      return [attrName.join(), ""];
    } else if (c == null) {
      return null;
    } else if (isLetter(c)) {
      attrValue.add(c.toLowerCase());
    } else {
      attrValue.add(c);
    }
    // Step 11
    while (true) {
      c = data.next();
      if (isSpaceOrAngleBracket(c)) {
        return [attrName.join(), attrValue.join()];
      } else if (c == null) {
        return null;
      } else if (isLetter(c)) {
        attrValue.add(c.toLowerCase());
      } else {
        attrValue.add(c);
      }
    }
  }
}
333
334
/**
 * Pulls the value that follows "charset" out of the text held in [data].
 */
class ContentAttrParser {
  final EncodingBytes data;

  ContentAttrParser(this.data);

  /**
   * Returns the text found after "charset=", or null when there is no
   * "charset", no "=" after it, or the data runs out first.
   */
  String parse() {
    try {
      // Locate the attribute name; jumpTo throws StateError if absent.
      data.jumpTo("charset");
      _stepAndSkipWhitespace();
      if (data.currentByte != "=") {
        // No = sign, so there is nothing to read here.
        return null;
      }
      _stepAndSkipWhitespace();
      var first = data.currentByte;
      var isQuoted = first == '"' || first == "'";
      return isQuoted ? _quotedValue(first) : _unquotedValue();
    } on StateError catch (_) {
      return null;
    }
  }

  /** Moves one position forward, then skips any whitespace. */
  void _stepAndSkipWhitespace() {
    data.position += 1;
    data.skipChars();
  }

  /** Reads the text between the current [quoteMark] and its match. */
  String _quotedValue(String quoteMark) {
    data.position += 1;
    var begin = data.position;
    if (!data.jumpTo(quoteMark)) {
      return null;
    }
    return data.slice(begin, data.position);
  }

  /** Reads the text from the current position up to the next whitespace. */
  String _unquotedValue() {
    var begin = data.position;
    try {
      data.skipUntil(isWhitespace);
      return data.slice(begin, data.position);
    } on StateError catch (_) {
      // Ran out of data: take everything that is left.
      return data.slice(begin);
    }
  }
}
379
380
/** Whether [char] is ">", "<", or accepted by [isWhitespace]. */
bool isSpaceOrAngleBracket(String char) {
  if (char == ">" || char == "<") {
    return true;
  }
  return isWhitespace(char);
}
384
/** Signature of a test applied to a single character, passed as a [String]. */
typedef bool CharPreciate(String char);
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698