Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(948)

Side by Side Diff: observatory_pub_packages/html5lib/src/encoding_parser.dart

Issue 816693004: Add observatory_pub_packages snapshot to third_party (Closed) Base URL: http://dart.googlecode.com/svn/third_party/
Patch Set: Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 library encoding_parser;
2
3 import 'dart:collection';
4 import 'constants.dart';
5 import 'inputstream.dart';
6
7 // TODO(jmesserly): I converted StopIteration to StateError("No more elements").
8 // Seems strange to throw this from outside of an iterator though.
/// String-like object with an associated position and various extra methods.
///
/// The position starts just before the first character. Once the position
/// has reached or passed the end of the string, reading it through
/// [position], or moving it with [next], [previous] or the [position] setter,
/// throws a [StateError].
class EncodingBytes extends IterableBase<String> {
  final String _bytes;

  /// Index of the current character; -1 means "before the first character".
  int _position = -1;

  EncodingBytes(this._bytes);

  /// Iterates over the characters of the underlying string.
  ///
  /// The iterator walks a split copy of the string; it neither reads nor
  /// moves [position].
  Iterator<String> get iterator => _bytes.split('').iterator;

  int get length => _bytes.length;

  /// Advances the position by one and returns the character found there.
  ///
  /// Throws a [StateError] if the new position is at or past the end. The
  /// position is advanced even when the error is thrown.
  String next() {
    var p = _position = _position + 1;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    return _bytes[p];
  }

  /// Moves the position back by one and returns the character found there.
  ///
  /// Throws a [StateError] if the position is at or past the end, and a
  /// [RangeError] if it is still before the first character.
  String previous() {
    var p = _position;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    _position = p = p - 1;
    return _bytes[p];
  }

  /// Sets the position to [value].
  ///
  /// Throws a [StateError] if the *current* position is already at or past
  /// the end; [value] itself is not validated.
  set position(int value) {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    _position = value;
  }

  /// The current position, reported as 0 while it is still negative.
  ///
  /// Throws a [StateError] if the position is at or past the end.
  int get position {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    if (_position >= 0) {
      return _position;
    } else {
      return 0;
    }
  }

  /// The character at [position].
  String get currentByte => _bytes[position];

  /// Skip past a list of characters. Defaults to skipping [isWhitespace].
  ///
  /// Returns the first character that is not skipped and leaves the position
  /// on it. If every remaining character is skipped, moves the position to
  /// the end and returns null.
  String skipChars([CharPreciate skipChars]) {
    if (skipChars == null) skipChars = isWhitespace;
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (!skipChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    _position = p;
    return null;
  }

  /// Skips forward until a character accepted by [untilChars] is found.
  ///
  /// Returns that character and leaves the position on it. If no remaining
  /// character is accepted, moves the position to the end and returns null,
  /// mirroring [skipChars].
  String skipUntil(CharPreciate untilChars) {
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (untilChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    // Record that the input is exhausted so that later reads of [position]
    // throw, which callers use to detect "ran off the end".
    _position = p;
    return null;
  }

  /// Look for a sequence of bytes at the start of a string. If the bytes
  /// are found return true and advance the position to the byte after the
  /// match. Otherwise return false and leave the position alone.
  bool matchBytes(String bytes) {
    var p = position;
    if (!_bytes.startsWith(bytes, p)) {
      return false;
    }
    position += bytes.length;
    return true;
  }

  /// Look for the next sequence of bytes matching a given sequence. If
  /// a match is found advance the position to the last byte of the match
  /// and return true.
  ///
  /// Throws a [StateError] if there is no match; this method never returns
  /// false.
  bool jumpTo(String bytes) {
    var newPosition = _bytes.indexOf(bytes, position);
    if (newPosition >= 0) {
      _position = newPosition + bytes.length - 1;
      return true;
    } else {
      throw new StateError("No more elements");
    }
  }

  /// Returns the characters from index [start] up to, but not including,
  /// index [end].
  ///
  /// [end] defaults to [length]; a negative [end] counts back from the end
  /// of the string.
  String slice(int start, [int end]) {
    if (end == null) end = length;
    if (end < 0) end += length;
    // String.substring takes an end index, not a length.
    return _bytes.substring(start, end);
  }
}
126
/// Mini parser for detecting character encoding from meta elements.
class EncodingParser {
  /// The input being scanned: the constructor's bytes, each interpreted as
  /// one code point, lower-cased.
  final EncodingBytes data;

  /// The detected encoding. Assigned by [handleMeta] when [codecName] returns
  /// a non-null name for a charset value; null otherwise.
  String encoding;

  /// [bytes] - the data to work on for encoding detection.
  EncodingParser(List<int> bytes)
      // Note: this is intentionally interpreting bytes as codepoints.
      : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

  /// Scans [data] for a meta element declaring a charset and returns the
  /// resulting [encoding], or null if none was found before the input ended.
  String getEncoding() {
    // Prefixes are tried in order, so a longer prefix must come before any
    // shorter prefix it starts with ("<!--" before "<!" before "<").
    final prefixes = ["<!--", "<meta", "</", "<!", "<?", "<"];
    final handlers = [
      handleComment,
      handleMeta,
      handlePossibleEndTag,
      handleOther,
      handleOther,
      handlePossibleStartTag
    ];

    try {
      while (true) {
        // Step to the next byte on every pass. The handlers leave the
        // position on the last byte they consumed, so without this step the
        // scan would keep re-testing the same offset.
        data.next();
        for (var i = 0; i < prefixes.length; i++) {
          if (data.matchBytes(prefixes[i])) {
            // A handler returns false once an encoding has been found.
            if (!handlers[i]()) return encoding;
            break;
          }
        }
      }
    } on StateError catch (_) {
      // Running off the end of the input ends the scan; this matches the
      // behavior of Python's StopIteration.
    }
    return encoding;
  }

  /// Skip over comments.
  bool handleComment() => data.jumpTo("-->");

  /// Examines the attributes of a `<meta` tag for a charset declaration.
  ///
  /// Returns false after storing a codec in [encoding]; returns true when
  /// scanning should continue.
  bool handleMeta() {
    if (!isWhitespace(data.currentByte)) {
      // if we have <meta not followed by a space so just keep going
      return true;
    }
    // We have a valid meta element we want to search for attributes
    while (true) {
      // Try to find the next attribute after the current position
      var attr = getAttribute();
      if (attr == null) return true;

      if (attr[0] == "charset") {
        var tentativeEncoding = attr[1];
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      } else if (attr[0] == "content") {
        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
        var tentativeEncoding = contentParser.parse();
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      }
    }
    return true; // unreachable
  }

  /// Handles input that matched "<". Always returns true.
  bool handlePossibleStartTag() => handlePossibleTag(false);

  /// Handles input that matched "</": advances one byte, then delegates to
  /// [handlePossibleTag]. Always returns true.
  bool handlePossibleEndTag() {
    data.next();
    return handlePossibleTag(true);
  }

  /// Skips over a tag and its attributes. Always returns true.
  bool handlePossibleTag(bool endTag) {
    if (!isLetter(data.currentByte)) {
      // If the next byte is not an ascii letter either ignore this
      // fragment (possible start tag case) or treat it according to
      // handleOther
      if (endTag) {
        data.previous();
        handleOther();
      }
      return true;
    }

    var c = data.skipUntil(isSpaceOrAngleBracket);
    if (c == "<") {
      // return to the first step in the overall "two step" algorithm
      // reprocessing the < byte
      data.previous();
    } else {
      // Read all attributes
      var attr = getAttribute();
      while (attr != null) {
        attr = getAttribute();
      }
    }
    return true;
  }

  /// Skips to the next ">". Throws a [StateError] if there is none.
  bool handleOther() => data.jumpTo(">");

  /// Return a name,value pair for the next attribute in the stream,
  /// if one is found, or null
  List<String> getAttribute() {
    // Step 1 (skip chars)
    var c = data.skipChars((x) => x == "/" || isWhitespace(x));
    // Step 2
    if (c == ">" || c == null) {
      return null;
    }
    // Step 3
    var attrName = <String>[];
    var attrValue = <String>[];
    // Step 4 attribute name
    while (true) {
      if (c == null) {
        return null;
      } else if (c == "=" && attrName.length > 0) {
        break;
      } else if (isWhitespace(c)) {
        // Step 6: skip the whitespace after the name. The position must stay
        // on the first non-space byte so that step 7 can test it for "=".
        c = data.skipChars();
        break;
      } else if (c == "/" || c == ">") {
        return [attrName.join(), ""];
      } else if (isLetter(c)) {
        attrName.add(c.toLowerCase());
      } else {
        attrName.add(c);
      }
      // Step 5
      c = data.next();
    }
    // Step 7
    if (c != "=") {
      data.previous();
      return [attrName.join(), ""];
    }
    // Step 8
    data.next();
    // Step 9
    c = data.skipChars();
    // Step 10
    if (c == "'" || c == '"') {
      // 10.1
      var quoteChar = c;
      while (true) {
        // 10.2
        c = data.next();
        if (c == quoteChar) {
          // 10.3
          data.next();
          return [attrName.join(), attrValue.join()];
        } else if (isLetter(c)) {
          // 10.4
          attrValue.add(c.toLowerCase());
        } else {
          // 10.5
          attrValue.add(c);
        }
      }
    } else if (c == ">") {
      return [attrName.join(), ""];
    } else if (c == null) {
      return null;
    } else if (isLetter(c)) {
      attrValue.add(c.toLowerCase());
    } else {
      attrValue.add(c);
    }
    // Step 11
    while (true) {
      c = data.next();
      if (isSpaceOrAngleBracket(c)) {
        return [attrName.join(), attrValue.join()];
      } else if (c == null) {
        return null;
      } else if (isLetter(c)) {
        attrValue.add(c.toLowerCase());
      } else {
        attrValue.add(c);
      }
    }
    return null; // unreachable
  }
}
326
327
/// Pulls the charset value out of the text held in [data].
class ContentAttrParser {
  final EncodingBytes data;

  ContentAttrParser(this.data);

  /// Returns the value that follows `charset` and `=` in [data].
  ///
  /// Returns null when "charset" is not followed by "=", or when a
  /// [StateError] is raised while scanning.
  String parse() {
    try {
      // Land on the byte just after "charset", then skip whitespace.
      data.jumpTo("charset");
      data.position = data.position + 1;
      data.skipChars();
      if (data.currentByte != "=") {
        // If there is no = sign keep looking for attrs
        return null;
      }
      // Step over the "=" and any whitespace that follows it.
      data.position = data.position + 1;
      data.skipChars();

      var lead = data.currentByte;
      if (lead == '"' || lead == "'") {
        return _quotedValue(lead);
      }
      return _unquotedValue();
    } on StateError catch (_) {
      return null;
    }
  }

  /// Reads a value delimited by [quoteMark]; the position is on the opening
  /// quote when this is called.
  String _quotedValue(String quoteMark) {
    data.position = data.position + 1;
    var start = data.position;
    if (!data.jumpTo(quoteMark)) {
      return null;
    }
    return data.slice(start, data.position);
  }

  /// Reads a value that runs up to the next whitespace character.
  String _unquotedValue() {
    var start = data.position;
    try {
      data.skipUntil(isWhitespace);
      return data.slice(start, data.position);
    } on StateError catch (_) {
      // Return the whole remaining value
      return data.slice(start);
    }
  }
}
372
373
/// Whether [char] is ">", "<", or a character accepted by [isWhitespace].
bool isSpaceOrAngleBracket(String char) =>
    char == ">" || char == "<" || isWhitespace(char);

/// Signature of a test applied to a single-character string.
typedef bool CharPreciate(String char);
OLDNEW
« no previous file with comments | « observatory_pub_packages/html5lib/src/css_class_set.dart ('k') | observatory_pub_packages/html5lib/src/inputstream.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698