OLD | NEW |
| (Empty) |
1 library encoding_parser; | |
2 | |
3 import 'constants.dart'; | |
4 import 'inputstream.dart'; | |
5 | |
6 // TODO(jmesserly): I converted StopIteration to StateError("No more elements"). | |
7 // Seems strange to throw this from outside of an iterator though. | |
/// String-like object with an associated position and various extra methods.
///
/// The position starts just before the first element. Once the position is
/// at or beyond [length], reading it (directly, or through any member that
/// consults it) throws a [StateError]; the parsers in this library treat that
/// as the end-of-input signal.
class EncodingBytes {
  /// The data being scanned, one element per string index.
  final String _bytes;

  /// Raw cursor index; `-1` means no element has been consumed yet.
  int _position = -1;

  EncodingBytes(this._bytes);

  /// The number of elements in the underlying data.
  int get length => _bytes.length;

  /// Advances the position by one and returns the element found there.
  ///
  /// Throws a [StateError] when that moves the position to or past [length].
  String next() {
    var p = _position = _position + 1;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    return _bytes[p];
  }

  /// Moves the position back by one and returns the element found there.
  ///
  /// Throws a [StateError] if the position is already at or past [length],
  /// and a [RangeError] if it is still before the first element.
  String previous() {
    var p = _position;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    // Note: stepping back from index 0 leaves the position at -1 and the
    // lookup below then fails with a RangeError.
    _position = p = p - 1;
    return _bytes[p];
  }

  /// Moves the position to [value].
  ///
  /// The end-of-input check is made against the position held *before* the
  /// assignment; [value] itself is not validated, so the position may be set
  /// to [length] and the next read will then throw a [StateError].
  set position(int value) {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    _position = value;
  }

  /// The current position, with the initial "before the start" state
  /// reported as `0`.
  ///
  /// Throws a [StateError] if the position is at or past [length].
  int get position {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    if (_position >= 0) {
      return _position;
    } else {
      return 0;
    }
  }

  /// The element at the current [position].
  String get currentByte => _bytes[position];

  /// Skip past a list of characters. Defaults to skipping [isWhitespace].
  ///
  /// Returns the first element that is not skipped and leaves the position on
  /// it. Returns null, with the position moved to [length], when every
  /// remaining element is skipped.
  String skipChars([CharPreciate skipChars]) {
    if (skipChars == null) skipChars = isWhitespace;
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (!skipChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    _position = p;
    return null;
  }

  /// Skips forward until an element satisfies [untilChars].
  ///
  /// Returns that element and leaves the position on it. Returns null, with
  /// the position moved to [length], when no remaining element matches.
  String skipUntil(CharPreciate untilChars) {
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (untilChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    // Record that the scan consumed the rest of the data (as skipChars
    // does). Without this the position stays where the scan started, and a
    // caller cannot tell "nothing matched" from "matched immediately".
    _position = p;
    return null;
  }

  /// Look for a sequence of bytes at the start of a string. If the bytes
  /// are found return true and advance the position to the byte after the
  /// match. Otherwise return false and leave the position alone.
  bool matchBytes(String bytes) {
    var p = position;
    if (_bytes.length < p + bytes.length) {
      return false;
    }
    var data = _bytes.substring(p, p + bytes.length);
    if (data == bytes) {
      position += bytes.length;
      return true;
    }
    return false;
  }

  /// Look for the next sequence of bytes matching a given sequence. If
  /// a match is found advance the position to the last byte of the match
  /// and return true.
  ///
  /// Throws a [StateError] when there is no match at or after the current
  /// position; this method never returns false.
  bool jumpTo(String bytes) {
    var newPosition = _bytes.indexOf(bytes, position);
    if (newPosition >= 0) {
      _position = newPosition + bytes.length - 1;
      return true;
    } else {
      throw new StateError("No more elements");
    }
  }

  /// Returns the elements from index [start] (inclusive) up to index [end]
  /// (exclusive).
  ///
  /// [end] defaults to [length]; a negative [end] counts back from the end of
  /// the data.
  String slice(int start, [int end]) {
    if (end == null) end = length;
    if (end < 0) end += length;
    // String.substring takes an end *index*, not a length.
    return _bytes.substring(start, end);
  }
}
123 | |
/// Mini parser for detecting character encoding from meta elements.
class EncodingParser {
  /// The lower-cased input, with each byte treated as one code point.
  final EncodingBytes data;

  /// The encoding detected so far, or null when none has been found.
  String encoding;

  /// [bytes] - the data to work on for encoding detection.
  EncodingParser(List<int> bytes)
      // Note: this is intentionally interpreting bytes as codepoints.
      : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

  /// Scans [data] for an encoding declaration.
  ///
  /// Returns the detected encoding, or null if the input is exhausted before
  /// one is found.
  String getEncoding() {
    // The first matching prefix wins, so the longer, more specific prefixes
    // have to be listed before the bare "<".
    final methodDispatch = [
      ["<!--", handleComment],
      ["<meta", handleMeta],
      ["</", handlePossibleEndTag],
      ["<!", handleOther],
      ["<?", handleOther],
      ["<", handlePossibleStartTag]
    ];

    try {
      for (;;) {
        for (var dispatch in methodDispatch) {
          if (data.matchBytes(dispatch[0])) {
            var keepParsing = dispatch[1]();
            if (keepParsing) break;

            // We found an encoding. Stop.
            return encoding;
          }
        }
        data.position += 1;
      }
    } on StateError catch (_) {
      // Running off the end of the input is how this loop normally ends;
      // catching it here matches the behavior of Python's StopIteration.
      // TODO(jmesserly): refactor to not use exceptions
    }
    return encoding;
  }

  /// Skip over comments.
  bool handleComment() => data.jumpTo("-->");

  /// Examines the attributes of a `<meta` tag for an encoding.
  ///
  /// Returns false once [encoding] has been set, true to keep scanning.
  bool handleMeta() {
    if (!isWhitespace(data.currentByte)) {
      // "<meta" is not followed by whitespace, so this is some other tag;
      // just keep going.
      return true;
    }
    // We have a valid meta element we want to search for attributes
    while (true) {
      // Try to find the next attribute after the current position
      var attr = getAttribute();
      if (attr == null) return true;

      if (attr[0] == "charset") {
        var tentativeEncoding = attr[1];
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      } else if (attr[0] == "content") {
        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
        var tentativeEncoding = contentParser.parse();
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      }
    }
    return true; // unreachable
  }

  /// Handles input that begins with `<`.
  bool handlePossibleStartTag() => handlePossibleTag(false);

  /// Handles input that begins with `</`.
  bool handlePossibleEndTag() {
    data.next();
    return handlePossibleTag(true);
  }

  /// Skips over a tag and its attributes. Always returns true.
  bool handlePossibleTag(bool endTag) {
    if (!isLetter(data.currentByte)) {
      // If the next byte is not an ascii letter either ignore this
      // fragment (possible start tag case) or treat it according to
      // handleOther
      if (endTag) {
        data.previous();
        handleOther();
      }
      return true;
    }

    var c = data.skipUntil(isSpaceOrAngleBracket);
    if (c == "<") {
      // return to the first step in the overall "two step" algorithm
      // reprocessing the < byte
      data.previous();
    } else {
      // Read all attributes
      var attr = getAttribute();
      while (attr != null) {
        attr = getAttribute();
      }
    }
    return true;
  }

  /// Skips to the next `>`.
  bool handleOther() => data.jumpTo(">");

  /// Return a name,value pair for the next attribute in the stream,
  /// if one is found, or null
  List<String> getAttribute() {
    // Step 1 (skip chars)
    var c = data.skipChars((x) => x == "/" || isWhitespace(x));
    // Step 2
    if (c == ">" || c == null) {
      return null;
    }
    // Step 3
    var attrName = [];
    var attrValue = [];
    // Step 4 attribute name
    while (true) {
      if (c == null) {
        return null;
      } else if (c == "=" && attrName.length > 0) {
        break;
      } else if (isWhitespace(c)) {
        // Step 6: skip the whitespace after the name. `c` becomes the first
        // non-whitespace element (or null at end of input) and the position
        // stays on it, so step 7 tests the right element for "=".
        c = data.skipChars();
        break;
      } else if (c == "/" || c == ">") {
        return [attrName.join(), ""];
      } else if (isLetter(c)) {
        attrName.add(c.toLowerCase());
      } else {
        attrName.add(c);
      }
      // Step 5
      c = data.next();
    }
    // Step 7
    if (c != "=") {
      // No value: step back so the next call starts before this element.
      data.previous();
      return [attrName.join(), ""];
    }
    // Step 8
    data.next();
    // Step 9
    c = data.skipChars();
    // Step 10
    if (c == "'" || c == '"') {
      // 10.1
      var quoteChar = c;
      while (true) {
        // 10.2
        c = data.next();
        if (c == quoteChar) {
          // 10.3
          data.next();
          return [attrName.join(), attrValue.join()];
        } else if (isLetter(c)) {
          // 10.4
          attrValue.add(c.toLowerCase());
        } else {
          // 10.5
          attrValue.add(c);
        }
      }
    } else if (c == ">") {
      return [attrName.join(), ""];
    } else if (c == null) {
      return null;
    } else if (isLetter(c)) {
      attrValue.add(c.toLowerCase());
    } else {
      attrValue.add(c);
    }
    // Step 11
    while (true) {
      c = data.next();
      if (isSpaceOrAngleBracket(c)) {
        return [attrName.join(), attrValue.join()];
      } else if (c == null) {
        return null;
      } else if (isLetter(c)) {
        attrValue.add(c.toLowerCase());
      } else {
        attrValue.add(c);
      }
    }
    return null; // unreachable
  }
}
320 | |
/// Pulls the value that follows `charset` out of the text held in [data].
class ContentAttrParser {
  final EncodingBytes data;

  ContentAttrParser(this.data);

  /// Returns the text following `charset=`, or null when it cannot be found.
  ///
  /// A value that starts with a quote mark runs up to the matching quote;
  /// any other value runs up to the next whitespace.
  String parse() {
    try {
      // Move onto the last character of "charset", then one past it.
      data.jumpTo("charset");
      data.position += 1;
      data.skipChars();

      // Without an "=" there is no value to read.
      if (data.currentByte != "=") return null;

      data.position += 1;
      data.skipChars();

      var opener = data.currentByte;
      if (opener == '"' || opener == "'") {
        // Quoted value: everything up to the matching quote mark.
        data.position += 1;
        var valueStart = data.position;
        if (!data.jumpTo(opener)) return null;
        return data.slice(valueStart, data.position);
      }

      // Unquoted value
      var valueStart = data.position;
      try {
        data.skipUntil(isWhitespace);
        return data.slice(valueStart, data.position);
      } on StateError catch (_) {
        // Return the whole remaining value
        return data.slice(valueStart);
      }
    } on StateError catch (_) {
      // The input ran out before a complete value was seen.
      return null;
    }
  }
}
365 | |
/// Whether [char] is `>`, `<`, or accepted by [isWhitespace].
bool isSpaceOrAngleBracket(String char) =>
    char == ">" || char == "<" || isWhitespace(char);
369 | |
/// Signature of a test applied to a single-character string.
///
/// The spelling of the name is kept as-is because other declarations in
/// this library refer to it.
typedef bool CharPreciate(String char);
OLD | NEW |