OLD | NEW |
(Empty) | |
| 1 library encoding_parser; |
| 2 |
| 3 import 'dart:collection'; |
| 4 import 'constants.dart'; |
| 5 import 'inputstream.dart'; |
| 6 import 'utils.dart'; |
| 7 |
| 8 // TODO(jmesserly): I converted StopIteration to StateError("No more elements"). |
| 9 // Seems strange to throw this from outside of an iterator though. |
/**
 * String-like object with an associated cursor position and various extra
 * methods for scanning.
 *
 * The cursor starts at -1, i.e. before the first character. Most operations
 * throw a [StateError] ("No more elements") once the cursor has moved past
 * the end of the string; this stands in for Python's StopIteration.
 */
class EncodingBytes extends IterableBase<String> {
  final String _bytes;
  int _position = -1;

  EncodingBytes(this._bytes);

  /**
   * Iterates over the individual characters of the underlying string.
   * This iterator is independent of the cursor: it neither reads nor moves
   * [position].
   */
  Iterator<String> get iterator => _bytes.split('').iterator;

  int get length => _bytes.length;

  /**
   * Advances the cursor by one and returns the character at the new position.
   * Throws a [StateError] if that moves the cursor past the end, and a
   * [RangeError] if the new position is still negative.
   */
  String next() {
    var p = _position = _position + 1;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    return _bytes[p];
  }

  /**
   * Moves the cursor back by one and returns the character at the new
   * position. The checks run against the position *before* stepping back:
   * a [StateError] is thrown if it is past the end, a [RangeError] if it is
   * negative.
   */
  String previous() {
    var p = _position;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    _position = p = p - 1;
    return _bytes[p];
  }

  /**
   * Sets the cursor. Throws a [StateError] if the *current* cursor is already
   * past the end; the new [value] itself is not validated.
   */
  set position(int value) {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    _position = value;
  }

  /**
   * The cursor position, reported as 0 while the cursor is still before the
   * first character. Throws a [StateError] if the cursor is past the end.
   */
  int get position {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    if (_position >= 0) {
      return _position;
    } else {
      return 0;
    }
  }

  /** The character under the cursor (see [position] for error behavior). */
  String get currentByte => _bytes[position];

  /**
   * Skip past a list of characters. Defaults to skipping [isWhitespace].
   *
   * Returns the first character that is not skipped and leaves the cursor on
   * it, or returns null with the cursor at the end if everything was skipped.
   */
  String skipChars([CharPreciate skipChars]) {
    if (skipChars == null) skipChars = isWhitespace;
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (!skipChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    _position = p;
    return null;
  }

  /**
   * Scans forward until a character accepted by [untilChars] is found.
   *
   * Returns that character and leaves the cursor on it, or returns null with
   * the cursor at the end if no character matched.
   */
  String skipUntil(CharPreciate untilChars) {
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (untilChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    // Nothing matched: park the cursor at the end, exactly like skipChars
    // does, so later reads of [position] signal exhaustion.
    _position = p;
    return null;
  }

  /**
   * Look for a sequence of bytes at the current position. If the bytes
   * are found return true and advance the position to the byte after the
   * match. Otherwise return false and leave the position alone.
   */
  bool matchBytes(String bytes) {
    var p = position;
    if (_bytes.length < p + bytes.length) {
      return false;
    }
    var data = _bytes.substring(p, p + bytes.length);
    if (data == bytes) {
      position += bytes.length;
      return true;
    }
    return false;
  }

  /**
   * Look for the next sequence of bytes matching a given sequence. If
   * a match is found advance the position to the last byte of the match
   * and return true. If there is no match a [StateError] is thrown; this
   * method never returns false.
   */
  bool jumpTo(String bytes) {
    var newPosition = _bytes.indexOf(bytes, position);
    if (newPosition >= 0) {
      _position = newPosition + bytes.length - 1;
      return true;
    } else {
      throw new StateError("No more elements");
    }
  }

  /**
   * Returns the characters from index [start] (inclusive) to index [end]
   * (exclusive). [end] defaults to [length]; a negative [end] counts back
   * from the end of the string.
   */
  String slice(int start, [int end]) {
    if (end == null) end = length;
    if (end < 0) end += length;
    // String.substring takes an end *index*, not a length.
    return _bytes.substring(start, end);
  }
}
| 133 |
/**
 * Mini parser for detecting character encoding from meta elements.
 *
 * The input is scanned for markup constructs; comments, other tags and
 * declarations are skipped, and `<meta>` attributes are inspected for a
 * `charset` or `content` attribute naming a codec that [codecName] accepts.
 */
class EncodingParser {
  final EncodingBytes data;

  /** The detected encoding; stays null until a meta element supplies one. */
  String encoding;

  /** [bytes] - the data to work on for encoding detection. */
  EncodingParser(List<int> bytes)
      // Note: this is intentionally interpreting bytes as codepoints.
      : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

  /**
   * Scans [data] and returns the detected encoding, or null if none was
   * found before the data ran out.
   */
  String getEncoding() {
    // Order matters: longer/more specific prefixes are tried first.
    final methodDispatch = <List>[
      ["<!--", handleComment],
      ["<meta", handleMeta],
      ["</", handlePossibleEndTag],
      ["<!", handleOther],
      ["<?", handleOther],
      ["<", handlePossibleStartTag]
    ];

    try {
      while (true) {
        // Step the cursor forward on every pass. Iterating over `data`
        // itself does not move the cursor, which would leave the scan stuck
        // at the same byte. next() throws StateError at the end of the data,
        // which terminates the loop via the outer catch.
        data.next();
        var keepParsing = true;
        for (var dispatch in methodDispatch) {
          if (data.matchBytes(dispatch[0])) {
            try {
              keepParsing = dispatch[1]();
            } on StateError catch (_) {
              // The handler ran off the end of the data.
              keepParsing = false;
            }
            break;
          }
        }
        if (!keepParsing) {
          break;
        }
      }
    } on StateError catch (_) {
      // Catch this here to match behavior of Python's StopIteration
    }
    return encoding;
  }

  /** Skip over comments. */
  bool handleComment() => data.jumpTo("-->");

  /**
   * Inspects the attributes of a `<meta` element. Returns false (stop
   * parsing) once an encoding has been stored in [encoding], true otherwise.
   */
  bool handleMeta() {
    if (!isWhitespace(data.currentByte)) {
      // if we have <meta not followed by a space so just keep going
      return true;
    }
    // We have a valid meta element we want to search for attributes
    while (true) {
      // Try to find the next attribute after the current position
      var attr = getAttribute();
      if (attr == null) return true;

      if (attr[0] == "charset") {
        var tentativeEncoding = attr[1];
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      } else if (attr[0] == "content") {
        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
        var tentativeEncoding = contentParser.parse();
        // parse() yields null when the content value has no usable charset.
        if (tentativeEncoding != null) {
          var codec = codecName(tentativeEncoding);
          if (codec != null) {
            encoding = codec;
            return false;
          }
        }
      }
    }
  }

  bool handlePossibleStartTag() => handlePossibleTag(false);

  bool handlePossibleEndTag() {
    data.next();
    return handlePossibleTag(true);
  }

  /**
   * Skips over a start or end tag, including its attributes. Always returns
   * true (keep parsing).
   */
  bool handlePossibleTag(bool endTag) {
    if (!isLetter(data.currentByte)) {
      //If the next byte is not an ascii letter either ignore this
      //fragment (possible start tag case) or treat it according to
      //handleOther
      if (endTag) {
        data.previous();
        handleOther();
      }
      return true;
    }

    var c = data.skipUntil(isSpaceOrAngleBracket);
    if (c == "<") {
      // return to the first step in the overall "two step" algorithm
      // reprocessing the < byte
      data.previous();
    } else {
      //Read all attributes
      var attr = getAttribute();
      while (attr != null) {
        attr = getAttribute();
      }
    }
    return true;
  }

  /** Skips to the next `>`. */
  bool handleOther() => data.jumpTo(">");

  /**
   * Return a name,value pair for the next attribute in the stream,
   * if one is found, or null
   */
  List<String> getAttribute() {
    // Step 1 (skip chars)
    var c = data.skipChars((x) => x == "/" || isWhitespace(x));
    // Step 2
    if (c == ">" || c == null) {
      return null;
    }
    // Step 3
    var attrName = [];
    var attrValue = [];
    // Step 4 attribute name
    while (true) {
      if (c == null) {
        return null;
      } else if (c == "=" && attrName.length > 0) {
        break;
      } else if (isWhitespace(c)) {
        // Step 6! Skip the whitespace after the name; c is now the first
        // non-space byte, which step 7 must examine as-is. Advancing once
        // more here would throw that byte away.
        c = data.skipChars();
        break;
      } else if (c == "/" || c == ">") {
        return [attrName.join(), ""];
      } else if (isLetter(c)) {
        attrName.add(c.toLowerCase());
      } else {
        attrName.add(c);
      }
      // Step 5
      c = data.next();
    }
    // Step 7
    if (c != "=") {
      data.previous();
      return [attrName.join(), ""];
    }
    // Step 8
    data.next();
    // Step 9
    c = data.skipChars();
    // Step 10
    if (c == "'" || c == '"') {
      // 10.1
      var quoteChar = c;
      while (true) {
        // 10.2
        c = data.next();
        if (c == quoteChar) {
          // 10.3
          data.next();
          return [attrName.join(), attrValue.join()];
        } else if (isLetter(c)) {
          // 10.4
          attrValue.add(c.toLowerCase());
        } else {
          // 10.5
          attrValue.add(c);
        }
      }
    } else if (c == ">") {
      return [attrName.join(), ""];
    } else if (c == null) {
      return null;
    } else if (isLetter(c)) {
      attrValue.add(c.toLowerCase());
    } else {
      attrValue.add(c);
    }
    // Step 11
    while (true) {
      c = data.next();
      if (isSpaceOrAngleBracket(c)) {
        return [attrName.join(), attrValue.join()];
      } else if (c == null) {
        return null;
      } else if (isLetter(c)) {
        attrValue.add(c.toLowerCase());
      } else {
        attrValue.add(c);
      }
    }
  }
}
| 333 |
| 334 |
/**
 * Extracts the `charset=...` value from the text of a meta element's
 * `content` attribute.
 */
class ContentAttrParser {
  final EncodingBytes data;

  ContentAttrParser(this.data);

  /**
   * Returns the charset value found in [data], or null if there is no
   * `charset` followed by `=`, or if the data runs out while parsing.
   */
  String parse() {
    try {
      // Check if the attr name is charset
      // otherwise return
      data.jumpTo("charset");
      data.position += 1;
      data.skipChars();
      if (data.currentByte != "=") {
        // If there is no = sign keep looking for attrs
        return null;
      }
      data.position += 1;
      data.skipChars();
      // Look for an encoding between matching quote marks
      if (data.currentByte == '"' || data.currentByte == "'") {
        var quoteMark = data.currentByte;
        data.position += 1;
        var oldPosition = data.position;
        if (data.jumpTo(quoteMark)) {
          return data.slice(oldPosition, data.position);
        } else {
          return null;
        }
      } else {
        // Unquoted value
        var oldPosition = data.position;
        try {
          if (data.skipUntil(isWhitespace) == null) {
            // No whitespace follows: the value runs to the end of the data.
            return data.slice(oldPosition);
          }
          return data.slice(oldPosition, data.position);
        } on StateError catch (_) {
          //Return the whole remaining value
          return data.slice(oldPosition);
        }
      }
    } on StateError catch (_) {
      return null;
    }
  }
}
| 379 |
| 380 |
/**
 * Returns true when [char] is `>` or `<`, or when [isWhitespace] accepts it.
 */
bool isSpaceOrAngleBracket(String char) {
  if (char == ">" || char == "<") return true;
  return isWhitespace(char);
}

/**
 * Signature of a test applied to a single character, as taken by
 * [EncodingBytes.skipChars] and [EncodingBytes.skipUntil].
 */
typedef bool CharPreciate(String char);
OLD | NEW |