OLD | NEW |
(Empty) | |
| 1 library inputstream; |
| 2 |
| 3 import 'dart:collection'; |
| 4 import 'package:utf/utf.dart'; |
| 5 import 'package:source_span/source_span.dart'; |
| 6 import 'char_encodings.dart'; |
| 7 import 'constants.dart'; |
| 8 import 'utils.dart'; |
| 9 import 'encoding_parser.dart'; |
| 10 |
/// Indirection layer for functionality that lives in dart:io, so that this
/// library never has to reference dart:io directly.
class ConsoleSupport {
  /// Returns the bytes of [source].
  ///
  /// This base implementation does not read anything and always returns null.
  List<int> bytesFromFile(source) {
    return null;
  }
}

// TODO(jmesserly): use lazy init here when supported.
/// The [ConsoleSupport] instance used by this library; it is assignable so a
/// different implementation can be installed.
ConsoleSupport consoleSupport = new ConsoleSupport();
| 18 |
/// Provides a unicode stream of characters to the HtmlTokenizer.
///
/// This class takes care of character encoding and removing or replacing
/// incorrect byte-sequences and also provides column and line tracking.
class HtmlInputStream {
  /// Number of bytes to use when looking for a meta element with
  /// encoding information.
  static const int numBytesMeta = 512;

  /// Encoding to use if no other information can be found.
  static const String defaultEncoding = 'windows-1252';

  /// The name of the character encoding.
  String charEncodingName;

  /// True if we are certain about [charEncodingName], false for tentative.
  bool charEncodingCertain = true;

  /// Value of the `generateSpans` constructor argument.
  ///
  /// This class only stores the flag; see the TODO in [reset].
  final bool generateSpans;

  /// Location where the contents of the stream were found.
  final String sourceUrl;

  /// The undecoded input bytes, or null when the stream was created from a
  /// [String].
  List<int> _rawBytes;

  /// Raw UTF-16 codes, used if a Dart String is passed in.
  Iterable<int> _rawChars;

  /// Error codes collected by the last call to [reset].
  Queue<String> errors;

  /// Source file built from the normalized characters by [reset].
  SourceFile fileInfo;

  /// Offsets into [_chars] at which a line begins; always starts with 0.
  List<int> _lineStarts;

  /// The normalized characters served by [char] and [peekChar].
  List<int> _chars;

  /// Index into [_chars] of the next character to be read.
  int _offset;

  /// Initialises the HtmlInputStream.
  ///
  /// HtmlInputStream(source, [encoding]) -> Normalized stream from source
  /// for use by html5lib.
  ///
  /// [source] can be either a [String] or a [List<int>] containing the raw
  /// bytes, or a file if [consoleSupport] is initialized.
  ///
  /// The optional encoding parameter must be a string that indicates
  /// the encoding. If specified, that encoding will be used,
  /// regardless of any BOM or later declaration (such as in a meta
  /// element)
  ///
  /// [parseMeta] - Look for a <meta> element containing encoding information
  ///
  /// Throws an [ArgumentError] if [source] is neither a [String] nor a
  /// [List<int>] and [consoleSupport] cannot turn it into bytes.
  HtmlInputStream(source,
      [String encoding,
      bool parseMeta = true,
      this.generateSpans = false,
      this.sourceUrl])
      : charEncodingName = codecName(encoding) {
    if (source is String) {
      _rawChars = toCodepoints(source);
      // A String input always overrides any [encoding] that was passed in.
      charEncodingName = 'utf-8';
      charEncodingCertain = true;
    } else if (source is List<int>) {
      _rawBytes = source;
    } else {
      // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
      // but it's necessary because of how the UTF decoders work.
      _rawBytes = consoleSupport.bytesFromFile(source);

      if (_rawBytes == null) {
        // TODO(jmesserly): we should accept some kind of stream API too.
        // Unfortunately dart:io InputStream is async only, which won't work.
        throw new ArgumentError("'source' must be a String or "
            "List<int> (of bytes). You can also pass a RandomAccessFile if you "
            "`import 'package:html/parser_console.dart'` and call "
            "`useConsole()`.");
      }
    }

    // Detect encoding iff no explicit "transport level" encoding is supplied
    if (charEncodingName == null) {
      detectEncoding(parseMeta);
    }

    reset();
  }

  /// Rewinds the stream to offset 0 and rebuilds the normalized characters.
  ///
  /// Bytes are decoded with [charEncodingName] when no decoded characters are
  /// cached. [errors], the line starts and [fileInfo] are all recreated.
  void reset() {
    errors = new Queue<String>();

    _offset = 0;
    _lineStarts = <int>[0];
    _chars = <int>[];

    if (_rawChars == null) {
      _rawChars = decodeBytes(charEncodingName, _rawBytes);
    }

    // Set right after a RETURN so that the NEWLINE of a RETURN + NEWLINE pair
    // is dropped rather than producing a second line break.
    bool skipNewline = false;
    for (var c in _rawChars) {
      if (skipNewline) {
        skipNewline = false;
        if (c == NEWLINE) continue;
      }

      if (invalidUnicode(c)) errors.add('invalid-codepoint');

      if (0xD800 <= c && c <= 0xDFFF) {
        // Surrogate code points are replaced with U+FFFD.
        c = 0xFFFD;
      } else if (c == RETURN) {
        skipNewline = true;
        c = NEWLINE;
      }

      _chars.add(c);
      if (c == NEWLINE) _lineStarts.add(_chars.length);
    }

    // Free decoded characters if they aren't needed anymore.
    if (_rawBytes != null) _rawChars = null;

    // TODO(sigmund): Don't parse the file at all if spans aren't being
    // generated.
    fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);
  }

  /// Determines [charEncodingName] from the raw bytes.
  ///
  /// A BOM is tried first and is treated as certain. If [parseMeta] is true, a
  /// meta element is tried next, and finally [defaultEncoding] is used; both
  /// of these leave [charEncodingCertain] false.
  void detectEncoding([bool parseMeta = true]) {
    // First look for a BOM
    // This will also read past the BOM if present
    charEncodingName = detectBOM();
    charEncodingCertain = true;

    // If there is no BOM need to look for meta elements with encoding
    // information
    if (charEncodingName == null && parseMeta) {
      charEncodingName = detectEncodingMeta();
      charEncodingCertain = false;
    }
    // If all else fails use the default encoding
    if (charEncodingName == null) {
      charEncodingCertain = false;
      charEncodingName = defaultEncoding;
    }

    // Substitute for equivalent encodings:
    if (charEncodingName.toLowerCase() == 'iso-8859-1') {
      charEncodingName = 'windows-1252';
    }
  }

  /// Switches the stream to [newEncoding].
  ///
  /// Nothing happens if [newEncoding] is not a known codec name. If it equals
  /// the current encoding, that encoding is simply marked as certain.
  /// Otherwise the bytes are re-decoded through [reset] and a
  /// [ReparseException] is thrown.
  ///
  /// Throws a [StateError] if the stream was created from a [String].
  void changeEncoding(String newEncoding) {
    if (_rawBytes == null) {
      // We should never get here -- if encoding is certain we won't try to
      // change it.
      throw new StateError('cannot change encoding when parsing a String.');
    }

    newEncoding = codecName(newEncoding);
    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
      newEncoding = 'utf-8';
    }
    if (newEncoding == null) {
      return;
    } else if (newEncoding == charEncodingName) {
      charEncodingCertain = true;
    } else {
      // Remember the old name before overwriting it, so the exception message
      // reports both encodings instead of the new one twice.
      final previousEncoding = charEncodingName;
      charEncodingName = newEncoding;
      charEncodingCertain = true;
      _rawChars = null;
      reset();
      throw new ReparseException(
          'Encoding changed from $previousEncoding to $newEncoding');
    }
  }

  /// Attempts to detect a BOM at the start of the stream. If
  /// an encoding can be determined from the BOM return the name of the
  /// encoding otherwise return null.
  String detectBOM() {
    // Try detecting the BOM using bytes from the string
    if (hasUtf8Bom(_rawBytes)) {
      return 'utf-8';
    }
    // Note: we don't need to remember whether it was big or little endian
    // because the decoder will do that later. It will also eat the BOM for us.
    if (hasUtf16Bom(_rawBytes)) {
      return 'utf-16';
    }
    if (hasUtf32Bom(_rawBytes)) {
      return 'utf-32';
    }
    return null;
  }

  /// Report the encoding declared by the meta element.
  ///
  /// Only the first [numBytesMeta] bytes are handed to the [EncodingParser].
  /// A declared 'utf-16', 'utf-16-be' or 'utf-16-le' is reported as 'utf-8'.
  String detectEncodingMeta() {
    var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));
    var encoding = parser.getEncoding();

    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
      encoding = 'utf-8';
    }

    return encoding;
  }

  /// Returns the current offset in the stream, i.e. the number of codepoints
  /// since the start of the file.
  int get position => _offset;

  /// Read one character from the stream or queue if available. Return
  /// EOF when EOF is reached.
  String char() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCode(_chars[_offset++]);
  }

  /// Returns the next character without consuming it, or EOF when the end of
  /// the stream has been reached.
  String peekChar() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCode(_chars[_offset]);
  }

  /// Returns a string of characters from the stream up to but not
  /// including any character in 'characters' or EOF.
  ///
  /// When [opposite] is true the test is inverted: characters are consumed
  /// for as long as they are contained in [characters].
  String charsUntil(String characters, [bool opposite = false]) {
    final start = _offset;
    // Stop at the end of the buffer, which is where [peekChar] reports EOF.
    while (_offset < _chars.length &&
        characters.contains(new String.fromCharCode(_chars[_offset])) ==
            opposite) {
      _offset++;
    }

    // Build the result straight from the range, without copying a sublist.
    return new String.fromCharCodes(_chars, start, _offset);
  }

  /// Steps back one character so that [ch] is read again.
  ///
  /// A null [ch] is ignored, so the stream does not move.
  void unget(String ch) {
    // Only one character is allowed to be ungotten at once - it must
    // be consumed again before any further call to unget
    if (ch != null) {
      _offset--;
      assert(peekChar() == ch);
    }
  }
}
| 259 |
// TODO(jmesserly): the Python code used a regex to check for this. But
// Dart doesn't let you create a regexp with invalid characters.
/// Whether [c] is one of the code points this library reports as invalid.
///
/// That covers the C0 controls U+0001-U+0008, U+000B and U+000E-U+001F, the
/// range U+007F-U+009F, the surrogates U+D800-U+DFFF, U+FDD0-U+FDEF, and the
/// last two code points (U+xxFFFE and U+xxFFFF) of each plane up to U+10FFFF.
bool invalidUnicode(int c) {
  // Values outside 0..0x10FFFF are never reported.
  if (c < 0 || c > 0x10FFFF) return false;

  // U+FFFE / U+FFFF and their counterparts at the end of each higher plane.
  if ((c & 0xFFFF) >= 0xFFFE) return true;

  return c == 0x000B ||
      (c >= 0x0001 && c <= 0x0008) ||
      (c >= 0x000E && c <= 0x001F) ||
      (c >= 0x007F && c <= 0x009F) ||
      (c >= 0xD800 && c <= 0xDFFF) ||
      (c >= 0xFDD0 && c <= 0xFDEF);
}
| 308 |
/// Looks up the python codec name for [encoding].
///
/// ASCII whitespace and punctuation are stripped from [encoding] and the
/// remainder is lower-cased; the result is used as the key into [encodings].
/// Returns null when [encoding] is null, otherwise whatever that lookup
/// yields.
String codecName(String encoding) {
  if (encoding == null) return null;

  final strippable = new RegExp(
      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
  final stripped = encoding.replaceAll(strippable, '');
  final lookupKey = stripped.toLowerCase();
  return encodings[lookupKey];
}
OLD | NEW |