Index: pkg/third_party/html5lib/lib/src/inputstream.dart |
diff --git a/pkg/third_party/html5lib/lib/src/inputstream.dart b/pkg/third_party/html5lib/lib/src/inputstream.dart |
deleted file mode 100644 |
index 231fed0c5c259d4cecd040288665e7b2bb39c4b3..0000000000000000000000000000000000000000 |
--- a/pkg/third_party/html5lib/lib/src/inputstream.dart |
+++ /dev/null |
@@ -1,295 +0,0 @@ |
-library inputstream; |
- |
-import 'dart:collection'; |
-import 'package:utf/utf.dart'; |
-import 'package:source_span/source_span.dart'; |
-import 'char_encodings.dart'; |
-import 'constants.dart'; |
-import 'utils.dart'; |
-import 'encoding_parser.dart'; |
- |
-/// Indirection point for dart:io functionality, so that this library never |
-/// has to reference dart:io directly. |
-class ConsoleSupport { |
-  /// Yields the raw bytes for [source]. |
-  /// |
-  /// This base implementation always returns null. |
-  List<int> bytesFromFile(source) { |
-    return null; |
-  } |
-} |
- |
-/// The [ConsoleSupport] instance that [HtmlInputStream] asks for bytes when |
-/// it is handed a source that is neither a String nor a list of bytes. |
-/// |
-/// It is created eagerly with the base implementation and is not final, so |
-/// it can be reassigned (presumably by `useConsole()`, which the |
-/// [HtmlInputStream] error message mentions -- confirm in parser_console). |
-// TODO(jmesserly): use lazy init here when supported. |
-ConsoleSupport consoleSupport = new ConsoleSupport(); |
- |
-/// Provides a unicode stream of characters to the HtmlTokenizer. |
-/// |
-/// This class takes care of character encoding and removing or replacing |
-/// incorrect byte-sequences and also provides column and line tracking. |
-class HtmlInputStream { |
-  /// Number of bytes to use when looking for a meta element with |
-  /// encoding information. |
-  static const int numBytesMeta = 512; |
- |
-  /// Encoding to use if no other information can be found. |
-  static const String defaultEncoding = 'windows-1252'; |
- |
-  /// The name of the character encoding. |
-  String charEncodingName; |
- |
-  /// True if we are certain about [charEncodingName], false for tentative. |
-  bool charEncodingCertain = true; |
- |
-  /// Constructor flag. This class stores it but does not read it (see the |
-  /// TODO in [reset]). |
-  final bool generateSpans; |
- |
-  /// Location where the contents of the stream were found. |
-  final String sourceUrl; |
- |
-  /// Raw input bytes; stays null when the stream was built from a String. |
-  List<int> _rawBytes; |
- |
-  /// Decoded characters waiting to be normalized by [reset]. |
-  /// |
-  /// Set directly when a Dart String is passed in. Otherwise it is filled by |
-  /// decoding [_rawBytes] and released again at the end of [reset]. |
-  Iterable<int> _rawChars; |
- |
-  /// Errors recorded while [reset] normalizes the input. |
-  Queue<String> errors; |
- |
-  /// Source file built by [reset] over the normalized characters. |
-  SourceFile fileInfo; |
- |
-  /// Offsets into [_chars] at which each line starts. |
-  List<int> _lineStarts; |
- |
-  /// Normalized characters served by [char], [peekChar] and [charsUntil]. |
-  List<int> _chars; |
- |
-  /// Index into [_chars] of the next character to be read. |
-  int _offset; |
- |
-  /// Initialises the HtmlInputStream. |
-  /// |
-  /// HtmlInputStream(source, [encoding]) -> Normalized stream from source |
-  /// for use by html5lib. |
-  /// |
-  /// [source] can be either a [String] or a [List<int>] containing the raw |
-  /// bytes, or a file if [consoleSupport] is initialized. |
-  /// |
-  /// The optional encoding parameter must be a string that indicates |
-  /// the encoding. If specified, that encoding will be used, |
-  /// regardless of any BOM or later declaration (such as in a meta |
-  /// element) |
-  /// |
-  /// [parseMeta] - Look for a <meta> element containing encoding information |
-  /// |
-  /// Throws an [ArgumentError] when [source] is none of the supported kinds. |
-  HtmlInputStream(source, [String encoding, bool parseMeta = true, |
-        this.generateSpans = false, this.sourceUrl]) |
-      : charEncodingName = codecName(encoding) { |
- |
-    if (source is String) { |
-      // A String is already decoded, so the encoding is forced to utf-8 and |
-      // marked certain regardless of the [encoding] argument. |
-      _rawChars = toCodepoints(source); |
-      charEncodingName = 'utf-8'; |
-      charEncodingCertain = true; |
-    } else if (source is List<int>) { |
-      _rawBytes = source; |
-    } else { |
-      // TODO(jmesserly): it's unfortunate we need to read all bytes in advance, |
-      // but it's necessary because of how the UTF decoders work. |
-      _rawBytes = consoleSupport.bytesFromFile(source); |
- |
-      if (_rawBytes == null) { |
-        // TODO(jmesserly): we should accept some kind of stream API too. |
-        // Unfortunately dart:io InputStream is async only, which won't work. |
-        throw new ArgumentError("'source' must be a String or " |
-            "List<int> (of bytes). You can also pass a RandomAccessFile if you " |
-            "`import 'package:html5lib/parser_console.dart'` and call " |
-            "`useConsole()`."); |
-      } |
-    } |
- |
-    // Detect encoding iff no explicit "transport level" encoding is supplied |
-    if (charEncodingName == null) { |
-      detectEncoding(parseMeta); |
-    } |
- |
-    reset(); |
-  } |
- |
-  /// Rebuilds the normalized character buffer and rewinds to offset 0. |
-  /// |
-  /// Clears [errors], decodes [_rawBytes] with [charEncodingName] when no |
-  /// decoded characters are pending, and then copies the characters into |
-  /// [_chars] while: |
-  /// |
-  /// * recording 'invalid-codepoint' for every [invalidUnicode] character, |
-  /// * replacing surrogates (0xD800..0xDFFF) with 0xFFFD, |
-  /// * turning [RETURN] and [RETURN] followed by [NEWLINE] into one [NEWLINE]. |
-  /// |
-  /// Finally [fileInfo] is recreated over the normalized characters. |
-  void reset() { |
-    errors = new Queue<String>(); |
- |
-    _offset = 0; |
-    _lineStarts = <int>[0]; |
-    _chars = <int>[]; |
- |
-    if (_rawChars == null) { |
-      _rawChars = decodeBytes(charEncodingName, _rawBytes); |
-    } |
- |
-    // Set after a RETURN so that an immediately following NEWLINE is dropped. |
-    bool skipNewline = false; |
-    for (var c in _rawChars) { |
-      if (skipNewline) { |
-        skipNewline = false; |
-        if (c == NEWLINE) continue; |
-      } |
- |
-      if (invalidUnicode(c)) errors.add('invalid-codepoint'); |
- |
-      if (0xD800 <= c && c <= 0xDFFF) { |
-        c = 0xFFFD; |
-      } else if (c == RETURN) { |
-        skipNewline = true; |
-        c = NEWLINE; |
-      } |
- |
-      _chars.add(c); |
-      if (c == NEWLINE) _lineStarts.add(_chars.length); |
-    } |
- |
-    // Free decoded characters if they aren't needed anymore. They can be |
-    // recreated from the bytes; a String source has no bytes, so keep them. |
-    if (_rawBytes != null) _rawChars = null; |
- |
-    // TODO(sigmund): Don't parse the file at all if spans aren't being |
-    // generated. |
-    fileInfo = new SourceFile.decoded(_chars, url: sourceUrl); |
-  } |
- |
- |
-  /// Chooses [charEncodingName] from the raw bytes. |
-  /// |
-  /// Tries, in order: a byte order mark (certain), a meta element when |
-  /// [parseMeta] is true (tentative), and finally [defaultEncoding] |
-  /// (tentative). 'iso-8859-1' is replaced by 'windows-1252'. |
-  void detectEncoding([bool parseMeta = true]) { |
-    // First look for a BOM. detectBOM only inspects the bytes; skipping the |
-    // BOM itself is left to the decoder (see the note in detectBOM). |
-    charEncodingName = detectBOM(); |
-    charEncodingCertain = true; |
- |
-    // If there is no BOM need to look for meta elements with encoding |
-    // information |
-    if (charEncodingName == null && parseMeta) { |
-      charEncodingName = detectEncodingMeta(); |
-      charEncodingCertain = false; |
-    } |
-    // If all else fails use the default encoding |
-    if (charEncodingName == null) { |
-      charEncodingCertain = false; |
-      charEncodingName = defaultEncoding; |
-    } |
- |
-    // Substitute for equivalent encodings: |
-    if (charEncodingName.toLowerCase() == 'iso-8859-1') { |
-      charEncodingName = 'windows-1252'; |
-    } |
-  } |
- |
-  /// Switches the stream to [newEncoding]. |
-  /// |
-  /// The name is canonicalized with [codecName], and the utf-16 variants are |
-  /// mapped to 'utf-8'. An unrecognized name is ignored. If the result equals |
-  /// the current encoding it is only marked as certain. Otherwise the input |
-  /// is decoded again via [reset] and a [ReparseException] is thrown. |
-  /// |
-  /// Throws a [StateError] when the stream was created from a String. |
-  void changeEncoding(String newEncoding) { |
-    if (_rawBytes == null) { |
-      // We should never get here -- if encoding is certain we won't try to |
-      // change it. |
-      throw new StateError('cannot change encoding when parsing a String.'); |
-    } |
- |
-    newEncoding = codecName(newEncoding); |
-    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) { |
-      newEncoding = 'utf-8'; |
-    } |
-    if (newEncoding == null) { |
-      return; |
-    } else if (newEncoding == charEncodingName) { |
-      charEncodingCertain = true; |
-    } else { |
-      // Remember the old name before overwriting it, so the exception |
-      // message reports both sides of the change. |
-      var oldEncoding = charEncodingName; |
-      charEncodingName = newEncoding; |
-      charEncodingCertain = true; |
-      _rawChars = null; |
-      reset(); |
-      throw new ReparseException( |
-          'Encoding changed from $oldEncoding to $newEncoding'); |
-    } |
-  } |
- |
-  /// Attempts to detect a BOM at the start of the stream. If |
-  /// an encoding can be determined from the BOM return the name of the |
-  /// encoding otherwise return null. |
-  String detectBOM() { |
-    // Try detecting the BOM using bytes from the string |
-    if (hasUtf8Bom(_rawBytes)) { |
-      return 'utf-8'; |
-    } |
-    // Note: we don't need to remember whether it was big or little endian |
-    // because the decoder will do that later. It will also eat the BOM for us. |
-    if (hasUtf16Bom(_rawBytes)) { |
-      return 'utf-16'; |
-    } |
-    if (hasUtf32Bom(_rawBytes)) { |
-      return 'utf-32'; |
-    } |
-    return null; |
-  } |
- |
-  /// Report the encoding declared by the meta element. |
-  /// |
-  /// Only the first [numBytesMeta] bytes are handed to the [EncodingParser]. |
-  /// A declared utf-16 variant is reported as 'utf-8'. |
-  String detectEncodingMeta() { |
-    var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta)); |
-    var encoding = parser.getEncoding(); |
- |
-    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) { |
-      encoding = 'utf-8'; |
-    } |
- |
-    return encoding; |
-  } |
- |
-  /// Returns the current offset in the stream, i.e. the number of codepoints |
-  /// since the start of the file. |
-  int get position => _offset; |
- |
-  /// Read one character from the stream or queue if available. Return |
-  /// EOF when EOF is reached. |
-  String char() { |
-    if (_offset >= _chars.length) return EOF; |
-    return new String.fromCharCodes([_chars[_offset++]]); |
-  } |
- |
-  /// Returns the next character without consuming it, or EOF at the end. |
-  String peekChar() { |
-    if (_offset >= _chars.length) return EOF; |
-    return new String.fromCharCodes([_chars[_offset]]); |
-  } |
- |
-  /// Returns a string of characters from the stream up to but not |
-  /// including any character in 'characters' or EOF. |
-  /// |
-  /// When [opposite] is true the test is inverted: characters are consumed |
-  /// for as long as they are contained in [characters]. |
-  String charsUntil(String characters, [bool opposite = false]) { |
-    int start = _offset; |
-    String c; |
-    // NOTE(review): the end of input is detected by comparing with null, |
-    // which assumes EOF (declared outside this file) is null -- confirm. |
-    while ((c = peekChar()) != null && characters.contains(c) == opposite) { |
-      _offset++; |
-    } |
- |
-    return new String.fromCharCodes(_chars.sublist(start, _offset)); |
-  } |
- |
-  /// Pushes [ch] back by stepping the offset one character backwards. |
-  /// |
-  /// A null [ch] is ignored. An assert checks that [ch] is the character now |
-  /// at the read position. |
-  void unget(String ch) { |
-    // Only one character is allowed to be ungotten at once - it must |
-    // be consumed again before any further call to unget |
-    if (ch != null) { |
-      _offset--; |
-      assert(peekChar() == ch); |
-    } |
-  } |
-} |
- |
- |
-// TODO(jmesserly): the Python code used a regex to check for this. But |
-// Dart doesn't let you create a regexp with invalid characters. |
-/// Whether [c] belongs to one of the code point groups rejected here: |
-/// selected control characters, surrogates, or noncharacters. |
-bool invalidUnicode(int c) { |
-  // 0x01..0x08, 0x0B, 0x0E..0x1F and 0x7F..0x9F. |
-  bool isControl = (0x0001 <= c && c <= 0x0008) || |
-      c == 0x000B || |
-      (0x000E <= c && c <= 0x001F) || |
-      (0x007F <= c && c <= 0x009F); |
- |
-  bool isSurrogate = 0xD800 <= c && c <= 0xDFFF; |
- |
-  // 0xFDD0..0xFDEF, plus the last two values (xxFFFE and xxFFFF) of each of |
-  // the 17 planes 0x00..0x10. The range guard keeps negative values and |
-  // values above 0x10FFFF out of the mask test. |
-  bool isNoncharacter = (0xFDD0 <= c && c <= 0xFDEF) || |
-      (0 <= c && c <= 0x10FFFF && (c & 0xFFFE) == 0xFFFE); |
- |
-  return isControl || isSurrogate || isNoncharacter; |
-} |
- |
-/// Maps an encoding label to the python codec name stored for it. |
-/// |
-/// ASCII whitespace and punctuation are removed from [encoding] and the rest |
-/// is lower-cased; the result is looked up in [encodings]. Returns null when |
-/// [encoding] is null, otherwise whatever [encodings] holds for that name. |
-String codecName(String encoding) { |
-  if (encoding == null) return null; |
- |
-  final punctuation = new RegExp( |
-      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]"); |
-  final stripped = encoding.replaceAll(punctuation, ''); |
-  return encodings[stripped.toLowerCase()]; |
-} |