mojo/public/dart/third_party/html/lib/src/inputstream.dart - Issue 1346773002: Stop running pub get at gclient sync time and fix build bugs

Unified Diff: mojo/public/dart/third_party/html/lib/src/inputstream.dart

Issue 1346773002: Stop running pub get at gclient sync time and fix build bugs (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « mojo/public/dart/third_party/html/lib/src/encoding_parser.dart ('k') | mojo/public/dart/third_party/html/lib/src/list_proxy.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: mojo/public/dart/third_party/html/lib/src/inputstream.dart

diff --git a/mojo/public/dart/third_party/html/lib/src/inputstream.dart b/mojo/public/dart/third_party/html/lib/src/inputstream.dart

new file mode 100644

index 0000000000000000000000000000000000000000..cb40bc91ae0a794c6a33c6460cfce614dfc3975f

--- /dev/null

+++ b/mojo/public/dart/third_party/html/lib/src/inputstream.dart

@@ -0,0 +1,318 @@

+library inputstream;

+import 'dart:collection';

+import 'package:utf/utf.dart';

+import 'package:source_span/source_span.dart';

+import 'char_encodings.dart';

+import 'constants.dart';

+import 'utils.dart';

+import 'encoding_parser.dart';

+/// Indirection layer that lets this library call into dart:io without
+/// referencing it directly.
+///
+/// The default implementation provides no file access: [bytesFromFile]
+/// answers null for every source, which [HtmlInputStream] treats as
+/// "this kind of source is not supported".
+class ConsoleSupport {
+  /// Hook for turning [source] into its raw bytes; the base implementation
+  /// always returns null.
+  List<int> bytesFromFile(source) {
+    return null;
+  }
+}
+
+// TODO(jmesserly): use lazy init here when supported.
+ConsoleSupport consoleSupport = new ConsoleSupport();

+/// Provides a unicode stream of characters to the HtmlTokenizer.

+///

+/// This class takes care of character encoding and removing or replacing

+/// incorrect byte-sequences and also provides column and line tracking.

+class HtmlInputStream {

+ /// Number of bytes to use when looking for a meta element with

+ /// encoding information.

+ static const int numBytesMeta = 512;

+ /// Encoding to use if no other information can be found.

+ static const String defaultEncoding = 'windows-1252';

+ /// The name of the character encoding.

+ String charEncodingName;

+ /// True if we are certain about [charEncodingName], false for tentative.

+ bool charEncodingCertain = true;

+ final bool generateSpans;

+ /// Location where the contents of the stream were found.

+ final String sourceUrl;

+ List<int> _rawBytes;

+ /// Raw UTF-16 codes, used if a Dart String is passed in.

+ Iterable<int> _rawChars;

+ Queue<String> errors;

+ SourceFile fileInfo;

+ List<int> _lineStarts;

+ List<int> _chars;

+ int _offset;

+ /// Initialises the HtmlInputStream.
+ ///
+ /// HtmlInputStream(source, [encoding]) -> Normalized stream from source
+ /// for use by html5lib.
+ ///
+ /// [source] can be either a [String] or a [List<int>] containing the raw
+ /// bytes, or a file if [consoleSupport] is initialized. Any other value
+ /// is handed to [ConsoleSupport.bytesFromFile]; if that yields null an
+ /// [ArgumentError] is thrown.
+ ///
+ /// The optional [encoding] parameter must be a string that indicates
+ /// the encoding. If specified, that encoding will be used,
+ /// regardless of any BOM or later declaration (such as in a meta
+ /// element). When [source] is a [String] the encoding is always
+ /// recorded as 'utf-8', whatever [encoding] says.
+ ///
+ /// [parseMeta] - Look for a <meta> element containing encoding information
+ /// (only consulted when no usable [encoding] was supplied).
+ HtmlInputStream(source, [String encoding, bool parseMeta = true,
+     this.generateSpans = false, this.sourceUrl])
+     : charEncodingName = codecName(encoding) {
+   if (source is String) {
+     _rawChars = toCodepoints(source);
+     charEncodingName = 'utf-8';
+     charEncodingCertain = true;
+   } else if (source is List<int>) {
+     _rawBytes = source;
+   } else {
+     // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
+     // but it's necessary because of how the UTF decoders work.
+     _rawBytes = consoleSupport.bytesFromFile(source);
+     if (_rawBytes == null) {
+       // TODO(jmesserly): we should accept some kind of stream API too.
+       // Unfortunately dart:io InputStream is async only, which won't work.
+       //
+       // Each fragment ends with a space so the adjacent literals join into
+       // readable sentences.
+       throw new ArgumentError("'source' must be a String or "
+           "List<int> (of bytes). You can also pass a RandomAccessFile if you "
+           "`import 'package:html/parser_console.dart'` and call "
+           "`useConsole()`.");
+     }
+   }
+   // Detect encoding iff no explicit "transport level" encoding is supplied
+   if (charEncodingName == null) {
+     detectEncoding(parseMeta);
+   }
+   reset();
+ }

+ /// Rebuilds the normalized character buffer from the raw input and
+ /// rewinds the stream to offset 0.
+ ///
+ /// The error queue, line-start table and character list are recreated.
+ /// While copying, a [RETURN] (and a [NEWLINE] directly following it) is
+ /// collapsed into a single [NEWLINE], code points in 0xD800..0xDFFF are
+ /// replaced by 0xFFFD, and every code point rejected by [invalidUnicode]
+ /// adds an 'invalid-codepoint' entry to [errors].
+ void reset() {
+   errors = new Queue<String>();
+   _offset = 0;
+   _lineStarts = <int>[0];
+   _chars = <int>[];
+   // Byte input is decoded lazily, using whatever encoding is current.
+   if (_rawChars == null) _rawChars = decodeBytes(charEncodingName, _rawBytes);
+   var previousWasReturn = false;
+   for (var codePoint in _rawChars) {
+     final followsReturn = previousWasReturn;
+     previousWasReturn = false;
+     // The NEWLINE half of a RETURN/NEWLINE pair was already emitted.
+     if (followsReturn && codePoint == NEWLINE) continue;
+     if (invalidUnicode(codePoint)) errors.add('invalid-codepoint');
+     var normalized = codePoint;
+     if (0xD800 <= codePoint && codePoint <= 0xDFFF) {
+       normalized = 0xFFFD;
+     } else if (codePoint == RETURN) {
+       previousWasReturn = true;
+       normalized = NEWLINE;
+     }
+     _chars.add(normalized);
+     // A new line starts right after the NEWLINE that was just stored.
+     if (normalized == NEWLINE) _lineStarts.add(_chars.length);
+   }
+   // Free decoded characters if they aren't needed anymore; they can be
+   // re-decoded from the bytes on the next reset.
+   if (_rawBytes != null) _rawChars = null;
+   // TODO(sigmund): Don't parse the file at all if spans aren't being
+   // generated.
+   fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);
+ }

+ /// Works out [charEncodingName] when no explicit encoding was supplied.
+ ///
+ /// Sources are tried in order: a byte-order mark (result is certain), a
+ /// meta declaration when [parseMeta] is true (result is tentative), and
+ /// finally [defaultEncoding] (tentative). A result of 'iso-8859-1', in
+ /// any letter case, is replaced by 'windows-1252'.
+ void detectEncoding([bool parseMeta = true]) {
+   // A BOM, when present, settles the question.
+   charEncodingName = detectBOM();
+   charEncodingCertain = true;
+   // Without a BOM, fall back to a meta element carrying encoding
+   // information; that answer is only a guess.
+   final bomMissing = charEncodingName == null;
+   if (bomMissing && parseMeta) {
+     charEncodingName = detectEncodingMeta();
+     charEncodingCertain = false;
+   }
+   // Last resort: the default encoding.
+   final stillUnknown = charEncodingName == null;
+   if (stillUnknown) {
+     charEncodingCertain = false;
+     charEncodingName = defaultEncoding;
+   }
+   // Substitute for equivalent encodings:
+   final lowered = charEncodingName.toLowerCase();
+   if (lowered == 'iso-8859-1') {
+     charEncodingName = 'windows-1252';
+   }
+ }

+ /// Switches the stream to [newEncoding] after a later declaration was
+ /// found in the document.
+ ///
+ /// [newEncoding] is first canonicalized with [codecName]; an unknown name
+ /// is ignored. If it matches the current encoding, the encoding simply
+ /// becomes certain. Otherwise the new encoding is adopted, the buffer is
+ /// rebuilt with [reset], and a [ReparseException] is thrown so the caller
+ /// can start over.
+ ///
+ /// Throws a [StateError] when the stream was created from a [String],
+ /// because there are no raw bytes to decode again.
+ void changeEncoding(String newEncoding) {
+   if (_rawBytes == null) {
+     // We should never get here -- if encoding is certain we won't try to
+     // change it.
+     throw new StateError('cannot change encoding when parsing a String.');
+   }
+   newEncoding = codecName(newEncoding);
+   // NOTE(review): UTF-16 variants are mapped to utf-8, presumably because a
+   // declaration that was readable in the current decoding cannot really
+   // be UTF-16 -- confirm against the HTML parsing spec.
+   if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
+     newEncoding = 'utf-8';
+   }
+   if (newEncoding == null) {
+     return;
+   }
+   if (newEncoding == charEncodingName) {
+     charEncodingCertain = true;
+     return;
+   }
+   // Remember the old name before overwriting it, so the exception message
+   // reports the actual transition instead of the new name twice.
+   final previousEncoding = charEncodingName;
+   charEncodingName = newEncoding;
+   charEncodingCertain = true;
+   _rawChars = null;
+   reset();
+   throw new ReparseException(
+       'Encoding changed from $previousEncoding to $newEncoding');
+ }

+ /// Attempts to detect a BOM at the start of the raw bytes.
+ ///
+ /// Returns 'utf-8', 'utf-16' or 'utf-32' according to the first check
+ /// that matches (tried in that order), or null when none matches.
+ String detectBOM() {
+   // Note: we don't need to remember whether UTF-16 was big or little endian
+   // because the decoder will do that later. It will also eat the BOM for us.
+   return hasUtf8Bom(_rawBytes)
+       ? 'utf-8'
+       : hasUtf16Bom(_rawBytes)
+           ? 'utf-16'
+           : hasUtf32Bom(_rawBytes) ? 'utf-32' : null;
+ }

+ /// Reports the encoding declared by a meta element, if any.
+ ///
+ /// The slice of the raw bytes from 0 to [numBytesMeta] is handed to an
+ /// [EncodingParser]. A result of 'utf-16', 'utf-16-be' or 'utf-16-le' is
+ /// reported as 'utf-8'; anything else is returned unchanged.
+ String detectEncodingMeta() {
+   final prefix = slice(_rawBytes, 0, numBytesMeta);
+   final declared = new EncodingParser(prefix).getEncoding();
+   const utf16Names = const ['utf-16', 'utf-16-be', 'utf-16-le'];
+   return utf16Names.contains(declared) ? 'utf-8' : declared;
+ }

+ /// The current offset in the stream, i.e. how many entries of the
+ /// normalized character buffer have been consumed so far.
+ int get position {
+   return _offset;
+ }

+ /// Consumes and returns the next character of the stream as a string.
+ ///
+ /// Returns [EOF] once the end of the buffer is reached; the offset is not
+ /// advanced in that case.
+ String char() {
+   if (_offset < _chars.length) {
+     final code = _chars[_offset];
+     _offset++;
+     return new String.fromCharCodes(<int>[code]);
+   }
+   return EOF;
+ }

+ /// Returns the next character of the stream without consuming it, or
+ /// [EOF] when the end of the buffer has been reached.
+ String peekChar() {
+   final atEnd = _offset >= _chars.length;
+   return atEnd ? EOF : new String.fromCharCodes(<int>[_chars[_offset]]);
+ }

+ /// Consumes and returns a run of characters from the stream.
+ ///
+ /// By default the run stops before the first character contained in
+ /// [characters]. With [opposite] set to true the test is inverted: the run
+ /// continues only while the next character is contained in [characters].
+ /// The run also ends when [peekChar] returns null.
+ String charsUntil(String characters, [bool opposite = false]) {
+   final begin = _offset;
+   while (true) {
+     final next = peekChar();
+     if (next == null) break;
+     if (characters.contains(next) != opposite) break;
+     _offset++;
+   }
+   return new String.fromCharCodes(_chars.sublist(begin, _offset));
+ }

+ /// Pushes [ch] back onto the stream by stepping the offset back one
+ /// position. A null [ch] is ignored.
+ ///
+ /// Only one character is allowed to be ungotten at once - it must be
+ /// consumed again before any further call to unget. In checked mode an
+ /// assertion verifies that [ch] is the character now at the offset.
+ void unget(String ch) {
+   if (ch == null) return;
+   _offset--;
+   assert(peekChar() == ch);
+ }
+}

+// TODO(jmesserly): the Python code used a regex to check for this. But
+// Dart doesn't let you create a regexp with invalid characters.
+/// Whether the code point [c] is one the input stream reports as invalid.
+///
+/// That is the case for 0x01..0x08, 0x0B, 0x0E..0x1F, 0x7F..0x9F,
+/// 0xD800..0xDFFF, 0xFDD0..0xFDEF, and the last two code points of every
+/// 0x10000-sized block from 0xFFFE up to 0x10FFFF.
+bool invalidUnicode(int c) {
+  final inInvalidRange = (0x0001 <= c && c <= 0x0008) ||
+      (0x000E <= c && c <= 0x001F) ||
+      (0x007F <= c && c <= 0x009F) ||
+      (0xD800 <= c && c <= 0xDFFF) ||
+      (0xFDD0 <= c && c <= 0xFDEF);
+  if (inInvalidRange || c == 0x000B) return true;
+  // xxFFFE / xxFFFF for xx in 0x00..0x10: inside the range, the low sixteen
+  // bits with the lowest bit masked off must equal 0xFFFE.
+  return 0xFFFE <= c && c <= 0x10FFFF && (c & 0xFFFE) == 0xFFFE;
+}

+/// Return the python codec name corresponding to an encoding or null if the

+/// string doesn't correspond to a valid encoding.

+String codecName(String encoding) {

+ final asciiPunctuation = new RegExp(

+ "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");

+ if (encoding == null) return null;

+ var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase();

+ return encodings[canonicalName];