Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(834)

Unified Diff: mojo/public/dart/third_party/html/lib/src/inputstream.dart

Issue 1346773002: Stop running pub get at gclient sync time and fix build bugs (Closed) Base URL: git@github.com:domokit/mojo.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: mojo/public/dart/third_party/html/lib/src/inputstream.dart
diff --git a/mojo/public/dart/third_party/html/lib/src/inputstream.dart b/mojo/public/dart/third_party/html/lib/src/inputstream.dart
new file mode 100644
index 0000000000000000000000000000000000000000..cb40bc91ae0a794c6a33c6460cfce614dfc3975f
--- /dev/null
+++ b/mojo/public/dart/third_party/html/lib/src/inputstream.dart
@@ -0,0 +1,318 @@
+library inputstream;
+
+import 'dart:collection';
+import 'package:utf/utf.dart';
+import 'package:source_span/source_span.dart';
+import 'char_encodings.dart';
+import 'constants.dart';
+import 'utils.dart';
+import 'encoding_parser.dart';
+
+/// Indirection layer that lets this library reach dart:io functionality
+/// without importing dart:io itself.
+///
+/// The base implementation supports nothing: every hook reports "not
+/// available" by returning null.
+class ConsoleSupport {
+  /// Returns the bytes read from [source], or null when [source] cannot be
+  /// read by this implementation.
+  ///
+  /// This default version never reads anything and always yields null.
+  List<int> bytesFromFile(source) {
+    return null;
+  }
+}
+
+// TODO(jmesserly): use lazy init here when supported.
+/// The hooks currently in effect; [HtmlInputStream] consults this when it is
+/// handed a source that is neither a [String] nor a [List<int>].
+ConsoleSupport consoleSupport = new ConsoleSupport();
+
+/// Provides a unicode stream of characters to the HtmlTokenizer.
+///
+/// This class takes care of character encoding and removing or replacing
+/// incorrect byte-sequences and also provides column and line tracking.
+class HtmlInputStream {
+  /// Number of bytes to use when looking for a meta element with
+  /// encoding information.
+  static const int numBytesMeta = 512;
+
+  /// Encoding to use if no other information can be found.
+  static const String defaultEncoding = 'windows-1252';
+
+  /// The name of the character encoding.
+  String charEncodingName;
+
+  /// True if we are certain about [charEncodingName], false for tentative.
+  bool charEncodingCertain = true;
+
+  /// Whether the caller asked for source spans to be generated.
+  ///
+  /// This class only stores the flag; [reset] currently builds [fileInfo]
+  /// regardless of its value.
+  final bool generateSpans;
+
+  /// Location where the contents of the stream were found.
+  final String sourceUrl;
+
+  /// The undecoded input, or null when the source was a Dart [String].
+  List<int> _rawBytes;
+
+  /// Raw UTF-16 codes, used if a Dart String is passed in.
+  Iterable<int> _rawChars;
+
+  /// Error codes collected by the most recent [reset].
+  Queue<String> errors;
+
+  /// Source file built from the normalized characters by [reset].
+  SourceFile fileInfo;
+
+  /// Offsets into [_chars] at which each line begins; always starts with 0.
+  List<int> _lineStarts;
+
+  /// The normalized characters served by [char] and [peekChar].
+  List<int> _chars;
+
+  /// Index into [_chars] of the next character to be read.
+  int _offset;
+
+  /// Initialises the HtmlInputStream.
+  ///
+  /// HtmlInputStream(source, [encoding]) -> Normalized stream from source
+  /// for use by html5lib.
+  ///
+  /// [source] can be either a [String] or a [List<int>] containing the raw
+  /// bytes, or a file if [consoleSupport] is initialized.
+  ///
+  /// The optional encoding parameter must be a string that indicates
+  /// the encoding. If specified, that encoding will be used,
+  /// regardless of any BOM or later declaration (such as in a meta
+  /// element)
+  ///
+  /// [parseMeta] - Look for a <meta> element containing encoding information
+  ///
+  /// Throws an [ArgumentError] when [source] is none of the supported kinds.
+  HtmlInputStream(source, [String encoding, bool parseMeta = true,
+      this.generateSpans = false, this.sourceUrl])
+      : charEncodingName = codecName(encoding) {
+    if (source is String) {
+      // A Dart String is already decoded, so the encoding is not in doubt.
+      _rawChars = toCodepoints(source);
+      charEncodingName = 'utf-8';
+      charEncodingCertain = true;
+    } else if (source is List<int>) {
+      _rawBytes = source;
+    } else {
+      // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
+      // but it's necessary because of how the UTF decoders work.
+      _rawBytes = consoleSupport.bytesFromFile(source);
+
+      if (_rawBytes == null) {
+        // TODO(jmesserly): we should accept some kind of stream API too.
+        // Unfortunately dart:io InputStream is async only, which won't work.
+        throw new ArgumentError("'source' must be a String or "
+            "List<int> (of bytes). You can also pass a RandomAccessFile if you "
+            "`import 'package:html/parser_console.dart'` and call "
+            "`useConsole()`.");
+      }
+    }
+
+    // Detect encoding iff no explicit "transport level" encoding is supplied
+    if (charEncodingName == null) {
+      detectEncoding(parseMeta);
+    }
+
+    reset();
+  }
+
+  /// Rewinds the stream to offset 0 and rebuilds the normalized character
+  /// buffer, the line-start table, [errors] and [fileInfo].
+  ///
+  /// Bytes are decoded with the current [charEncodingName] unless decoded
+  /// characters are already available.
+  void reset() {
+    errors = new Queue<String>();
+
+    _offset = 0;
+    _lineStarts = <int>[0];
+    _chars = <int>[];
+
+    if (_rawChars == null) {
+      _rawChars = decodeBytes(charEncodingName, _rawBytes);
+    }
+
+    // Set right after a RETURN so that an immediately following NEWLINE is
+    // dropped and the pair collapses into a single NEWLINE.
+    bool skipNewline = false;
+    for (var c in _rawChars) {
+      if (skipNewline) {
+        skipNewline = false;
+        if (c == NEWLINE) continue;
+      }
+
+      if (invalidUnicode(c)) errors.add('invalid-codepoint');
+
+      if (0xD800 <= c && c <= 0xDFFF) {
+        // Surrogate code units are replaced with U+FFFD.
+        c = 0xFFFD;
+      } else if (c == RETURN) {
+        skipNewline = true;
+        c = NEWLINE;
+      }
+
+      _chars.add(c);
+      if (c == NEWLINE) _lineStarts.add(_chars.length);
+    }
+
+    // Free decoded characters if they aren't needed anymore.
+    if (_rawBytes != null) _rawChars = null;
+
+    // TODO(sigmund): Don't parse the file at all if spans aren't being
+    // generated.
+    fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);
+  }
+
+  /// Chooses [charEncodingName] from the raw bytes and records in
+  /// [charEncodingCertain] how trustworthy that choice is.
+  ///
+  /// A BOM yields a certain result. Otherwise a <meta> declaration (only
+  /// consulted when [parseMeta] is true) or, failing that,
+  /// [defaultEncoding] yields a tentative one.
+  void detectEncoding([bool parseMeta = true]) {
+    // First look for a BOM.
+    charEncodingName = detectBOM();
+    charEncodingCertain = true;
+
+    // If there is no BOM need to look for meta elements with encoding
+    // information
+    if (charEncodingName == null && parseMeta) {
+      charEncodingName = detectEncodingMeta();
+      charEncodingCertain = false;
+    }
+    // If all else fails use the default encoding
+    if (charEncodingName == null) {
+      charEncodingCertain = false;
+      charEncodingName = defaultEncoding;
+    }
+
+    // Substitute for equivalent encodings:
+    if (charEncodingName.toLowerCase() == 'iso-8859-1') {
+      charEncodingName = 'windows-1252';
+    }
+  }
+
+  /// Switches the stream to [newEncoding] after parsing has already begun.
+  ///
+  /// Unrecognised encodings are ignored. If [newEncoding] matches the
+  /// current encoding it is simply marked as certain. Otherwise the bytes
+  /// are re-decoded via [reset] and a [ReparseException] is thrown.
+  ///
+  /// Throws a [StateError] if the stream was created from a [String], since
+  /// there are no raw bytes to decode again.
+  void changeEncoding(String newEncoding) {
+    if (_rawBytes == null) {
+      // We should never get here -- if encoding is certain we won't try to
+      // change it.
+      throw new StateError('cannot change encoding when parsing a String.');
+    }
+
+    newEncoding = codecName(newEncoding);
+    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
+      newEncoding = 'utf-8';
+    }
+    if (newEncoding == null) {
+      return;
+    } else if (newEncoding == charEncodingName) {
+      charEncodingCertain = true;
+    } else {
+      // Remember the previous name before overwriting it so the exception
+      // message reports the actual transition.
+      var oldEncoding = charEncodingName;
+      charEncodingName = newEncoding;
+      charEncodingCertain = true;
+      _rawChars = null;
+      reset();
+      throw new ReparseException(
+          'Encoding changed from $oldEncoding to $newEncoding');
+    }
+  }
+
+  /// Attempts to detect a BOM at the start of the stream. If
+  /// an encoding can be determined from the BOM return the name of the
+  /// encoding otherwise return null.
+  String detectBOM() {
+    // Try detecting the BOM using bytes from the string
+    if (hasUtf8Bom(_rawBytes)) {
+      return 'utf-8';
+    }
+    // Note: we don't need to remember whether it was big or little endian
+    // because the decoder will do that later. It will also eat the BOM for us.
+    if (hasUtf16Bom(_rawBytes)) {
+      return 'utf-16';
+    }
+    if (hasUtf32Bom(_rawBytes)) {
+      return 'utf-32';
+    }
+    return null;
+  }
+
+  /// Report the encoding declared by the meta element.
+  ///
+  /// Only the first [numBytesMeta] bytes are scanned. A declared UTF-16
+  /// variant is reported as 'utf-8'. Returns whatever the [EncodingParser]
+  /// returns when that is not a UTF-16 variant.
+  String detectEncodingMeta() {
+    var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));
+    var encoding = parser.getEncoding();
+
+    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
+      encoding = 'utf-8';
+    }
+
+    return encoding;
+  }
+
+  /// Returns the current offset in the stream, i.e. the number of codepoints
+  /// since the start of the file.
+  int get position => _offset;
+
+  /// Read one character from the stream or queue if available. Return
+  /// EOF when EOF is reached.
+  String char() {
+    if (_offset >= _chars.length) return EOF;
+    return new String.fromCharCode(_chars[_offset++]);
+  }
+
+  /// Returns the next character without consuming it, or EOF when the end
+  /// of the stream has been reached.
+  String peekChar() {
+    if (_offset >= _chars.length) return EOF;
+    return new String.fromCharCode(_chars[_offset]);
+  }
+
+  /// Returns a string of characters from the stream up to but not
+  /// including any character in 'characters' or EOF.
+  ///
+  /// When [opposite] is true the test is inverted: characters are consumed
+  /// for as long as they ARE contained in [characters].
+  String charsUntil(String characters, [bool opposite = false]) {
+    int start = _offset;
+    String c;
+    // NOTE(review): the end-of-input test compares against null, so it
+    // relies on EOF being null -- confirm against constants.dart.
+    while ((c = peekChar()) != null && characters.contains(c) == opposite) {
+      _offset++;
+    }
+
+    return new String.fromCharCodes(_chars.sublist(start, _offset));
+  }
+
+  /// Pushes [ch] back onto the stream by stepping the offset back by one.
+  ///
+  /// Passing null is a no-op.
+  void unget(String ch) {
+    // Only one character is allowed to be ungotten at once - it must
+    // be consumed again before any further call to unget
+    if (ch != null) {
+      _offset--;
+      assert(peekChar() == ch);
+    }
+  }
+}
+
+// TODO(jmesserly): the Python code used a regex to check for this. But
+// Dart doesn't let you create a regexp with invalid characters.
+/// Whether [c] is one of the code points this library reports as invalid.
+///
+/// The flagged values are:
+///
+/// * U+0001..U+0008, U+000B, U+000E..U+001F and U+007F..U+009F;
+/// * the surrogate range U+D800..U+DFFF;
+/// * U+FDD0..U+FDEF;
+/// * the last two code points (xFFFE and xFFFF) of each plane from
+///   plane 0 through plane 0x10.
+///
+/// Everything else, including U+0000 and values outside 0..0x10FFFF,
+/// yields false.
+bool invalidUnicode(int c) {
+  final isFlaggedControl = (c >= 0x0001 && c <= 0x0008) ||
+      c == 0x000B ||
+      (c >= 0x000E && c <= 0x001F) ||
+      (c >= 0x007F && c <= 0x009F);
+  if (isFlaggedControl) return true;
+
+  final isSurrogate = c >= 0xD800 && c <= 0xDFFF;
+  if (isSurrogate) return true;
+
+  final isFdBlock = c >= 0xFDD0 && c <= 0xFDEF;
+  if (isFdBlock) return true;
+
+  // The low 16 bits are 0xFFFE or 0xFFFF exactly for the final two code
+  // points of a plane; the bounds restrict the check to planes 0..0x10.
+  return c >= 0xFFFE && c <= 0x10FFFF && (c & 0xFFFF) >= 0xFFFE;
+}
+
+/// Looks up the python codec name for [encoding].
+///
+/// The label is normalised before the lookup: every ASCII whitespace or
+/// punctuation character (U+0009..U+000D, U+0020..U+002F, U+003A..U+0040,
+/// U+005B..U+0060 and U+007B..U+007E) is removed and the remainder is
+/// lower-cased. The result is then used as a key into `encodings`.
+///
+/// Returns null when [encoding] is null or when the normalised label has
+/// no entry in `encodings`.
+String codecName(String encoding) {
+  if (encoding == null) return null;
+
+  final strippable = new RegExp(
+      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
+  final lookupKey = encoding.replaceAll(strippable, '').toLowerCase();
+  return encodings[lookupKey];
+}

Powered by Google App Engine
This is Rietveld 408576698