mojo/public/dart/third_party/html/lib/src/inputstream.dart - Issue 1346773002: Stop running pub get at gclient sync time and fix build bugs

Side by Side Diff: mojo/public/dart/third_party/html/lib/src/inputstream.dart

Issue 1346773002: Stop running pub get at gclient sync time and fix build bugs (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « mojo/public/dart/third_party/html/lib/src/encoding_parser.dart ('k') | mojo/public/dart/third_party/html/lib/src/list_proxy.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 library inputstream;

	2

	3 import 'dart:collection';

	4 import 'package:utf/utf.dart';

	5 import 'package:source_span/source_span.dart';

	6 import 'char_encodings.dart';

	7 import 'constants.dart';

	8 import 'utils.dart';

	9 import 'encoding_parser.dart';

	10

	11 /// Hooks to call into dart:io without directly referencing it.

	12 class ConsoleSupport {

	13 List<int> bytesFromFile(source) => null;

	14 }

	15

	16 // TODO(jmesserly): use lazy init here when supported.

	17 ConsoleSupport consoleSupport = new ConsoleSupport();

	18

	19 /// Provides a unicode stream of characters to the HtmlTokenizer.

	20 ///

	21 /// This class takes care of character encoding and removing or replacing

	22 /// incorrect byte-sequences and also provides column and line tracking.

	23 class HtmlInputStream {

	24 /// Number of bytes to use when looking for a meta element with

	25 /// encoding information.

	26 static const int numBytesMeta = 512;

	27

	28 /// Encoding to use if no other information can be found.

	29 static const String defaultEncoding = 'windows-1252';

	30

	31 /// The name of the character encoding.

	32 String charEncodingName;

	33

	34 /// True if we are certain about [charEncodingName], false for tenative.

	35 bool charEncodingCertain = true;

	36

	37 final bool generateSpans;

	38

	39 /// Location where the contents of the stream were found.

	40 final String sourceUrl;

	41

	42 List<int> _rawBytes;

	43

	44 /// Raw UTF-16 codes, used if a Dart String is passed in.

	45 Iterable<int> _rawChars;

	46

	47 Queue<String> errors;

	48

	49 SourceFile fileInfo;

	50

	51 List<int> _lineStarts;

	52

	53 List<int> _chars;

	54

	55 int _offset;

	56

	57 /// Initialises the HtmlInputStream.

	58 ///

	59 /// HtmlInputStream(source, [encoding]) -> Normalized stream from source

	60 /// for use by html5lib.

	61 ///

	62 /// [source] can be either a [String] or a [List<int>] containing the raw

	63 /// bytes, or a file if [consoleSupport] is initialized.

	64 ///

	65 /// The optional encoding parameter must be a string that indicates

	66 /// the encoding. If specified, that encoding will be used,

	67 /// regardless of any BOM or later declaration (such as in a meta

	68 /// element)

	69 ///

	70 /// [parseMeta] - Look for a <meta> element containing encoding information

	71 HtmlInputStream(source, [String encoding, bool parseMeta = true,

	72 this.generateSpans = false, this.sourceUrl])

	73 : charEncodingName = codecName(encoding) {

	74 if (source is String) {

	75 _rawChars = toCodepoints(source);

	76 charEncodingName = 'utf-8';

	77 charEncodingCertain = true;

	78 } else if (source is List<int>) {

	79 _rawBytes = source;

	80 } else {

	81 // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,

	82 // but it's necessary because of how the UTF decoders work.

	83 _rawBytes = consoleSupport.bytesFromFile(source);

	84

	85 if (_rawBytes == null) {

	86 // TODO(jmesserly): we should accept some kind of stream API too.

	87 // Unfortunately dart:io InputStream is async only, which won't work.

	88 throw new ArgumentError("'source' must be a String or "

	89 "List<int> (of bytes). You can also pass a RandomAccessFile if you"

	90 "`import 'package:html/parser_console.dart'` and call "

	91 "`useConsole()`.");

	92 }

	93 }

	94

	95 // Detect encoding iff no explicit "transport level" encoding is supplied

	96 if (charEncodingName == null) {

	97 detectEncoding(parseMeta);

	98 }

	99

	100 reset();

	101 }

	102

	103 void reset() {

	104 errors = new Queue<String>();

	105

	106 _offset = 0;

	107 _lineStarts = <int>[0];

	108 _chars = <int>[];

	109

	110 if (_rawChars == null) {

	111 _rawChars = decodeBytes(charEncodingName, _rawBytes);

	112 }

	113

	114 bool skipNewline = false;

	115 for (var c in _rawChars) {

	116 if (skipNewline) {

	117 skipNewline = false;

	118 if (c == NEWLINE) continue;

	119 }

	120

	121 if (invalidUnicode(c)) errors.add('invalid-codepoint');

	122

	123 if (0xD800 <= c && c <= 0xDFFF) {

	124 c = 0xFFFD;

	125 } else if (c == RETURN) {

	126 skipNewline = true;

	127 c = NEWLINE;

	128 }

	129

	130 _chars.add(c);

	131 if (c == NEWLINE) _lineStarts.add(_chars.length);

	132 }

	133

	134 // Free decoded characters if they aren't needed anymore.

	135 if (_rawBytes != null) _rawChars = null;

	136

	137 // TODO(sigmund): Don't parse the file at all if spans aren't being

	138 // generated.

	139 fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);

	140 }

	141

	142 void detectEncoding([bool parseMeta = true]) {

	143 // First look for a BOM

	144 // This will also read past the BOM if present

	145 charEncodingName = detectBOM();

	146 charEncodingCertain = true;

	147

	148 // If there is no BOM need to look for meta elements with encoding

	149 // information

	150 if (charEncodingName == null && parseMeta) {

	151 charEncodingName = detectEncodingMeta();

	152 charEncodingCertain = false;

	153 }

	154 // If all else fails use the default encoding

	155 if (charEncodingName == null) {

	156 charEncodingCertain = false;

	157 charEncodingName = defaultEncoding;

	158 }

	159

	160 // Substitute for equivalent encodings:

	161 if (charEncodingName.toLowerCase() == 'iso-8859-1') {

	162 charEncodingName = 'windows-1252';

	163 }

	164 }

	165

	166 void changeEncoding(String newEncoding) {

	167 if (_rawBytes == null) {

	168 // We should never get here -- if encoding is certain we won't try to

	169 // change it.

	170 throw new StateError('cannot change encoding when parsing a String.');

	171 }

	172

	173 newEncoding = codecName(newEncoding);

	174 if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {

	175 newEncoding = 'utf-8';

	176 }

	177 if (newEncoding == null) {

	178 return;

	179 } else if (newEncoding == charEncodingName) {

	180 charEncodingCertain = true;

	181 } else {

	182 charEncodingName = newEncoding;

	183 charEncodingCertain = true;

	184 _rawChars = null;

	185 reset();

	186 throw new ReparseException(

	187 'Encoding changed from $charEncodingName to $newEncoding');

	188 }

	189 }

	190

	191 /// Attempts to detect at BOM at the start of the stream. If

	192 /// an encoding can be determined from the BOM return the name of the

	193 /// encoding otherwise return null.

	194 String detectBOM() {

	195 // Try detecting the BOM using bytes from the string

	196 if (hasUtf8Bom(_rawBytes)) {

	197 return 'utf-8';

	198 }

	199 // Note: we don't need to remember whether it was big or little endian

	200 // because the decoder will do that later. It will also eat the BOM for us.

	201 if (hasUtf16Bom(_rawBytes)) {

	202 return 'utf-16';

	203 }

	204 if (hasUtf32Bom(_rawBytes)) {

	205 return 'utf-32';

	206 }

	207 return null;

	208 }

	209

	210 /// Report the encoding declared by the meta element.

	211 String detectEncodingMeta() {

	212 var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));

	213 var encoding = parser.getEncoding();

	214

	215 if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {

	216 encoding = 'utf-8';

	217 }

	218

	219 return encoding;

	220 }

	221

	222 /// Returns the current offset in the stream, i.e. the number of codepoints

	223 /// since the start of the file.

	224 int get position => _offset;

	225

	226 /// Read one character from the stream or queue if available. Return

	227 /// EOF when EOF is reached.

	228 String char() {

	229 if (_offset >= _chars.length) return EOF;

	230 return new String.fromCharCodes([_chars[_offset++]]);

	231 }

	232

	233 String peekChar() {

	234 if (_offset >= _chars.length) return EOF;

	235 return new String.fromCharCodes([_chars[_offset]]);

	236 }

	237

	238 /// Returns a string of characters from the stream up to but not

	239 /// including any character in 'characters' or EOF.

	240 String charsUntil(String characters, [bool opposite = false]) {

	241 int start = _offset;

	242 String c;

	243 while ((c = peekChar()) != null && characters.contains(c) == opposite) {

	244 _offset++;

	245 }

	246

	247 return new String.fromCharCodes(_chars.sublist(start, _offset));

	248 }

	249

	250 void unget(String ch) {

	251 // Only one character is allowed to be ungotten at once - it must

	252 // be consumed again before any further call to unget

	253 if (ch != null) {

	254 _offset--;

	255 assert(peekChar() == ch);

	256 }

	257 }

	258 }

	259

	260 // TODO(jmesserly): the Python code used a regex to check for this. But

	261 // Dart doesn't let you create a regexp with invalid characters.

	262 bool invalidUnicode(int c) {

	263 if (0x0001 <= c && c <= 0x0008) return true;

	264 if (0x000E <= c && c <= 0x001F) return true;

	265 if (0x007F <= c && c <= 0x009F) return true;

	266 if (0xD800 <= c && c <= 0xDFFF) return true;

	267 if (0xFDD0 <= c && c <= 0xFDEF) return true;

	268 switch (c) {

	269 case 0x000B:

	270 case 0xFFFE:

	271 case 0xFFFF:

	272 case 0x01FFFE:

	273 case 0x01FFFF:

	274 case 0x02FFFE:

	275 case 0x02FFFF:

	276 case 0x03FFFE:

	277 case 0x03FFFF:

	278 case 0x04FFFE:

	279 case 0x04FFFF:

	280 case 0x05FFFE:

	281 case 0x05FFFF:

	282 case 0x06FFFE:

	283 case 0x06FFFF:

	284 case 0x07FFFE:

	285 case 0x07FFFF:

	286 case 0x08FFFE:

	287 case 0x08FFFF:

	288 case 0x09FFFE:

	289 case 0x09FFFF:

	290 case 0x0AFFFE:

	291 case 0x0AFFFF:

	292 case 0x0BFFFE:

	293 case 0x0BFFFF:

	294 case 0x0CFFFE:

	295 case 0x0CFFFF:

	296 case 0x0DFFFE:

	297 case 0x0DFFFF:

	298 case 0x0EFFFE:

	299 case 0x0EFFFF:

	300 case 0x0FFFFE:

	301 case 0x0FFFFF:

	302 case 0x10FFFE:

	303 case 0x10FFFF:

	304 return true;

	305 }

	306 return false;

	307 }

	308

	309 /// Return the python codec name corresponding to an encoding or null if the

	310 /// string doesn't correspond to a valid encoding.

	311 String codecName(String encoding) {

	312 final asciiPunctuation = new RegExp(

	313 "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");

	314

	315 if (encoding == null) return null;

	316 var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase();

	317 return encodings[canonicalName];

	318 }

OLD	NEW