observatory_pub_packages/html5lib/src/inputstream.dart - Issue 816693004: Add observatory_pub_packages snapshot to third_party

Side by Side Diff: observatory_pub_packages/html5lib/src/inputstream.dart

Issue 816693004: Add observatory_pub_packages snapshot to third_party (Closed) Base URL: http://dart.googlecode.com/svn/third_party/

Patch Set: Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 library inputstream;

	2

	3 import 'dart:collection';

	4 import 'package:utf/utf.dart';

	5 import 'package:source_span/source_span.dart';

	6 import 'char_encodings.dart';

	7 import 'constants.dart';

	8 import 'utils.dart';

	9 import 'encoding_parser.dart';

	10

	11 /// Hooks to call into dart:io without directly referencing it.

	12 class ConsoleSupport {

	13 List<int> bytesFromFile(source) => null;

	14 }

	15

	16 // TODO(jmesserly): use lazy init here when supported.

	17 ConsoleSupport consoleSupport = new ConsoleSupport();

	18

	19 /// Provides a unicode stream of characters to the HtmlTokenizer.

	20 ///

	21 /// This class takes care of character encoding and removing or replacing

	22 /// incorrect byte-sequences and also provides column and line tracking.

	23 class HtmlInputStream {

	24 /// Number of bytes to use when looking for a meta element with

	25 /// encoding information.

	26 static const int numBytesMeta = 512;

	27

	28 /// Encoding to use if no other information can be found.

	29 static const String defaultEncoding = 'windows-1252';

	30

	31 /// The name of the character encoding.

	32 String charEncodingName;

	33

	34 /// True if we are certain about [charEncodingName], false for tenative.

	35 bool charEncodingCertain = true;

	36

	37 final bool generateSpans;

	38

	39 /// Location where the contents of the stream were found.

	40 final String sourceUrl;

	41

	42 List<int> _rawBytes;

	43

	44 /// Raw UTF-16 codes, used if a Dart String is passed in.

	45 Iterable<int> _rawChars;

	46

	47 Queue<String> errors;

	48

	49 SourceFile fileInfo;

	50

	51 List<int> _lineStarts;

	52

	53 List<int> _chars;

	54

	55 int _offset;

	56

	57 /// Initialises the HtmlInputStream.

	58 ///

	59 /// HtmlInputStream(source, [encoding]) -> Normalized stream from source

	60 /// for use by html5lib.

	61 ///

	62 /// [source] can be either a [String] or a [List<int>] containing the raw

	63 /// bytes, or a file if [consoleSupport] is initialized.

	64 ///

	65 /// The optional encoding parameter must be a string that indicates

	66 /// the encoding. If specified, that encoding will be used,

	67 /// regardless of any BOM or later declaration (such as in a meta

	68 /// element)

	69 ///

	70 /// [parseMeta] - Look for a <meta> element containing encoding information

	71 HtmlInputStream(source, [String encoding, bool parseMeta = true,

	72 this.generateSpans = false, this.sourceUrl])

	73 : charEncodingName = codecName(encoding) {

	74

	75 if (source is String) {

	76 _rawChars = toCodepoints(source);

	77 charEncodingName = 'utf-8';

	78 charEncodingCertain = true;

	79 } else if (source is List<int>) {

	80 _rawBytes = source;

	81 } else {

	82 // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,

	83 // but it's necessary because of how the UTF decoders work.

	84 _rawBytes = consoleSupport.bytesFromFile(source);

	85

	86 if (_rawBytes == null) {

	87 // TODO(jmesserly): we should accept some kind of stream API too.

	88 // Unfortunately dart:io InputStream is async only, which won't work.

	89 throw new ArgumentError("'source' must be a String or "

	90 "List<int> (of bytes). You can also pass a RandomAccessFile if you"

	91 "`import 'package:html5lib/parser_console.dart'` and call "

	92 "`useConsole()`.");

	93 }

	94 }

	95

	96 // Detect encoding iff no explicit "transport level" encoding is supplied

	97 if (charEncodingName == null) {

	98 detectEncoding(parseMeta);

	99 }

	100

	101 reset();

	102 }

	103

	104 void reset() {

	105 errors = new Queue<String>();

	106

	107 _offset = 0;

	108 _lineStarts = <int>[0];

	109 _chars = <int>[];

	110

	111 if (_rawChars == null) {

	112 _rawChars = decodeBytes(charEncodingName, _rawBytes);

	113 }

	114

	115 bool skipNewline = false;

	116 for (var c in _rawChars) {

	117 if (skipNewline) {

	118 skipNewline = false;

	119 if (c == NEWLINE) continue;

	120 }

	121

	122 if (invalidUnicode(c)) errors.add('invalid-codepoint');

	123

	124 if (0xD800 <= c && c <= 0xDFFF) {

	125 c = 0xFFFD;

	126 } else if (c == RETURN) {

	127 skipNewline = true;

	128 c = NEWLINE;

	129 }

	130

	131 _chars.add(c);

	132 if (c == NEWLINE) _lineStarts.add(_chars.length);

	133 }

	134

	135 // Free decoded characters if they aren't needed anymore.

	136 if (_rawBytes != null) _rawChars = null;

	137

	138 // TODO(sigmund): Don't parse the file at all if spans aren't being

	139 // generated.

	140 fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);

	141 }

	142

	143

	144 void detectEncoding([bool parseMeta = true]) {

	145 // First look for a BOM

	146 // This will also read past the BOM if present

	147 charEncodingName = detectBOM();

	148 charEncodingCertain = true;

	149

	150 // If there is no BOM need to look for meta elements with encoding

	151 // information

	152 if (charEncodingName == null && parseMeta) {

	153 charEncodingName = detectEncodingMeta();

	154 charEncodingCertain = false;

	155 }

	156 // If all else fails use the default encoding

	157 if (charEncodingName == null) {

	158 charEncodingCertain = false;

	159 charEncodingName = defaultEncoding;

	160 }

	161

	162 // Substitute for equivalent encodings:

	163 if (charEncodingName.toLowerCase() == 'iso-8859-1') {

	164 charEncodingName = 'windows-1252';

	165 }

	166 }

	167

	168 void changeEncoding(String newEncoding) {

	169 if (_rawBytes == null) {

	170 // We should never get here -- if encoding is certain we won't try to

	171 // change it.

	172 throw new StateError('cannot change encoding when parsing a String.');

	173 }

	174

	175 newEncoding = codecName(newEncoding);

	176 if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {

	177 newEncoding = 'utf-8';

	178 }

	179 if (newEncoding == null) {

	180 return;

	181 } else if (newEncoding == charEncodingName) {

	182 charEncodingCertain = true;

	183 } else {

	184 charEncodingName = newEncoding;

	185 charEncodingCertain = true;

	186 _rawChars = null;

	187 reset();

	188 throw new ReparseException(

	189 'Encoding changed from $charEncodingName to $newEncoding');

	190 }

	191 }

	192

	193 /// Attempts to detect at BOM at the start of the stream. If

	194 /// an encoding can be determined from the BOM return the name of the

	195 /// encoding otherwise return null.

	196 String detectBOM() {

	197 // Try detecting the BOM using bytes from the string

	198 if (hasUtf8Bom(_rawBytes)) {

	199 return 'utf-8';

	200 }

	201 // Note: we don't need to remember whether it was big or little endian

	202 // because the decoder will do that later. It will also eat the BOM for us.

	203 if (hasUtf16Bom(_rawBytes)) {

	204 return 'utf-16';

	205 }

	206 if (hasUtf32Bom(_rawBytes)) {

	207 return 'utf-32';

	208 }

	209 return null;

	210 }

	211

	212 /// Report the encoding declared by the meta element.

	213 String detectEncodingMeta() {

	214 var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));

	215 var encoding = parser.getEncoding();

	216

	217 if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {

	218 encoding = 'utf-8';

	219 }

	220

	221 return encoding;

	222 }

	223

	224 /// Returns the current offset in the stream, i.e. the number of codepoints

	225 /// since the start of the file.

	226 int get position => _offset;

	227

	228 /// Read one character from the stream or queue if available. Return

	229 /// EOF when EOF is reached.

	230 String char() {

	231 if (_offset >= _chars.length) return EOF;

	232 return new String.fromCharCodes([_chars[_offset++]]);

	233 }

	234

	235 String peekChar() {

	236 if (_offset >= _chars.length) return EOF;

	237 return new String.fromCharCodes([_chars[_offset]]);

	238 }

	239

	240 /// Returns a string of characters from the stream up to but not

	241 /// including any character in 'characters' or EOF.

	242 String charsUntil(String characters, [bool opposite = false]) {

	243 int start = _offset;

	244 String c;

	245 while ((c = peekChar()) != null && characters.contains(c) == opposite) {

	246 _offset++;

	247 }

	248

	249 return new String.fromCharCodes(_chars.sublist(start, _offset));

	250 }

	251

	252 void unget(String ch) {

	253 // Only one character is allowed to be ungotten at once - it must

	254 // be consumed again before any further call to unget

	255 if (ch != null) {

	256 _offset--;

	257 assert(peekChar() == ch);

	258 }

	259 }

	260 }

	261

	262

	263 // TODO(jmesserly): the Python code used a regex to check for this. But

	264 // Dart doesn't let you create a regexp with invalid characters.

	265 bool invalidUnicode(int c) {

	266 if (0x0001 <= c && c <= 0x0008) return true;

	267 if (0x000E <= c && c <= 0x001F) return true;

	268 if (0x007F <= c && c <= 0x009F) return true;

	269 if (0xD800 <= c && c <= 0xDFFF) return true;

	270 if (0xFDD0 <= c && c <= 0xFDEF) return true;

	271 switch (c) {

	272 case 0x000B: case 0xFFFE: case 0xFFFF: case 0x01FFFE: case 0x01FFFF:

	273 case 0x02FFFE: case 0x02FFFF: case 0x03FFFE: case 0x03FFFF:

	274 case 0x04FFFE: case 0x04FFFF: case 0x05FFFE: case 0x05FFFF:

	275 case 0x06FFFE: case 0x06FFFF: case 0x07FFFE: case 0x07FFFF:

	276 case 0x08FFFE: case 0x08FFFF: case 0x09FFFE: case 0x09FFFF:

	277 case 0x0AFFFE: case 0x0AFFFF: case 0x0BFFFE: case 0x0BFFFF:

	278 case 0x0CFFFE: case 0x0CFFFF: case 0x0DFFFE: case 0x0DFFFF:

	279 case 0x0EFFFE: case 0x0EFFFF: case 0x0FFFFE: case 0x0FFFFF:

	280 case 0x10FFFE: case 0x10FFFF:

	281 return true;

	282 }

	283 return false;

	284 }

	285

	286 /// Return the python codec name corresponding to an encoding or null if the

	287 /// string doesn't correspond to a valid encoding.

	288 String codecName(String encoding) {

	289 final asciiPunctuation = new RegExp(

	290 "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");

	291

	292 if (encoding == null) return null;

	293 var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase();

	294 return encodings[canonicalName];

	295 }

OLD	NEW