third_party/pkg/html5lib/lib/src/inputstream.dart - Issue 22375011: move html5lib code into dart svn repo

Side by Side Diff: third_party/pkg/html5lib/lib/src/inputstream.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 library inputstream;

	2

	3 import 'dart:collection';

	4 import 'dart:utf';

	5 import 'package:source_maps/span.dart' show SourceFile;

	6 import 'char_encodings.dart';

	7 import 'constants.dart';

	8 import 'utils.dart';

	9 import 'encoding_parser.dart';

	10

	11 /** Hooks to call into dart:io without directly referencing it. */

	12 class ConsoleSupport {

	13 List<int> bytesFromFile(source) => null;

	14 }

	15

	16 // TODO(jmesserly): use lazy init here when supported.

	17 ConsoleSupport consoleSupport = new ConsoleSupport();

	18

	19 /**

	20 * Provides a unicode stream of characters to the HtmlTokenizer.

	21 *

	22 * This class takes care of character encoding and removing or replacing

	23 * incorrect byte-sequences and also provides column and line tracking.

	24 */

	25 class HtmlInputStream {

	26 /**

	27 * Number of bytes to use when looking for a meta element with

	28 * encoding information.

	29 */

	30 static const int numBytesMeta = 512;

	31

	32 /** Encoding to use if no other information can be found. */

	33 static const String defaultEncoding = 'windows-1252';

	34

	35 /** The name of the character encoding. */

	36 String charEncodingName;

	37

	38 /** True if we are certain about [charEncodingName], false for tenative. */

	39 bool charEncodingCertain = true;

	40

	41 final bool generateSpans;

	42

	43 /** Location where the contents of the stream were found. */

	44 final String sourceUrl;

	45

	46 List<int> _rawBytes;

	47

	48 /** Raw UTF-16 codes, used if a Dart String is passed in. */

	49 Iterable<int> _rawChars;

	50

	51 Queue<String> errors;

	52

	53 SourceFile fileInfo;

	54

	55 List<int> _lineStarts;

	56

	57 List<int> _chars;

	58

	59 int _offset;

	60

	61 /**

	62 * Initialises the HtmlInputStream.

	63 *

	64 * HtmlInputStream(source, [encoding]) -> Normalized stream from source

	65 * for use by html5lib.

	66 *

	67 * [source] can be either a [String] or a [List<int>] containing the raw

	68 * bytes, or a file if [consoleSupport] is initialized.

	69 *

	70 * The optional encoding parameter must be a string that indicates

	71 * the encoding. If specified, that encoding will be used,

	72 * regardless of any BOM or later declaration (such as in a meta

	73 * element)

	74 *

	75 * [parseMeta] - Look for a <meta> element containing encoding information

	76 */

	77 HtmlInputStream(source, [String encoding, bool parseMeta = true,

	78 this.generateSpans = false, this.sourceUrl])

	79 : charEncodingName = codecName(encoding) {

	80

	81 if (source is String) {

	82 _rawChars = toCodepoints(source);

	83 charEncodingName = 'utf-8';

	84 charEncodingCertain = true;

	85 } else if (source is List<int>) {

	86 _rawBytes = source;

	87 } else {

	88 // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,

	89 // but it's necessary because of how the UTF decoders work.

	90 _rawBytes = consoleSupport.bytesFromFile(source);

	91

	92 if (_rawBytes == null) {

	93 // TODO(jmesserly): we should accept some kind of stream API too.

	94 // Unfortunately dart:io InputStream is async only, which won't work.

	95 throw new ArgumentError("'source' must be a String or "

	96 "List<int> (of bytes). You can also pass a RandomAccessFile if you"

	97 "`import 'package:html5lib/parser_console.dart'` and call "

	98 "`useConsole()`.");

	99 }

	100 }

	101

	102 // Detect encoding iff no explicit "transport level" encoding is supplied

	103 if (charEncodingName == null) {

	104 detectEncoding(parseMeta);

	105 }

	106

	107 reset();

	108 }

	109

	110 void reset() {

	111 errors = new Queue<String>();

	112

	113 _offset = 0;

	114 _lineStarts = <int>[0];

	115 _chars = <int>[];

	116

	117 if (_rawChars == null) {

	118 _rawChars = decodeBytes(charEncodingName, _rawBytes);

	119 }

	120

	121 bool skipNewline = false;

	122 for (var c in _rawChars) {

	123 if (skipNewline) {

	124 skipNewline = false;

	125 if (c == NEWLINE) continue;

	126 }

	127

	128 if (invalidUnicode(c)) errors.add('invalid-codepoint');

	129

	130 if (0xD800 <= c && c <= 0xDFFF) {

	131 c = 0xFFFD;

	132 } else if (c == RETURN) {

	133 skipNewline = true;

	134 c = NEWLINE;

	135 }

	136

	137 _chars.add(c);

	138 if (c == NEWLINE) _lineStarts.add(_chars.length);

	139 }

	140

	141 // Free decoded characters if they aren't needed anymore.

	142 if (_rawBytes != null) _rawChars = null;

	143

	144 fileInfo = new SourceFile(sourceUrl, _lineStarts,

	145 generateSpans ? _chars : null);

	146 }

	147

	148

	149 void detectEncoding([bool parseMeta = true]) {

	150 // First look for a BOM

	151 // This will also read past the BOM if present

	152 charEncodingName = detectBOM();

	153 charEncodingCertain = true;

	154

	155 // If there is no BOM need to look for meta elements with encoding

	156 // information

	157 if (charEncodingName == null && parseMeta) {

	158 charEncodingName = detectEncodingMeta();

	159 charEncodingCertain = false;

	160 }

	161 // If all else fails use the default encoding

	162 if (charEncodingName == null) {

	163 charEncodingCertain = false;

	164 charEncodingName = defaultEncoding;

	165 }

	166

	167 // Substitute for equivalent encodings:

	168 if (charEncodingName.toLowerCase() == 'iso-8859-1') {

	169 charEncodingName = 'windows-1252';

	170 }

	171 }

	172

	173 void changeEncoding(String newEncoding) {

	174 if (_rawBytes == null) {

	175 // We should never get here -- if encoding is certain we won't try to

	176 // change it.

	177 throw new StateError('cannot change encoding when parsing a String.');

	178 }

	179

	180 newEncoding = codecName(newEncoding);

	181 if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {

	182 newEncoding = 'utf-8';

	183 }

	184 if (newEncoding == null) {

	185 return;

	186 } else if (newEncoding == charEncodingName) {

	187 charEncodingCertain = true;

	188 } else {

	189 charEncodingName = newEncoding;

	190 charEncodingCertain = true;

	191 _rawChars = null;

	192 reset();

	193 throw new ReparseException(

	194 'Encoding changed from $charEncodingName to $newEncoding');

	195 }

	196 }

	197

	198 /**

	199 * Attempts to detect at BOM at the start of the stream. If

	200 * an encoding can be determined from the BOM return the name of the

	201 * encoding otherwise return null.

	202 */

	203 String detectBOM() {

	204 // Try detecting the BOM using bytes from the string

	205 if (hasUtf8Bom(_rawBytes)) {

	206 return 'utf-8';

	207 }

	208 // Note: we don't need to remember whether it was big or little endian

	209 // because the decoder will do that later. It will also eat the BOM for us.

	210 if (hasUtf16Bom(_rawBytes)) {

	211 return 'utf-16';

	212 }

	213 if (hasUtf32Bom(_rawBytes)) {

	214 return 'utf-32';

	215 }

	216 return null;

	217 }

	218

	219 /** Report the encoding declared by the meta element. */

	220 String detectEncodingMeta() {

	221 var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));

	222 var encoding = parser.getEncoding();

	223

	224 if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {

	225 encoding = 'utf-8';

	226 }

	227

	228 return encoding;

	229 }

	230

	231 /**

	232 * Returns the current offset in the stream, i.e. the number of codepoints

	233 * since the start of the file.

	234 */

	235 int get position => _offset;

	236

	237 /**

	238 * Read one character from the stream or queue if available. Return

	239 * EOF when EOF is reached.

	240 */

	241 String char() {

	242 if (_offset >= _chars.length) return EOF;

	243 return new String.fromCharCodes([_chars[_offset++]]);

	244 }

	245

	246 String peekChar() {

	247 if (_offset >= _chars.length) return EOF;

	248 return new String.fromCharCodes([_chars[_offset]]);

	249 }

	250

	251 /**

	252 * Returns a string of characters from the stream up to but not

	253 * including any character in 'characters' or EOF.

	254 */

	255 String charsUntil(String characters, [bool opposite = false]) {

	256 int start = _offset;

	257 String c;

	258 while ((c = peekChar()) != null && characters.contains(c) == opposite) {

	259 _offset++;

	260 }

	261

	262 return new String.fromCharCodes(_chars.sublist(start, _offset));

	263 }

	264

	265 void unget(String ch) {

	266 // Only one character is allowed to be ungotten at once - it must

	267 // be consumed again before any further call to unget

	268 if (ch != null) {

	269 _offset--;

	270 assert(peekChar() == ch);

	271 }

	272 }

	273 }

	274

	275

	276 // TODO(jmesserly): the Python code used a regex to check for this. But

	277 // Dart doesn't let you create a regexp with invalid characters.

	278 bool invalidUnicode(int c) {

	279 if (0x0001 <= c && c <= 0x0008) return true;

	280 if (0x000E <= c && c <= 0x001F) return true;

	281 if (0x007F <= c && c <= 0x009F) return true;

	282 if (0xD800 <= c && c <= 0xDFFF) return true;

	283 if (0xFDD0 <= c && c <= 0xFDEF) return true;

	284 switch (c) {

	285 case 0x000B: case 0xFFFE: case 0xFFFF: case 0x01FFFE: case 0x01FFFF:

	286 case 0x02FFFE: case 0x02FFFF: case 0x03FFFE: case 0x03FFFF:

	287 case 0x04FFFE: case 0x04FFFF: case 0x05FFFE: case 0x05FFFF:

	288 case 0x06FFFE: case 0x06FFFF: case 0x07FFFE: case 0x07FFFF:

	289 case 0x08FFFE: case 0x08FFFF: case 0x09FFFE: case 0x09FFFF:

	290 case 0x0AFFFE: case 0x0AFFFF: case 0x0BFFFE: case 0x0BFFFF:

	291 case 0x0CFFFE: case 0x0CFFFF: case 0x0DFFFE: case 0x0DFFFF:

	292 case 0x0EFFFE: case 0x0EFFFF: case 0x0FFFFE: case 0x0FFFFF:

	293 case 0x10FFFE: case 0x10FFFF:

	294 return true;

	295 }

	296 return false;

	297 }

	298

	299 /**

	300 * Return the python codec name corresponding to an encoding or null if the

	301 * string doesn't correspond to a valid encoding.

	302 */

	303 String codecName(String encoding) {

	304 final asciiPunctuation = new RegExp(

	305 "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");

	306

	307 if (encoding == null) return null;

	308 var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase();

	309 return encodings[canonicalName];

	310 }

OLD	NEW