Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: third_party/pkg/html5lib/lib/src/inputstream.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 library inputstream;
2
3 import 'dart:collection';
4 import 'dart:utf';
5 import 'package:source_maps/span.dart' show SourceFile;
6 import 'char_encodings.dart';
7 import 'constants.dart';
8 import 'utils.dart';
9 import 'encoding_parser.dart';
10
/**
 * Indirection point for functionality that lives in dart:io, so that this
 * library never has to reference dart:io directly.
 */
class ConsoleSupport {
  /**
   * Returns the bytes read from [source]. This base implementation does not
   * read anything and always returns null.
   */
  List<int> bytesFromFile(source) {
    return null;
  }
}
15
// TODO(jmesserly): use lazy init here when supported.
/**
 * The [ConsoleSupport] instance that [HtmlInputStream] asks for bytes when
 * its source is neither a String nor a List<int>.
 */
ConsoleSupport consoleSupport = new ConsoleSupport();
18
/**
 * Provides a unicode stream of characters to the HtmlTokenizer.
 *
 * This class takes care of character encoding and removing or replacing
 * incorrect byte-sequences and also provides column and line tracking.
 */
class HtmlInputStream {
  /**
   * Number of bytes to use when looking for a meta element with
   * encoding information.
   */
  static const int numBytesMeta = 512;

  /** Encoding to use if no other information can be found. */
  static const String defaultEncoding = 'windows-1252';

  /** The name of the character encoding. */
  String charEncodingName;

  /** True if we are certain about [charEncodingName], false for tentative. */
  bool charEncodingCertain = true;

  /**
   * When true, [reset] passes the decoded characters to the [SourceFile] it
   * builds for [fileInfo]; otherwise it passes null.
   */
  final bool generateSpans;

  /** Location where the contents of the stream were found. */
  final String sourceUrl;

  /** Raw input bytes. Stays null when the stream was built from a String. */
  List<int> _rawBytes;

  /** Raw UTF-16 codes, used if a Dart String is passed in. */
  Iterable<int> _rawChars;

  /** Error codes collected by [reset] (for example 'invalid-codepoint'). */
  Queue<String> errors;

  /** Source file description rebuilt by every call to [reset]. */
  SourceFile fileInfo;

  /** Offsets into [_chars] at which each line begins; the first is 0. */
  List<int> _lineStarts;

  /** The normalized characters produced by [reset]. */
  List<int> _chars;

  /** Index into [_chars] of the next character to be read. */
  int _offset;

  /**
   * Initialises the HtmlInputStream.
   *
   * HtmlInputStream(source, [encoding]) -> Normalized stream from source
   * for use by html5lib.
   *
   * [source] can be either a [String] or a [List<int>] containing the raw
   * bytes, or a file if [consoleSupport] is initialized.
   *
   * The optional encoding parameter must be a string that indicates
   * the encoding. If specified, that encoding will be used,
   * regardless of any BOM or later declaration (such as in a meta
   * element)
   *
   * [parseMeta] - Look for a <meta> element containing encoding information
   *
   * Throws an [ArgumentError] if [source] is not a String or List<int> and
   * [consoleSupport] cannot turn it into bytes.
   */
  HtmlInputStream(source, [String encoding, bool parseMeta = true,
        this.generateSpans = false, this.sourceUrl])
      : charEncodingName = codecName(encoding) {

    if (source is String) {
      // A Dart String is already decoded, so any [encoding] argument is
      // overridden here.
      _rawChars = toCodepoints(source);
      charEncodingName = 'utf-8';
      charEncodingCertain = true;
    } else if (source is List<int>) {
      _rawBytes = source;
    } else {
      // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
      // but it's necessary because of how the UTF decoders work.
      _rawBytes = consoleSupport.bytesFromFile(source);

      if (_rawBytes == null) {
        // TODO(jmesserly): we should accept some kind of stream API too.
        // Unfortunately dart:io InputStream is async only, which won't work.
        throw new ArgumentError("'source' must be a String or "
            "List<int> (of bytes). You can also pass a RandomAccessFile if you "
            "`import 'package:html5lib/parser_console.dart'` and call "
            "`useConsole()`.");
      }
    }

    // Detect encoding iff no explicit "transport level" encoding is supplied
    if (charEncodingName == null) {
      detectEncoding(parseMeta);
    }

    reset();
  }

  /**
   * Rewinds the stream to offset 0 and rebuilds the normalized character
   * list, the line-start table, [errors] and [fileInfo].
   *
   * Bytes are decoded with [charEncodingName] if no decoded characters are
   * available. While copying, a RETURN optionally followed by NEWLINE becomes
   * a single NEWLINE, surrogate code units become U+FFFD, and every code
   * point rejected by [invalidUnicode] adds 'invalid-codepoint' to [errors].
   */
  void reset() {
    errors = new Queue<String>();

    _offset = 0;
    _lineStarts = <int>[0];
    _chars = <int>[];

    if (_rawChars == null) {
      _rawChars = decodeBytes(charEncodingName, _rawBytes);
    }

    // Set after a RETURN so that an immediately following NEWLINE is dropped.
    bool skipNewline = false;
    for (var c in _rawChars) {
      if (skipNewline) {
        skipNewline = false;
        if (c == NEWLINE) continue;
      }

      // Checked before the replacement below so surrogates are still reported.
      if (invalidUnicode(c)) errors.add('invalid-codepoint');

      if (0xD800 <= c && c <= 0xDFFF) {
        c = 0xFFFD;
      } else if (c == RETURN) {
        skipNewline = true;
        c = NEWLINE;
      }

      _chars.add(c);
      if (c == NEWLINE) _lineStarts.add(_chars.length);
    }

    // Free decoded characters if they aren't needed anymore.
    if (_rawBytes != null) _rawChars = null;

    fileInfo = new SourceFile(sourceUrl, _lineStarts,
        generateSpans ? _chars : null);
  }


  /**
   * Chooses [charEncodingName] from, in order: a BOM, a meta element (only
   * if [parseMeta] is true), and finally [defaultEncoding].
   *
   * [charEncodingCertain] ends up true only when a BOM was found.
   */
  void detectEncoding([bool parseMeta = true]) {
    // First look for a BOM.
    charEncodingName = detectBOM();
    charEncodingCertain = true;

    // If there is no BOM need to look for meta elements with encoding
    // information
    if (charEncodingName == null && parseMeta) {
      charEncodingName = detectEncodingMeta();
      charEncodingCertain = false;
    }
    // If all else fails use the default encoding
    if (charEncodingName == null) {
      charEncodingCertain = false;
      charEncodingName = defaultEncoding;
    }

    // Substitute for equivalent encodings:
    if (charEncodingName.toLowerCase() == 'iso-8859-1') {
      charEncodingName = 'windows-1252';
    }
  }

  /**
   * Switches the stream to [newEncoding].
   *
   * Does nothing if [newEncoding] is not a known encoding. If it names the
   * encoding already in use, the encoding is simply marked as certain.
   * Otherwise the input is decoded again from the start and a
   * [ReparseException] is thrown.
   *
   * Throws a [StateError] if the stream was created from a String.
   */
  void changeEncoding(String newEncoding) {
    if (_rawBytes == null) {
      // We should never get here -- if encoding is certain we won't try to
      // change it.
      throw new StateError('cannot change encoding when parsing a String.');
    }

    newEncoding = codecName(newEncoding);
    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
      newEncoding = 'utf-8';
    }
    if (newEncoding == null) {
      return;
    } else if (newEncoding == charEncodingName) {
      charEncodingCertain = true;
    } else {
      // Remember the previous name: it is overwritten below but is still
      // needed for the exception message.
      var oldEncoding = charEncodingName;
      charEncodingName = newEncoding;
      charEncodingCertain = true;
      _rawChars = null;
      reset();
      throw new ReparseException(
          'Encoding changed from $oldEncoding to $newEncoding');
    }
  }

  /**
   * Attempts to detect a BOM at the start of the stream. If
   * an encoding can be determined from the BOM return the name of the
   * encoding otherwise return null.
   */
  String detectBOM() {
    // Try detecting the BOM using bytes from the string
    if (hasUtf8Bom(_rawBytes)) {
      return 'utf-8';
    }
    // Note: we don't need to remember whether it was big or little endian
    // because the decoder will do that later. It will also eat the BOM for us.
    if (hasUtf16Bom(_rawBytes)) {
      return 'utf-16';
    }
    if (hasUtf32Bom(_rawBytes)) {
      return 'utf-32';
    }
    return null;
  }

  /**
   * Report the encoding declared by the meta element, looking only at the
   * first [numBytesMeta] bytes. Any UTF-16 variant is reported as 'utf-8'.
   */
  String detectEncodingMeta() {
    var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));
    var encoding = parser.getEncoding();

    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
      encoding = 'utf-8';
    }

    return encoding;
  }

  /**
   * Returns the current offset in the stream, i.e. the number of codepoints
   * since the start of the file.
   */
  int get position => _offset;

  /**
   * Reads and consumes one character from the stream. Returns EOF when
   * the end of the stream is reached.
   */
  String char() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCodes([_chars[_offset++]]);
  }

  /**
   * Returns the next character without consuming it, or EOF when the end of
   * the stream is reached.
   */
  String peekChar() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCodes([_chars[_offset]]);
  }

  /**
   * Returns a string of characters from the stream up to but not
   * including any character in 'characters' or EOF.
   *
   * If [opposite] is true the test is inverted: characters are consumed for
   * as long as they do appear in [characters].
   */
  String charsUntil(String characters, [bool opposite = false]) {
    int start = _offset;
    String c;
    // NOTE(review): the end-of-input test compares against null rather than
    // EOF, so this presumably relies on EOF being null -- confirm.
    while ((c = peekChar()) != null && characters.contains(c) == opposite) {
      _offset++;
    }

    return new String.fromCharCodes(_chars.sublist(start, _offset));
  }

  /**
   * Pushes [ch] back onto the stream by stepping back one position. A null
   * [ch] is ignored.
   */
  void unget(String ch) {
    // Only one character is allowed to be ungotten at once - it must
    // be consumed again before any further call to unget
    if (ch != null) {
      _offset--;
      assert(peekChar() == ch);
    }
  }
}
274
275
// TODO(jmesserly): the Python code used a regex to check for this. But
// Dart doesn't let you create a regexp with invalid characters.
/**
 * Returns true if the code point [c] is one the input stream reports as
 * invalid: certain control characters, surrogates, U+FDD0..U+FDEF, and the
 * last two code points (xxFFFE and xxFFFF) of every plane up to U+10FFFF.
 */
bool invalidUnicode(int c) {
  // Control characters. Tab, line feed, form feed and carriage return
  // (0x09, 0x0A, 0x0C, 0x0D) are deliberately not included.
  final isControl = (0x0001 <= c && c <= 0x0008) ||
      c == 0x000B ||
      (0x000E <= c && c <= 0x001F) ||
      (0x007F <= c && c <= 0x009F);
  if (isControl) return true;

  final isSurrogate = 0xD800 <= c && c <= 0xDFFF;
  final inFdd0Range = 0xFDD0 <= c && c <= 0xFDEF;
  if (isSurrogate || inFdd0Range) return true;

  // The low 16 bits are 0xFFFE or 0xFFFF exactly when bits 1..15 are all set;
  // the bounds keep the test inside planes 0 through 16.
  return 0 <= c && c <= 0x10FFFF && (c & 0xFFFE) == 0xFFFE;
}
298
/**
 * Looks up the codec name registered in [encodings] for [encoding].
 *
 * ASCII whitespace and punctuation are stripped from [encoding] and the
 * remainder is lower-cased before the lookup. Returns null if [encoding] is
 * null or does not correspond to a valid encoding.
 */
String codecName(String encoding) {
  // Nothing to look up for a missing label.
  if (encoding == null) return null;

  final punctuation = new RegExp(
      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
  final stripped = encoding.replaceAll(punctuation, '');
  return encodings[stripped.toLowerCase()];
}
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698