OLD | NEW |
| (Empty) |
1 library inputstream; | |
2 | |
3 import 'dart:collection'; | |
4 import 'package:utf/utf.dart'; | |
5 import 'package:source_span/source_span.dart'; | |
6 import 'char_encodings.dart'; | |
7 import 'constants.dart'; | |
8 import 'utils.dart'; | |
9 import 'encoding_parser.dart'; | |
10 | |
/// Hooks to call into dart:io without directly referencing it.
///
/// This base implementation cannot read files at all; a console-aware
/// replacement is expected to be assigned to [consoleSupport].
class ConsoleSupport {
  /// Returns the raw bytes behind [source].
  ///
  /// The default implementation has no file access and always answers null.
  List<int> bytesFromFile(source) {
    return null;
  }
}

// TODO(jmesserly): use lazy init here when supported.
/// The active [ConsoleSupport] hook; starts out as the no-op base class.
ConsoleSupport consoleSupport = new ConsoleSupport();
18 | |
/// Provides a unicode stream of characters to the HtmlTokenizer.
///
/// This class takes care of character encoding and removing or replacing
/// incorrect byte-sequences and also provides column and line tracking.
class HtmlInputStream {
  /// Number of bytes to use when looking for a meta element with
  /// encoding information.
  static const int numBytesMeta = 512;

  /// Encoding to use if no other information can be found.
  static const String defaultEncoding = 'windows-1252';

  /// The name of the character encoding.
  String charEncodingName;

  /// True if we are certain about [charEncodingName], false for tentative.
  bool charEncodingCertain = true;

  /// Whether the caller asked for source spans.
  ///
  /// Only stored here; this class builds [fileInfo] regardless of the value.
  final bool generateSpans;

  /// Location where the contents of the stream were found.
  final String sourceUrl;

  /// Undecoded input bytes, or null when the source was a Dart [String].
  List<int> _rawBytes;

  /// Characters before newline normalization: either the code points of the
  /// source [String] or the result of decoding [_rawBytes].
  Iterable<int> _rawChars;

  /// Error codes collected by [reset] while normalizing the input.
  Queue<String> errors;

  /// Source file built from the normalized characters by [reset].
  SourceFile fileInfo;

  /// Offsets into [_chars] at which each line begins.
  List<int> _lineStarts;

  /// Normalized characters served by [char], [peekChar] and [charsUntil].
  List<int> _chars;

  /// Index into [_chars] of the next character to be read.
  int _offset;

  /// Initialises the HtmlInputStream.
  ///
  /// HtmlInputStream(source, [encoding]) -> Normalized stream from source
  /// for use by html5lib.
  ///
  /// [source] can be either a [String] or a [List<int>] containing the raw
  /// bytes, or a file if [consoleSupport] is initialized.
  ///
  /// The optional encoding parameter must be a string that indicates
  /// the encoding. If specified, that encoding will be used,
  /// regardless of any BOM or later declaration (such as in a meta
  /// element). A [String] source is always treated as 'utf-8'.
  ///
  /// [parseMeta] - Look for a <meta> element containing encoding information
  ///
  /// Throws an [ArgumentError] when [source] is neither a [String] nor a
  /// [List<int>] and [consoleSupport] cannot turn it into bytes.
  HtmlInputStream(source, [String encoding, bool parseMeta = true,
      this.generateSpans = false, this.sourceUrl])
      : charEncodingName = codecName(encoding) {
    if (source is String) {
      _rawChars = toCodepoints(source);
      charEncodingName = 'utf-8';
      charEncodingCertain = true;
    } else if (source is List<int>) {
      _rawBytes = source;
    } else {
      // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
      // but it's necessary because of how the UTF decoders work.
      _rawBytes = consoleSupport.bytesFromFile(source);

      if (_rawBytes == null) {
        // TODO(jmesserly): we should accept some kind of stream API too.
        // Unfortunately dart:io InputStream is async only, which won't work.
        throw new ArgumentError(
            "'source' must be a String or List<int> (of bytes). "
            "You can also pass a RandomAccessFile if you "
            "`import 'package:html/parser_console.dart'` and call "
            "`useConsole()`.");
      }
    }

    // Detect encoding iff no explicit "transport level" encoding is supplied
    if (charEncodingName == null) {
      detectEncoding(parseMeta);
    }

    reset();
  }

  /// Rewinds the stream and rebuilds the normalized character buffer.
  ///
  /// Decodes [_rawBytes] with [charEncodingName] when no decoded characters
  /// are cached, folds CR and CRLF into a single newline, replaces surrogate
  /// code units with U+FFFD, records an 'invalid-codepoint' error for every
  /// character flagged by [invalidUnicode], and recreates [fileInfo].
  void reset() {
    errors = new Queue<String>();

    _offset = 0;
    _lineStarts = <int>[0];
    _chars = <int>[];

    if (_rawChars == null) {
      _rawChars = decodeBytes(charEncodingName, _rawBytes);
    }

    // Set right after a carriage return so that the line feed of a CRLF pair
    // is dropped instead of producing a second newline.
    bool skipNewline = false;
    for (var c in _rawChars) {
      if (skipNewline) {
        skipNewline = false;
        if (c == NEWLINE) continue;
      }

      if (invalidUnicode(c)) errors.add('invalid-codepoint');

      if (0xD800 <= c && c <= 0xDFFF) {
        c = 0xFFFD;
      } else if (c == RETURN) {
        skipNewline = true;
        c = NEWLINE;
      }

      _chars.add(c);
      if (c == NEWLINE) _lineStarts.add(_chars.length);
    }

    // Free decoded characters if they aren't needed anymore.
    if (_rawBytes != null) _rawChars = null;

    // TODO(sigmund): Don't parse the file at all if spans aren't being
    // generated.
    fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);
  }

  /// Determines [charEncodingName] from the raw bytes.
  ///
  /// Tries, in order: a byte order mark (certain), a <meta> declaration when
  /// [parseMeta] is true (tentative), and finally [defaultEncoding]
  /// (tentative). 'iso-8859-1' is replaced by 'windows-1252'.
  void detectEncoding([bool parseMeta = true]) {
    // First look for a BOM
    // This will also read past the BOM if present
    charEncodingName = detectBOM();
    charEncodingCertain = true;

    // If there is no BOM need to look for meta elements with encoding
    // information
    if (charEncodingName == null && parseMeta) {
      charEncodingName = detectEncodingMeta();
      charEncodingCertain = false;
    }
    // If all else fails use the default encoding
    if (charEncodingName == null) {
      charEncodingCertain = false;
      charEncodingName = defaultEncoding;
    }

    // Substitute for equivalent encodings:
    if (charEncodingName.toLowerCase() == 'iso-8859-1') {
      charEncodingName = 'windows-1252';
    }
  }

  /// Switches the stream to [newEncoding] once a reliable declaration is
  /// found.
  ///
  /// Unrecognized encodings are ignored. If the encoding matches the current
  /// one it is merely marked as certain. Otherwise the stream is re-decoded
  /// via [reset] and a [ReparseException] is thrown so that the caller can
  /// start over.
  ///
  /// Throws a [StateError] when the stream was created from a [String].
  void changeEncoding(String newEncoding) {
    if (_rawBytes == null) {
      // We should never get here -- if encoding is certain we won't try to
      // change it.
      throw new StateError('cannot change encoding when parsing a String.');
    }

    newEncoding = codecName(newEncoding);
    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
      newEncoding = 'utf-8';
    }
    if (newEncoding == null) {
      return;
    } else if (newEncoding == charEncodingName) {
      charEncodingCertain = true;
    } else {
      // Remember the old name before overwriting it; otherwise the message
      // below would report the new encoding twice.
      final previousEncoding = charEncodingName;
      charEncodingName = newEncoding;
      charEncodingCertain = true;
      _rawChars = null;
      reset();
      throw new ReparseException(
          'Encoding changed from $previousEncoding to $newEncoding');
    }
  }

  /// Attempts to detect a BOM at the start of the stream. If
  /// an encoding can be determined from the BOM return the name of the
  /// encoding otherwise return null.
  String detectBOM() {
    // Try detecting the BOM using bytes from the string
    if (hasUtf8Bom(_rawBytes)) {
      return 'utf-8';
    }
    // Note: we don't need to remember whether it was big or little endian
    // because the decoder will do that later. It will also eat the BOM for us.
    if (hasUtf16Bom(_rawBytes)) {
      return 'utf-16';
    }
    if (hasUtf32Bom(_rawBytes)) {
      return 'utf-32';
    }
    return null;
  }

  /// Report the encoding declared by the meta element.
  ///
  /// Only the first [numBytesMeta] bytes are scanned. A declared UTF-16
  /// variant is reported as 'utf-8'.
  String detectEncodingMeta() {
    var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));
    var encoding = parser.getEncoding();

    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
      encoding = 'utf-8';
    }

    return encoding;
  }

  /// Returns the current offset in the stream, i.e. the number of codepoints
  /// since the start of the file.
  int get position => _offset;

  /// Read one character from the stream or queue if available. Return
  /// EOF when EOF is reached.
  String char() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCode(_chars[_offset++]);
  }

  /// Returns the next character without consuming it, or EOF at the end of
  /// the stream.
  String peekChar() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCode(_chars[_offset]);
  }

  /// Returns a string of characters from the stream up to but not
  /// including any character in 'characters' or EOF.
  ///
  /// When [opposite] is true the test is inverted: characters are consumed
  /// for as long as they *are* contained in [characters].
  String charsUntil(String characters, [bool opposite = false]) {
    int start = _offset;
    String c;
    while ((c = peekChar()) != null && characters.contains(c) == opposite) {
      _offset++;
    }

    return new String.fromCharCodes(_chars.sublist(start, _offset));
  }

  /// Pushes [ch] back onto the stream; a null [ch] (EOF) is ignored.
  void unget(String ch) {
    // Only one character is allowed to be ungotten at once - it must
    // be consumed again before any further call to unget
    if (ch != null) {
      _offset--;
      assert(peekChar() == ch);
    }
  }
}
259 | |
// TODO(jmesserly): the Python code used a regex to check for this. But
// Dart doesn't let you create a regexp with invalid characters.
/// Returns true when [c] belongs to one of the flagged code point sets:
///
/// * C0 controls other than NUL, tab, line feed, form feed and carriage
///   return, plus DEL and the C1 controls (U+007F..U+009F);
/// * surrogates (U+D800..U+DFFF);
/// * noncharacters: U+FDD0..U+FDEF and the last two code points of each of
///   the seventeen planes (U+FFFE, U+FFFF, U+1FFFE, ... U+10FFFF).
bool invalidUnicode(int c) {
  bool within(int low, int high) => low <= c && c <= high;

  final isControl = within(0x0001, 0x0008) ||
      c == 0x000B ||
      within(0x000E, 0x001F) ||
      within(0x007F, 0x009F);
  final isSurrogate = within(0xD800, 0xDFFF);

  // Inside the Unicode range, the final two code points of a plane are
  // exactly those whose low 16 bits are 0xFFFE or 0xFFFF.
  final isPlaneTail = within(0xFFFE, 0x10FFFF) && (c & 0xFFFE) == 0xFFFE;
  final isNoncharacter = within(0xFDD0, 0xFDEF) || isPlaneTail;

  return isControl || isSurrogate || isNoncharacter;
}
308 | |
/// Return the python codec name corresponding to an encoding or null if the
/// string doesn't correspond to a valid encoding.
///
/// [encoding] is canonicalized before the lookup in `encodings`: ASCII
/// whitespace and punctuation are stripped and the rest is lower-cased.
/// A null [encoding] yields null without touching the table.
String codecName(String encoding) {
  if (encoding == null) return null;

  // Matches tab..carriage return, space and every ASCII punctuation mark.
  final ignorable = new RegExp(
      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
  final lookupKey = encoding.replaceAll(ignorable, '').toLowerCase();
  return encodings[lookupKey];
}
OLD | NEW |