OLD | NEW |
| (Empty) |
1 library inputstream; | |
2 | |
3 import 'dart:collection'; | |
4 import 'package:utf/utf.dart'; | |
5 import 'package:source_span/source_span.dart'; | |
6 import 'char_encodings.dart'; | |
7 import 'constants.dart'; | |
8 import 'utils.dart'; | |
9 import 'encoding_parser.dart'; | |
10 | |
/// Indirection layer for functionality that needs dart:io, so that this
/// library never has to reference dart:io directly.
class ConsoleSupport {
  /// Returns the bytes read from [source].
  ///
  /// This base implementation has no file access and always returns null.
  List<int> bytesFromFile(source) {
    return null;
  }
}

// TODO(jmesserly): use lazy init here when supported.
ConsoleSupport consoleSupport = new ConsoleSupport();
18 | |
/// Provides a unicode stream of characters to the HtmlTokenizer.
///
/// This class takes care of character encoding and removing or replacing
/// incorrect byte-sequences and also provides column and line tracking.
class HtmlInputStream {
  /// Number of bytes to use when looking for a meta element with
  /// encoding information.
  static const int numBytesMeta = 512;

  /// Encoding to use if no other information can be found.
  static const String defaultEncoding = 'windows-1252';

  /// The name of the character encoding.
  String charEncodingName;

  /// True if we are certain about [charEncodingName], false for tentative.
  bool charEncodingCertain = true;

  /// Whether the caller asked for source spans.
  ///
  /// Only stored here; [reset] currently builds [fileInfo] regardless.
  final bool generateSpans;

  /// Location where the contents of the stream were found.
  final String sourceUrl;

  /// Raw input bytes, or null when the source was a Dart [String].
  List<int> _rawBytes;

  /// Raw UTF-16 codes, used if a Dart String is passed in.
  Iterable<int> _rawChars;

  /// Error codes recorded while normalizing the input in [reset].
  Queue<String> errors;

  /// Source file built from the normalized characters in [reset].
  SourceFile fileInfo;

  /// Offsets into [_chars] at which each line begins.
  List<int> _lineStarts;

  /// Normalized code points (newlines unified, surrogates replaced).
  List<int> _chars;

  /// Index into [_chars] of the next character to be read.
  int _offset;

  /// Initialises the HtmlInputStream.
  ///
  /// HtmlInputStream(source, [encoding]) -> Normalized stream from source
  /// for use by html5lib.
  ///
  /// [source] can be either a [String] or a [List<int>] containing the raw
  /// bytes, or a file if [consoleSupport] is initialized.
  ///
  /// The optional encoding parameter must be a string that indicates
  /// the encoding. If specified, that encoding will be used,
  /// regardless of any BOM or later declaration (such as in a meta
  /// element). When [source] is a [String] the encoding is always 'utf-8'.
  ///
  /// [parseMeta] - Look for a <meta> element containing encoding information
  ///
  /// Throws an [ArgumentError] if [source] is none of the supported kinds.
  HtmlInputStream(source, [String encoding, bool parseMeta = true,
        this.generateSpans = false, this.sourceUrl])
      : charEncodingName = codecName(encoding) {

    if (source is String) {
      _rawChars = toCodepoints(source);
      charEncodingName = 'utf-8';
      charEncodingCertain = true;
    } else if (source is List<int>) {
      _rawBytes = source;
    } else {
      // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
      // but it's necessary because of how the UTF decoders work.
      _rawBytes = consoleSupport.bytesFromFile(source);

      if (_rawBytes == null) {
        // TODO(jmesserly): we should accept some kind of stream API too.
        // Unfortunately dart:io InputStream is async only, which won't work.
        throw new ArgumentError("'source' must be a String or "
            "List<int> (of bytes). You can also pass a RandomAccessFile if you "
            "`import 'package:html5lib/parser_console.dart'` and call "
            "`useConsole()`.");
      }
    }

    // Detect encoding iff no explicit "transport level" encoding is supplied
    if (charEncodingName == null) {
      detectEncoding(parseMeta);
    }

    reset();
  }

  /// Rewinds the stream to the start and rebuilds the normalized character
  /// buffer, line-start table, error queue and [fileInfo].
  ///
  /// Bytes are decoded with [charEncodingName] when no decoded characters are
  /// cached. CR and CRLF become a single NEWLINE, and surrogate code points
  /// are replaced with U+FFFD.
  void reset() {
    errors = new Queue<String>();

    _offset = 0;
    _lineStarts = <int>[0];
    _chars = <int>[];

    if (_rawChars == null) {
      _rawChars = decodeBytes(charEncodingName, _rawBytes);
    }

    // Set after a RETURN so that an immediately following NEWLINE is dropped.
    bool skipNewline = false;
    for (var c in _rawChars) {
      if (skipNewline) {
        skipNewline = false;
        if (c == NEWLINE) continue;
      }

      if (invalidUnicode(c)) errors.add('invalid-codepoint');

      if (0xD800 <= c && c <= 0xDFFF) {
        c = 0xFFFD;
      } else if (c == RETURN) {
        skipNewline = true;
        c = NEWLINE;
      }

      _chars.add(c);
      if (c == NEWLINE) _lineStarts.add(_chars.length);
    }

    // Free decoded characters if they aren't needed anymore.
    if (_rawBytes != null) _rawChars = null;

    // TODO(sigmund): Don't parse the file at all if spans aren't being
    // generated.
    fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);
  }


  /// Determines [charEncodingName] from the raw bytes.
  ///
  /// Tries, in order: a BOM (certain), a <meta> declaration when [parseMeta]
  /// is true (tentative), and finally [defaultEncoding] (tentative).
  void detectEncoding([bool parseMeta = true]) {
    // First look for a BOM. The BOM is not consumed here; see [detectBOM].
    charEncodingName = detectBOM();
    charEncodingCertain = true;

    // If there is no BOM need to look for meta elements with encoding
    // information
    if (charEncodingName == null && parseMeta) {
      charEncodingName = detectEncodingMeta();
      charEncodingCertain = false;
    }
    // If all else fails use the default encoding
    if (charEncodingName == null) {
      charEncodingCertain = false;
      charEncodingName = defaultEncoding;
    }

    // Substitute for equivalent encodings:
    if (charEncodingName.toLowerCase() == 'iso-8859-1') {
      charEncodingName = 'windows-1252';
    }
  }

  /// Switches the stream to [newEncoding] and marks the encoding as certain.
  ///
  /// Does nothing if [newEncoding] is not a recognized encoding. If it
  /// differs from the current encoding, the stream is re-decoded via [reset]
  /// and a [ReparseException] is thrown so the caller can start over.
  ///
  /// Throws a [StateError] if the stream was created from a [String].
  void changeEncoding(String newEncoding) {
    if (_rawBytes == null) {
      // We should never get here -- if encoding is certain we won't try to
      // change it.
      throw new StateError('cannot change encoding when parsing a String.');
    }

    newEncoding = codecName(newEncoding);
    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
      newEncoding = 'utf-8';
    }
    if (newEncoding == null) {
      return;
    } else if (newEncoding == charEncodingName) {
      charEncodingCertain = true;
    } else {
      // Remember the previous name before overwriting it so the exception
      // message reports the real transition.
      var oldEncoding = charEncodingName;
      charEncodingName = newEncoding;
      charEncodingCertain = true;
      _rawChars = null;
      reset();
      throw new ReparseException(
          'Encoding changed from $oldEncoding to $newEncoding');
    }
  }

  /// Attempts to detect a BOM at the start of the stream. If
  /// an encoding can be determined from the BOM return the name of the
  /// encoding otherwise return null.
  String detectBOM() {
    // Try detecting the BOM using bytes from the string
    if (hasUtf8Bom(_rawBytes)) {
      return 'utf-8';
    }
    // Note: we don't need to remember whether it was big or little endian
    // because the decoder will do that later. It will also eat the BOM for us.
    if (hasUtf16Bom(_rawBytes)) {
      return 'utf-16';
    }
    if (hasUtf32Bom(_rawBytes)) {
      return 'utf-32';
    }
    return null;
  }

  /// Report the encoding declared by the meta element.
  ///
  /// Only the first [numBytesMeta] bytes are scanned. A declared UTF-16
  /// variant is reported as 'utf-8'.
  String detectEncodingMeta() {
    var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));
    var encoding = parser.getEncoding();

    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
      encoding = 'utf-8';
    }

    return encoding;
  }

  /// Returns the current offset in the stream, i.e. the number of codepoints
  /// since the start of the file.
  int get position => _offset;

  /// Read one character from the stream or queue if available. Return
  /// EOF when EOF is reached.
  String char() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCode(_chars[_offset++]);
  }

  /// Returns the next character without consuming it, or EOF at end of
  /// input.
  String peekChar() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCode(_chars[_offset]);
  }

  /// Returns a string of characters from the stream up to but not
  /// including any character in 'characters' or EOF.
  ///
  /// If [opposite] is true the test is inverted: characters are consumed
  /// while they ARE contained in [characters].
  String charsUntil(String characters, [bool opposite = false]) {
    int start = _offset;
    String c;
    while ((c = peekChar()) != null && characters.contains(c) == opposite) {
      _offset++;
    }

    return new String.fromCharCodes(_chars.sublist(start, _offset));
  }

  /// Pushes [ch] back onto the stream by stepping the offset back by one.
  ///
  /// A null [ch] is ignored.
  void unget(String ch) {
    // Only one character is allowed to be ungotten at once - it must
    // be consumed again before any further call to unget
    if (ch != null) {
      _offset--;
      assert(peekChar() == ch);
    }
  }
}
261 | |
262 | |
// TODO(jmesserly): the Python code used a regex to check for this. But
// Dart doesn't let you create a regexp with invalid characters.
/// Returns true if [c] is a code point this library reports as invalid:
/// selected control characters, surrogates, and Unicode noncharacters.
bool invalidUnicode(int c) {
  // C0 controls other than NUL, TAB, LF, FF and CR.
  final isC0Control = (0x0001 <= c && c <= 0x0008) ||
      c == 0x000B ||
      (0x000E <= c && c <= 0x001F);

  // DEL and the C1 control range.
  final isDelOrC1 = 0x007F <= c && c <= 0x009F;

  // UTF-16 surrogate halves.
  final isSurrogate = 0xD800 <= c && c <= 0xDFFF;

  // The contiguous noncharacter block inside the BMP.
  final isBmpNoncharacter = 0xFDD0 <= c && c <= 0xFDEF;

  // The last two code points (xxFFFE / xxFFFF) of each plane 0x00..0x10.
  final isPlaneTail =
      0xFFFE <= c && c <= 0x10FFFF && (c & 0xFFFF) >= 0xFFFE;

  return isC0Control ||
      isDelOrC1 ||
      isSurrogate ||
      isBmpNoncharacter ||
      isPlaneTail;
}
285 | |
/// Looks up the codec name registered for [encoding] in [encodings].
///
/// ASCII whitespace and punctuation are stripped from [encoding] and the
/// result is lower-cased before the lookup. Returns null when [encoding] is
/// null or doesn't correspond to a valid encoding.
String codecName(String encoding) {
  if (encoding == null) return null;

  final punctuation = new RegExp(
      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
  final stripped = encoding.replaceAll(punctuation, '');
  final lookupKey = stripped.toLowerCase();
  return encodings[lookupKey];
}
OLD | NEW |