OLD | NEW |
1 library inputstream; | 1 library inputstream; |
2 | 2 |
3 import 'dart:collection'; | 3 import 'dart:collection'; |
4 import 'package:utf/utf.dart'; | 4 import 'package:utf/utf.dart'; |
5 import 'package:source_maps/span.dart' show SourceFile; | 5 import 'package:source_maps/span.dart' show SourceFile; |
6 import 'char_encodings.dart'; | 6 import 'char_encodings.dart'; |
7 import 'constants.dart'; | 7 import 'constants.dart'; |
8 import 'utils.dart'; | 8 import 'utils.dart'; |
9 import 'encoding_parser.dart'; | 9 import 'encoding_parser.dart'; |
10 | 10 |
/// Hooks to call into dart:io without directly referencing it.
///
/// This base implementation has no file access: [bytesFromFile] returns null
/// for every input.
class ConsoleSupport {
  /// Returns the raw bytes of the file identified by [source].
  ///
  /// The default implementation always returns null. Presumably a
  /// dart:io-aware subclass overrides this -- TODO confirm against the code
  /// that assigns [consoleSupport].
  List<int> bytesFromFile(source) {
    return null;
  }
}

// TODO(jmesserly): use lazy init here when supported.
/// The hook instance in use; starts out as the base [ConsoleSupport] and is
/// not final, so it can be reassigned.
ConsoleSupport consoleSupport = new ConsoleSupport();
18 | 18 |
19 /** | 19 /// Provides a unicode stream of characters to the HtmlTokenizer. |
20 * Provides a unicode stream of characters to the HtmlTokenizer. | 20 /// |
21 * | 21 /// This class takes care of character encoding and removing or replacing |
22 * This class takes care of character encoding and removing or replacing | 22 /// incorrect byte-sequences and also provides column and line tracking. |
23 * incorrect byte-sequences and also provides column and line tracking. | |
24 */ | |
25 class HtmlInputStream { | 23 class HtmlInputStream { |
26 /** | 24 /// Number of bytes to use when looking for a meta element with |
27 * Number of bytes to use when looking for a meta element with | 25 /// encoding information. |
28 * encoding information. | |
29 */ | |
30 static const int numBytesMeta = 512; | 26 static const int numBytesMeta = 512; |
31 | 27 |
32 /** Encoding to use if no other information can be found. */ | 28 /// Encoding to use if no other information can be found. |
33 static const String defaultEncoding = 'windows-1252'; | 29 static const String defaultEncoding = 'windows-1252'; |
34 | 30 |
35 /** The name of the character encoding. */ | 31 /// The name of the character encoding. |
36 String charEncodingName; | 32 String charEncodingName; |
37 | 33 |
/// True if we are certain about [charEncodingName], false if it is only
/// tentative.
39 bool charEncodingCertain = true; | 35 bool charEncodingCertain = true; |
40 | 36 |
41 final bool generateSpans; | 37 final bool generateSpans; |
42 | 38 |
43 /** Location where the contents of the stream were found. */ | 39 /// Location where the contents of the stream were found. |
44 final String sourceUrl; | 40 final String sourceUrl; |
45 | 41 |
46 List<int> _rawBytes; | 42 List<int> _rawBytes; |
47 | 43 |
48 /** Raw UTF-16 codes, used if a Dart String is passed in. */ | 44 /// Raw UTF-16 codes, used if a Dart String is passed in. |
49 Iterable<int> _rawChars; | 45 Iterable<int> _rawChars; |
50 | 46 |
51 Queue<String> errors; | 47 Queue<String> errors; |
52 | 48 |
53 SourceFile fileInfo; | 49 SourceFile fileInfo; |
54 | 50 |
55 List<int> _lineStarts; | 51 List<int> _lineStarts; |
56 | 52 |
57 List<int> _chars; | 53 List<int> _chars; |
58 | 54 |
59 int _offset; | 55 int _offset; |
60 | 56 |
61 /** | 57 /// Initialises the HtmlInputStream. |
62 * Initialises the HtmlInputStream. | 58 /// |
63 * | 59 /// HtmlInputStream(source, [encoding]) -> Normalized stream from source |
64 * HtmlInputStream(source, [encoding]) -> Normalized stream from source | 60 /// for use by html5lib. |
65 * for use by html5lib. | 61 /// |
66 * | 62 /// [source] can be either a [String] or a [List<int>] containing the raw |
67 * [source] can be either a [String] or a [List<int>] containing the raw | 63 /// bytes, or a file if [consoleSupport] is initialized. |
68 * bytes, or a file if [consoleSupport] is initialized. | 64 /// |
69 * | 65 /// The optional encoding parameter must be a string that indicates |
70 * The optional encoding parameter must be a string that indicates | 66 /// the encoding. If specified, that encoding will be used, |
71 * the encoding. If specified, that encoding will be used, | 67 /// regardless of any BOM or later declaration (such as in a meta |
72 * regardless of any BOM or later declaration (such as in a meta | 68 /// element) |
73 * element) | 69 /// |
74 * | 70 /// [parseMeta] - Look for a <meta> element containing encoding information |
75 * [parseMeta] - Look for a <meta> element containing encoding information | |
76 */ | |
77 HtmlInputStream(source, [String encoding, bool parseMeta = true, | 71 HtmlInputStream(source, [String encoding, bool parseMeta = true, |
78 this.generateSpans = false, this.sourceUrl]) | 72 this.generateSpans = false, this.sourceUrl]) |
79 : charEncodingName = codecName(encoding) { | 73 : charEncodingName = codecName(encoding) { |
80 | 74 |
81 if (source is String) { | 75 if (source is String) { |
82 _rawChars = toCodepoints(source); | 76 _rawChars = toCodepoints(source); |
83 charEncodingName = 'utf-8'; | 77 charEncodingName = 'utf-8'; |
84 charEncodingCertain = true; | 78 charEncodingCertain = true; |
85 } else if (source is List<int>) { | 79 } else if (source is List<int>) { |
86 _rawBytes = source; | 80 _rawBytes = source; |
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
188 } else { | 182 } else { |
189 charEncodingName = newEncoding; | 183 charEncodingName = newEncoding; |
190 charEncodingCertain = true; | 184 charEncodingCertain = true; |
191 _rawChars = null; | 185 _rawChars = null; |
192 reset(); | 186 reset(); |
193 throw new ReparseException( | 187 throw new ReparseException( |
194 'Encoding changed from $charEncodingName to $newEncoding'); | 188 'Encoding changed from $charEncodingName to $newEncoding'); |
195 } | 189 } |
196 } | 190 } |
197 | 191 |
/// Attempts to detect a BOM at the start of the stream.
///
/// Returns `'utf-8'`, `'utf-16'` or `'utf-32'` when the corresponding BOM
/// check on the raw bytes succeeds, and null when none of them does.
String detectBOM() {
  // The checks run in a fixed order: UTF-8, then UTF-16, then UTF-32.
  //
  // Note: we don't need to remember whether it was big or little endian
  // because the decoder will do that later. It will also eat the BOM for us.
  return hasUtf8Bom(_rawBytes)
      ? 'utf-8'
      : hasUtf16Bom(_rawBytes)
          ? 'utf-16'
          : hasUtf32Bom(_rawBytes) ? 'utf-32' : null;
}
218 | 210 |
/// Reports the encoding declared by the meta element.
///
/// Only the first [numBytesMeta] raw bytes are handed to the
/// [EncodingParser]. Any UTF-16 variant it reports is replaced by `'utf-8'`.
String detectEncodingMeta() {
  final head = slice(_rawBytes, 0, numBytesMeta);
  final declared = new EncodingParser(head).getEncoding();

  // NOTE(review): presumably a meta tag that could be found by scanning bytes
  // cannot really be UTF-16 encoded, hence the substitution -- confirm
  // against the HTML5 encoding sniffing algorithm.
  return const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(declared)
      ? 'utf-8'
      : declared;
}
230 | 222 |
/// The current offset in the stream, i.e. the number of codepoints consumed
/// since the start of the file.
int get position {
  return _offset;
}
236 | 226 |
/// Reads one character from the stream and advances past it.
///
/// Returns [EOF] without moving once the offset has reached the end of the
/// decoded characters.
String char() {
  if (_offset < _chars.length) {
    final code = _chars[_offset++];
    return new String.fromCharCodes([code]);
  }
  return EOF;
}
245 | 233 |
/// Returns the character at the current offset without consuming it, or
/// [EOF] when the offset has reached the end of the decoded characters.
String peekChar() {
  if (_offset < _chars.length) {
    final code = _chars[_offset];
    return new String.fromCharCodes([code]);
  }
  return EOF;
}
250 | 238 |
/// Returns a string of characters from the stream up to but not including
/// any character in [characters], or up to EOF.
///
/// When [opposite] is true the test is inverted: characters are consumed for
/// as long as they DO appear in [characters]. The stream offset is left on
/// the first character that was not consumed.
String charsUntil(String characters, [bool opposite = false]) {
  final begin = _offset;

  // NOTE(review): the end-of-input test compares against null, so it relies
  // on [peekChar]'s EOF value being null -- confirm against constants.dart.
  for (var next = peekChar();
      next != null && characters.contains(next) == opposite;
      next = peekChar()) {
    _offset++;
  }

  return new String.fromCharCodes(_chars.sublist(begin, _offset));
}
264 | 250 |
(...skipping 24 matching lines...) Expand all Loading... |
289 case 0x08FFFE: case 0x08FFFF: case 0x09FFFE: case 0x09FFFF: | 275 case 0x08FFFE: case 0x08FFFF: case 0x09FFFE: case 0x09FFFF: |
290 case 0x0AFFFE: case 0x0AFFFF: case 0x0BFFFE: case 0x0BFFFF: | 276 case 0x0AFFFE: case 0x0AFFFF: case 0x0BFFFE: case 0x0BFFFF: |
291 case 0x0CFFFE: case 0x0CFFFF: case 0x0DFFFE: case 0x0DFFFF: | 277 case 0x0CFFFE: case 0x0CFFFF: case 0x0DFFFE: case 0x0DFFFF: |
292 case 0x0EFFFE: case 0x0EFFFF: case 0x0FFFFE: case 0x0FFFFF: | 278 case 0x0EFFFE: case 0x0EFFFF: case 0x0FFFFE: case 0x0FFFFF: |
293 case 0x10FFFE: case 0x10FFFF: | 279 case 0x10FFFE: case 0x10FFFF: |
294 return true; | 280 return true; |
295 } | 281 } |
296 return false; | 282 return false; |
297 } | 283 } |
298 | 284 |
/// ASCII whitespace and punctuation that [codecName] strips from an encoding
/// label before looking it up.
///
/// Kept at library level so the pattern is built once instead of on every
/// [codecName] call.
final RegExp _asciiPunctuation = new RegExp(
    "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");

/// Returns the canonical codec name corresponding to [encoding].
///
/// The label is normalized by removing ASCII whitespace/punctuation and
/// lower-casing it, and the result is looked up in [encodings]. Returns null
/// when [encoding] is null; otherwise returns whatever the [encodings] lookup
/// yields (presumably null for an unknown label -- this assumes [encodings]
/// is a Map).
String codecName(String encoding) {
  // Bail out before touching the pattern: there is nothing to normalize.
  if (encoding == null) return null;

  var canonicalName =
      encoding.replaceAll(_asciiPunctuation, '').toLowerCase();
  return encodings[canonicalName];
}
OLD | NEW |