Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(81)

Side by Side Diff: html/lib/src/inputstream.dart

Issue 1400473008: Roll Observatory packages and add a roll script (Closed) Base URL: git@github.com:dart-lang/observatory_pub_packages.git@master
Patch Set: Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « html/lib/src/encoding_parser.dart ('k') | html/lib/src/list_proxy.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 library inputstream;
2
3 import 'dart:collection';
4 import 'package:utf/utf.dart';
5 import 'package:source_span/source_span.dart';
6 import 'char_encodings.dart';
7 import 'constants.dart';
8 import 'utils.dart';
9 import 'encoding_parser.dart';
10
/// Hooks to call into dart:io without directly referencing it.
///
/// This base implementation has no file access: [bytesFromFile] always
/// answers null, which callers treat as "could not read the source".
class ConsoleSupport {
  /// Returns the bytes read from [source], or null if [source] cannot be
  /// read. The implementation in this class never reads anything.
  List<int> bytesFromFile(source) {
    return null;
  }
}

// TODO(jmesserly): use lazy init here when supported.
/// The hook instance consulted when a stream is built from something that is
/// neither a String nor a list of bytes. It is a mutable top-level variable,
/// so it can be reassigned to an instance that supports file access.
ConsoleSupport consoleSupport = new ConsoleSupport();
18
/// Provides a unicode stream of characters to the HtmlTokenizer.
///
/// This class takes care of character encoding and removing or replacing
/// incorrect byte-sequences and also provides column and line tracking.
class HtmlInputStream {
  /// Number of bytes to use when looking for a meta element with
  /// encoding information.
  static const int numBytesMeta = 512;

  /// Encoding to use if no other information can be found.
  static const String defaultEncoding = 'windows-1252';

  /// The name of the character encoding.
  String charEncodingName;

  /// True if we are certain about [charEncodingName], false for tentative.
  bool charEncodingCertain = true;

  /// Flag stored from the constructor; it is not consulted inside this class.
  final bool generateSpans;

  /// Location where the contents of the stream were found.
  final String sourceUrl;

  /// The undecoded input bytes. Null when the stream was created from a
  /// [String].
  List<int> _rawBytes;

  /// Characters waiting to be normalized by [reset]: either obtained from the
  /// Dart String passed to the constructor, or decoded from [_rawBytes].
  Iterable<int> _rawChars;

  /// Error codes collected while normalizing the input; [reset] adds
  /// `'invalid-codepoint'` entries here.
  Queue<String> errors;

  /// Source file built by [reset] from the normalized characters and
  /// [sourceUrl].
  SourceFile fileInfo;

  /// Offsets into [_chars] at which a line begins. The first entry is
  /// always 0.
  List<int> _lineStarts;

  /// The normalized characters served by [char], [peekChar] and [charsUntil].
  List<int> _chars;

  /// Index into [_chars] of the next character to be read.
  int _offset;

  /// Initialises the HtmlInputStream.
  ///
  /// HtmlInputStream(source, [encoding]) -> Normalized stream from source
  /// for use by html5lib.
  ///
  /// [source] can be either a [String] or a [List<int>] containing the raw
  /// bytes, or a file if [consoleSupport] is initialized.
  ///
  /// The optional encoding parameter must be a string that indicates
  /// the encoding. If specified, that encoding will be used,
  /// regardless of any BOM or later declaration (such as in a meta
  /// element). When [source] is a [String] the encoding is always recorded
  /// as 'utf-8', whatever [encoding] says.
  ///
  /// [parseMeta] - Look for a <meta> element containing encoding information
  ///
  /// Throws an [ArgumentError] if [source] is of an unsupported type and
  /// [consoleSupport] cannot turn it into bytes.
  HtmlInputStream(source, [String encoding, bool parseMeta = true,
      this.generateSpans = false, this.sourceUrl])
      : charEncodingName = codecName(encoding) {
    if (source is String) {
      _rawChars = toCodepoints(source);
      charEncodingName = 'utf-8';
      charEncodingCertain = true;
    } else if (source is List<int>) {
      _rawBytes = source;
    } else {
      // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
      // but it's necessary because of how the UTF decoders work.
      _rawBytes = consoleSupport.bytesFromFile(source);

      if (_rawBytes == null) {
        // TODO(jmesserly): we should accept some kind of stream API too.
        // Unfortunately dart:io InputStream is async only, which won't work.
        throw new ArgumentError("'source' must be a String or "
            "List<int> (of bytes). You can also pass a RandomAccessFile if you "
            "`import 'package:html/parser_console.dart'` and call "
            "`useConsole()`.");
      }
    }

    // Detect encoding iff no explicit "transport level" encoding is supplied
    if (charEncodingName == null) {
      detectEncoding(parseMeta);
    }

    reset();
  }

  /// Rewinds the stream to offset 0 and rebuilds the normalized character
  /// list, the line-start table, the [errors] queue and [fileInfo].
  ///
  /// Bytes are decoded with [charEncodingName] unless characters are already
  /// available. During normalization CR and CR LF both become a single LF,
  /// surrogate code points are replaced by U+FFFD, and every code point
  /// rejected by [invalidUnicode] is reported as `'invalid-codepoint'`.
  void reset() {
    errors = new Queue<String>();

    _offset = 0;
    _lineStarts = <int>[0];
    _chars = <int>[];

    if (_rawChars == null) {
      _rawChars = decodeBytes(charEncodingName, _rawBytes);
    }

    // Set after a RETURN so that an immediately following NEWLINE is dropped
    // (CR LF collapses to one NEWLINE).
    bool skipNewline = false;
    for (var c in _rawChars) {
      if (skipNewline) {
        skipNewline = false;
        if (c == NEWLINE) continue;
      }

      if (invalidUnicode(c)) errors.add('invalid-codepoint');

      if (0xD800 <= c && c <= 0xDFFF) {
        c = 0xFFFD;
      } else if (c == RETURN) {
        skipNewline = true;
        c = NEWLINE;
      }

      _chars.add(c);
      if (c == NEWLINE) _lineStarts.add(_chars.length);
    }

    // Free decoded characters if they aren't needed anymore.
    if (_rawBytes != null) _rawChars = null;

    // TODO(sigmund): Don't parse the file at all if spans aren't being
    // generated.
    fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);
  }

  /// Works out [charEncodingName] from the raw bytes.
  ///
  /// The sources are tried in order: a BOM, then (only if [parseMeta] is
  /// true) a meta element, then [defaultEncoding]. [charEncodingCertain] ends
  /// up true only when a BOM decided the encoding. A result of 'iso-8859-1'
  /// (compared case-insensitively) is replaced by 'windows-1252'.
  void detectEncoding([bool parseMeta = true]) {
    // First look for a BOM
    // This will also read past the BOM if present
    charEncodingName = detectBOM();
    charEncodingCertain = true;

    // If there is no BOM need to look for meta elements with encoding
    // information
    if (charEncodingName == null && parseMeta) {
      charEncodingName = detectEncodingMeta();
      charEncodingCertain = false;
    }
    // If all else fails use the default encoding
    if (charEncodingName == null) {
      charEncodingCertain = false;
      charEncodingName = defaultEncoding;
    }

    // Substitute for equivalent encodings:
    if (charEncodingName.toLowerCase() == 'iso-8859-1') {
      charEncodingName = 'windows-1252';
    }
  }

  /// Switches the stream to [newEncoding].
  ///
  /// [newEncoding] is first resolved through [codecName]; the UTF-16 variants
  /// are treated as 'utf-8', and a name that does not resolve is ignored.
  /// If the resolved encoding equals the current one it is simply marked as
  /// certain. Otherwise the bytes are re-decoded via [reset] and a
  /// [ReparseException] is thrown, naming both the previous and the new
  /// encoding.
  ///
  /// Throws a [StateError] if the stream was created from a [String].
  void changeEncoding(String newEncoding) {
    if (_rawBytes == null) {
      // We should never get here -- if encoding is certain we won't try to
      // change it.
      throw new StateError('cannot change encoding when parsing a String.');
    }

    newEncoding = codecName(newEncoding);
    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
      newEncoding = 'utf-8';
    }
    if (newEncoding == null) {
      return;
    } else if (newEncoding == charEncodingName) {
      charEncodingCertain = true;
    } else {
      // Remember the old name before overwriting it; otherwise the exception
      // message would show the new encoding twice.
      final previousEncoding = charEncodingName;
      charEncodingName = newEncoding;
      charEncodingCertain = true;
      _rawChars = null;
      reset();
      throw new ReparseException(
          'Encoding changed from $previousEncoding to $newEncoding');
    }
  }

  /// Attempts to detect a BOM at the start of the stream. If
  /// an encoding can be determined from the BOM return the name of the
  /// encoding otherwise return null.
  String detectBOM() {
    // Try detecting the BOM using bytes from the string
    if (hasUtf8Bom(_rawBytes)) {
      return 'utf-8';
    }
    // Note: we don't need to remember whether it was big or little endian
    // because the decoder will do that later. It will also eat the BOM for us.
    if (hasUtf16Bom(_rawBytes)) {
      return 'utf-16';
    }
    if (hasUtf32Bom(_rawBytes)) {
      return 'utf-32';
    }
    return null;
  }

  /// Report the encoding declared by the meta element.
  ///
  /// Only the first [numBytesMeta] bytes are handed to the [EncodingParser].
  /// A declared UTF-16 variant is reported as 'utf-8'.
  String detectEncodingMeta() {
    var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));
    var encoding = parser.getEncoding();

    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
      encoding = 'utf-8';
    }

    return encoding;
  }

  /// Returns the current offset in the stream, i.e. the number of codepoints
  /// since the start of the file.
  int get position => _offset;

  /// Read one character from the stream or queue if available. Return
  /// EOF when EOF is reached.
  String char() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCodes([_chars[_offset++]]);
  }

  /// Returns the next character without consuming it, or EOF when the end of
  /// the stream has been reached.
  String peekChar() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCodes([_chars[_offset]]);
  }

  /// Returns a string of characters from the stream up to but not
  /// including any character in 'characters' or EOF.
  ///
  /// With [opposite] set to true the test is inverted: characters are
  /// consumed for as long as they do appear in [characters].
  String charsUntil(String characters, [bool opposite = false]) {
    int start = _offset;
    // Stop at the end of input with an explicit bounds check, so the loop
    // does not depend on the value EOF is defined as.
    while (_offset < _chars.length &&
        characters.contains(new String.fromCharCodes([_chars[_offset]])) ==
            opposite) {
      _offset++;
    }

    return new String.fromCharCodes(_chars.sublist(start, _offset));
  }

  /// Pushes [ch] back onto the stream by stepping the offset back by one.
  /// A null [ch] is ignored.
  void unget(String ch) {
    // Only one character is allowed to be ungotten at once - it must
    // be consumed again before any further call to unget
    if (ch != null) {
      _offset--;
      assert(peekChar() == ch);
    }
  }
}
259
// TODO(jmesserly): the Python code used a regex to check for this. But
// Dart doesn't let you create a regexp with invalid characters.
/// Reports whether [c] is one of the code points this library flags as
/// invalid input.
///
/// The flagged set is: U+0001..U+0008, U+000B, U+000E..U+001F,
/// U+007F..U+009F, the surrogates U+D800..U+DFFF, U+FDD0..U+FDEF, and the
/// final two code points (xxFFFE and xxFFFF) of every plane from 0x00 to 0x10.
bool invalidUnicode(int c) {
  // xxFFFE / xxFFFF: the low 16 bits, ignoring the lowest bit, equal 0xFFFE.
  // The range guard keeps negative and beyond-Unicode values out.
  final bool endsAPlane =
      0xFFFE <= c && c <= 0x10FFFF && (c & 0xFFFE) == 0xFFFE;

  return endsAPlane ||
      c == 0x000B ||
      (0x0001 <= c && c <= 0x0008) ||
      (0x000E <= c && c <= 0x001F) ||
      (0x007F <= c && c <= 0x009F) ||
      (0xD800 <= c && c <= 0xDFFF) ||
      (0xFDD0 <= c && c <= 0xFDEF);
}
308
/// Return the python codec name corresponding to an encoding or null if the
/// string doesn't correspond to a valid encoding.
///
/// A null [encoding] yields null. Otherwise the name is stripped of ASCII
/// whitespace and punctuation, lower-cased, and looked up in `encodings`.
String codecName(String encoding) {
  if (encoding == null) return null;

  // Matches tab..CR plus every printable ASCII character that is neither a
  // letter nor a digit.
  final punctuation = new RegExp(
      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
  final lookupKey = encoding.replaceAll(punctuation, '').toLowerCase();
  return encodings[lookupKey];
}
OLDNEW
« no previous file with comments | « html/lib/src/encoding_parser.dart ('k') | html/lib/src/list_proxy.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698