Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(495)

Side by Side Diff: pkg/third_party/html5lib/lib/src/inputstream.dart

Issue 178843003: [html5lib] triple slash comment style (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: remove extra check Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 library inputstream; 1 library inputstream;
2 2
3 import 'dart:collection'; 3 import 'dart:collection';
4 import 'package:utf/utf.dart'; 4 import 'package:utf/utf.dart';
5 import 'package:source_maps/span.dart' show SourceFile; 5 import 'package:source_maps/span.dart' show SourceFile;
6 import 'char_encodings.dart'; 6 import 'char_encodings.dart';
7 import 'constants.dart'; 7 import 'constants.dart';
8 import 'utils.dart'; 8 import 'utils.dart';
9 import 'encoding_parser.dart'; 9 import 'encoding_parser.dart';
10 10
11 /** Hooks to call into dart:io without directly referencing it. */ 11 /// Hooks to call into dart:io without directly referencing it.
12 class ConsoleSupport { 12 class ConsoleSupport {
13 List<int> bytesFromFile(source) => null; 13 List<int> bytesFromFile(source) => null;
14 } 14 }
15 15
16 // TODO(jmesserly): use lazy init here when supported. 16 // TODO(jmesserly): use lazy init here when supported.
17 ConsoleSupport consoleSupport = new ConsoleSupport(); 17 ConsoleSupport consoleSupport = new ConsoleSupport();
18 18
19 /** 19 /// Provides a unicode stream of characters to the HtmlTokenizer.
20 * Provides a unicode stream of characters to the HtmlTokenizer. 20 ///
21 * 21 /// This class takes care of character encoding and removing or replacing
22 * This class takes care of character encoding and removing or replacing 22 /// incorrect byte-sequences and also provides column and line tracking.
23 * incorrect byte-sequences and also provides column and line tracking.
24 */
25 class HtmlInputStream { 23 class HtmlInputStream {
26 /** 24 /// Number of bytes to use when looking for a meta element with
27 * Number of bytes to use when looking for a meta element with 25 /// encoding information.
28 * encoding information.
29 */
30 static const int numBytesMeta = 512; 26 static const int numBytesMeta = 512;
31 27
32 /** Encoding to use if no other information can be found. */ 28 /// Encoding to use if no other information can be found.
33 static const String defaultEncoding = 'windows-1252'; 29 static const String defaultEncoding = 'windows-1252';
34 30
35 /** The name of the character encoding. */ 31 /// The name of the character encoding.
36 String charEncodingName; 32 String charEncodingName;
37 33
38 /** True if we are certain about [charEncodingName], false for tentative. */ 34 /// True if we are certain about [charEncodingName], false for tentative.
39 bool charEncodingCertain = true; 35 bool charEncodingCertain = true;
40 36
41 final bool generateSpans; 37 final bool generateSpans;
42 38
43 /** Location where the contents of the stream were found. */ 39 /// Location where the contents of the stream were found.
44 final String sourceUrl; 40 final String sourceUrl;
45 41
46 List<int> _rawBytes; 42 List<int> _rawBytes;
47 43
48 /** Raw UTF-16 codes, used if a Dart String is passed in. */ 44 /// Raw UTF-16 codes, used if a Dart String is passed in.
49 Iterable<int> _rawChars; 45 Iterable<int> _rawChars;
50 46
51 Queue<String> errors; 47 Queue<String> errors;
52 48
53 SourceFile fileInfo; 49 SourceFile fileInfo;
54 50
55 List<int> _lineStarts; 51 List<int> _lineStarts;
56 52
57 List<int> _chars; 53 List<int> _chars;
58 54
59 int _offset; 55 int _offset;
60 56
61 /** 57 /// Initialises the HtmlInputStream.
62 * Initialises the HtmlInputStream. 58 ///
63 * 59 /// HtmlInputStream(source, [encoding]) -> Normalized stream from source
64 * HtmlInputStream(source, [encoding]) -> Normalized stream from source 60 /// for use by html5lib.
65 * for use by html5lib. 61 ///
66 * 62 /// [source] can be either a [String] or a [List<int>] containing the raw
67 * [source] can be either a [String] or a [List<int>] containing the raw 63 /// bytes, or a file if [consoleSupport] is initialized.
68 * bytes, or a file if [consoleSupport] is initialized. 64 ///
69 * 65 /// The optional encoding parameter must be a string that indicates
70 * The optional encoding parameter must be a string that indicates 66 /// the encoding. If specified, that encoding will be used,
71 * the encoding. If specified, that encoding will be used, 67 /// regardless of any BOM or later declaration (such as in a meta
72 * regardless of any BOM or later declaration (such as in a meta 68 /// element)
73 * element) 69 ///
74 * 70 /// [parseMeta] - Look for a <meta> element containing encoding information
75 * [parseMeta] - Look for a <meta> element containing encoding information
76 */
77 HtmlInputStream(source, [String encoding, bool parseMeta = true, 71 HtmlInputStream(source, [String encoding, bool parseMeta = true,
78 this.generateSpans = false, this.sourceUrl]) 72 this.generateSpans = false, this.sourceUrl])
79 : charEncodingName = codecName(encoding) { 73 : charEncodingName = codecName(encoding) {
80 74
81 if (source is String) { 75 if (source is String) {
82 _rawChars = toCodepoints(source); 76 _rawChars = toCodepoints(source);
83 charEncodingName = 'utf-8'; 77 charEncodingName = 'utf-8';
84 charEncodingCertain = true; 78 charEncodingCertain = true;
85 } else if (source is List<int>) { 79 } else if (source is List<int>) {
86 _rawBytes = source; 80 _rawBytes = source;
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after
188 } else { 182 } else {
189 charEncodingName = newEncoding; 183 charEncodingName = newEncoding;
190 charEncodingCertain = true; 184 charEncodingCertain = true;
191 _rawChars = null; 185 _rawChars = null;
192 reset(); 186 reset();
193 throw new ReparseException( 187 throw new ReparseException(
194 'Encoding changed from $charEncodingName to $newEncoding'); 188 'Encoding changed from $charEncodingName to $newEncoding');
195 } 189 }
196 } 190 }
197 191
198 /** 192 /// Attempts to detect a BOM at the start of the stream. If
199 * Attempts to detect a BOM at the start of the stream. If 193 /// an encoding can be determined from the BOM return the name of the
200 * an encoding can be determined from the BOM return the name of the 194 /// encoding otherwise return null.
201 * encoding otherwise return null.
202 */
203 String detectBOM() { 195 String detectBOM() {
204 // Try detecting the BOM using bytes from the string 196 // Try detecting the BOM using bytes from the string
205 if (hasUtf8Bom(_rawBytes)) { 197 if (hasUtf8Bom(_rawBytes)) {
206 return 'utf-8'; 198 return 'utf-8';
207 } 199 }
208 // Note: we don't need to remember whether it was big or little endian 200 // Note: we don't need to remember whether it was big or little endian
209 // because the decoder will do that later. It will also eat the BOM for us. 201 // because the decoder will do that later. It will also eat the BOM for us.
210 if (hasUtf16Bom(_rawBytes)) { 202 if (hasUtf16Bom(_rawBytes)) {
211 return 'utf-16'; 203 return 'utf-16';
212 } 204 }
213 if (hasUtf32Bom(_rawBytes)) { 205 if (hasUtf32Bom(_rawBytes)) {
214 return 'utf-32'; 206 return 'utf-32';
215 } 207 }
216 return null; 208 return null;
217 } 209 }
218 210
219 /** Report the encoding declared by the meta element. */ 211 /// Report the encoding declared by the meta element.
220 String detectEncodingMeta() { 212 String detectEncodingMeta() {
221 var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta)); 213 var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));
222 var encoding = parser.getEncoding(); 214 var encoding = parser.getEncoding();
223 215
224 if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) { 216 if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
225 encoding = 'utf-8'; 217 encoding = 'utf-8';
226 } 218 }
227 219
228 return encoding; 220 return encoding;
229 } 221 }
230 222
231 /** 223 /// Returns the current offset in the stream, i.e. the number of codepoints
232 * Returns the current offset in the stream, i.e. the number of codepoints 224 /// since the start of the file.
233 * since the start of the file.
234 */
235 int get position => _offset; 225 int get position => _offset;
236 226
237 /** 227 /// Read one character from the stream or queue if available. Return
238 * Read one character from the stream or queue if available. Return 228 /// EOF when EOF is reached.
239 * EOF when EOF is reached.
240 */
241 String char() { 229 String char() {
242 if (_offset >= _chars.length) return EOF; 230 if (_offset >= _chars.length) return EOF;
243 return new String.fromCharCodes([_chars[_offset++]]); 231 return new String.fromCharCodes([_chars[_offset++]]);
244 } 232 }
245 233
246 String peekChar() { 234 String peekChar() {
247 if (_offset >= _chars.length) return EOF; 235 if (_offset >= _chars.length) return EOF;
248 return new String.fromCharCodes([_chars[_offset]]); 236 return new String.fromCharCodes([_chars[_offset]]);
249 } 237 }
250 238
251 /** 239 /// Returns a string of characters from the stream up to but not
252 * Returns a string of characters from the stream up to but not 240 /// including any character in 'characters' or EOF.
253 * including any character in 'characters' or EOF.
254 */
255 String charsUntil(String characters, [bool opposite = false]) { 241 String charsUntil(String characters, [bool opposite = false]) {
256 int start = _offset; 242 int start = _offset;
257 String c; 243 String c;
258 while ((c = peekChar()) != null && characters.contains(c) == opposite) { 244 while ((c = peekChar()) != null && characters.contains(c) == opposite) {
259 _offset++; 245 _offset++;
260 } 246 }
261 247
262 return new String.fromCharCodes(_chars.sublist(start, _offset)); 248 return new String.fromCharCodes(_chars.sublist(start, _offset));
263 } 249 }
264 250
(...skipping 24 matching lines...) Expand all
289 case 0x08FFFE: case 0x08FFFF: case 0x09FFFE: case 0x09FFFF: 275 case 0x08FFFE: case 0x08FFFF: case 0x09FFFE: case 0x09FFFF:
290 case 0x0AFFFE: case 0x0AFFFF: case 0x0BFFFE: case 0x0BFFFF: 276 case 0x0AFFFE: case 0x0AFFFF: case 0x0BFFFE: case 0x0BFFFF:
291 case 0x0CFFFE: case 0x0CFFFF: case 0x0DFFFE: case 0x0DFFFF: 277 case 0x0CFFFE: case 0x0CFFFF: case 0x0DFFFE: case 0x0DFFFF:
292 case 0x0EFFFE: case 0x0EFFFF: case 0x0FFFFE: case 0x0FFFFF: 278 case 0x0EFFFE: case 0x0EFFFF: case 0x0FFFFE: case 0x0FFFFF:
293 case 0x10FFFE: case 0x10FFFF: 279 case 0x10FFFE: case 0x10FFFF:
294 return true; 280 return true;
295 } 281 }
296 return false; 282 return false;
297 } 283 }
298 284
299 /** 285 /// Return the Python codec name corresponding to an encoding or null if the
300 * Return the Python codec name corresponding to an encoding or null if the 286 /// string doesn't correspond to a valid encoding.
301 * string doesn't correspond to a valid encoding.
302 */
303 String codecName(String encoding) { 287 String codecName(String encoding) {
304 final asciiPunctuation = new RegExp( 288 final asciiPunctuation = new RegExp(
305 "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]"); 289 "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
306 290
307 if (encoding == null) return null; 291 if (encoding == null) return null;
308 var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase(); 292 var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase();
309 return encodings[canonicalName]; 293 return encodings[canonicalName];
310 } 294 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698