html/lib/src/inputstream.dart - Issue 1400473008: Roll Observatory packages and add a roll script

Side by Side Diff: html/lib/src/inputstream.dart

Issue 1400473008: Roll Observatory packages and add a roll script (Closed) Base URL: git@github.com:dart-lang/observatory_pub_packages.git@master

Patch Set: Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 library inputstream;

2

3 import 'dart:collection';

4 import 'package:utf/utf.dart';

5 import 'package:source_span/source_span.dart';

6 import 'char_encodings.dart';

7 import 'constants.dart';

8 import 'utils.dart';

9 import 'encoding_parser.dart';

10

/// Hooks to call into dart:io without directly referencing it.
///
/// This base implementation provides no file access: [bytesFromFile]
/// always answers `null`.
class ConsoleSupport {
  /// Returns the bytes read from [source].
  ///
  /// The default implementation ignores [source] and returns `null`.
  List<int> bytesFromFile(source) {
    return null;
  }
}

/// The [ConsoleSupport] instance consulted by `HtmlInputStream` when it is
/// given a source that is neither a [String] nor a `List<int>`.
// TODO(jmesserly): use lazy init here when supported.
ConsoleSupport consoleSupport = new ConsoleSupport();

18

/// Provides a unicode stream of characters to the HtmlTokenizer.
///
/// This class takes care of character encoding and removing or replacing
/// incorrect byte-sequences and also provides column and line tracking.
class HtmlInputStream {
  /// Number of bytes to use when looking for a meta element with
  /// encoding information.
  static const int numBytesMeta = 512;

  /// Encoding to use if no other information can be found.
  static const String defaultEncoding = 'windows-1252';

  /// The name of the character encoding.
  String charEncodingName;

  /// True if we are certain about [charEncodingName], false for tentative.
  bool charEncodingCertain = true;

  /// Constructor-supplied flag (defaults to `false`).
  ///
  /// It is not consulted inside this class; see the TODO in [reset].
  final bool generateSpans;

  /// Location where the contents of the stream were found.
  final String sourceUrl;

  /// The raw input bytes, or `null` when the stream was built from a [String].
  List<int> _rawBytes;

  /// Characters waiting to be normalized by [reset].
  ///
  /// Set directly from the input when a Dart [String] is passed in. For byte
  /// input it is filled by decoding [_rawBytes] inside [reset] and released
  /// again once normalization is done.
  Iterable<int> _rawChars;

  /// Error codes recorded while normalizing the input in [reset].
  Queue<String> errors;

  /// Source file built by [reset] from the normalized characters and
  /// [sourceUrl].
  SourceFile fileInfo;

  /// Offsets into [_chars] at which each line starts; always begins with 0.
  List<int> _lineStarts;

  /// The normalized characters served by [char], [peekChar] and
  /// [charsUntil].
  List<int> _chars;

  /// Index into [_chars] of the next character to be read.
  int _offset;

  /// Initialises the HtmlInputStream.
  ///
  /// HtmlInputStream(source, [encoding]) -> Normalized stream from source
  /// for use by html5lib.
  ///
  /// [source] can be either a [String] or a [List<int>] containing the raw
  /// bytes, or a file if [consoleSupport] is initialized.
  ///
  /// The optional [encoding] parameter must be a string that indicates
  /// the encoding. If specified, that encoding will be used,
  /// regardless of any BOM or later declaration (such as in a meta
  /// element). When [source] is a [String] the encoding is always recorded
  /// as `'utf-8'` and marked certain, whatever was passed.
  ///
  /// [parseMeta] - Look for a <meta> element containing encoding information
  ///
  /// Throws an [ArgumentError] if [source] is none of the supported kinds,
  /// i.e. [consoleSupport] could not produce bytes for it.
  HtmlInputStream(source, [String encoding, bool parseMeta = true,
      this.generateSpans = false, this.sourceUrl])
      : charEncodingName = codecName(encoding) {
    if (source is String) {
      _rawChars = toCodepoints(source);
      charEncodingName = 'utf-8';
      charEncodingCertain = true;
    } else if (source is List<int>) {
      _rawBytes = source;
    } else {
      // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
      // but it's necessary because of how the UTF decoders work.
      _rawBytes = consoleSupport.bytesFromFile(source);

      if (_rawBytes == null) {
        // TODO(jmesserly): we should accept some kind of stream API too.
        // Unfortunately dart:io InputStream is async only, which won't work.
        //
        // Adjacent literals are concatenated verbatim, so each fragment must
        // carry its own trailing space.
        throw new ArgumentError("'source' must be a String or "
            "List<int> (of bytes). You can also pass a RandomAccessFile if you "
            "`import 'package:html/parser_console.dart'` and call "
            "`useConsole()`.");
      }
    }

    // Detect encoding iff no explicit "transport level" encoding is supplied
    if (charEncodingName == null) {
      detectEncoding(parseMeta);
    }

    reset();
  }

  /// Rewinds the stream and rebuilds the normalized character buffer.
  ///
  /// Clears [errors], resets the read position to 0, decodes [_rawBytes]
  /// with [charEncodingName] when no characters are pending, and then
  /// normalizes every character:
  ///
  /// * characters flagged by [invalidUnicode] add `'invalid-codepoint'` to
  ///   [errors];
  /// * surrogates (U+D800..U+DFFF) are replaced by U+FFFD;
  /// * `RETURN` and `RETURN NEWLINE` both become a single `NEWLINE`.
  ///
  /// Finally [fileInfo] is rebuilt from the normalized characters.
  void reset() {
    errors = new Queue<String>();

    _offset = 0;
    _lineStarts = <int>[0];
    _chars = <int>[];

    if (_rawChars == null) {
      _rawChars = decodeBytes(charEncodingName, _rawBytes);
    }

    // Set right after a RETURN so that an immediately following NEWLINE is
    // dropped (the RETURN has already been emitted as a NEWLINE).
    bool skipNewline = false;
    for (var c in _rawChars) {
      if (skipNewline) {
        skipNewline = false;
        if (c == NEWLINE) continue;
      }

      if (invalidUnicode(c)) errors.add('invalid-codepoint');

      if (0xD800 <= c && c <= 0xDFFF) {
        c = 0xFFFD;
      } else if (c == RETURN) {
        skipNewline = true;
        c = NEWLINE;
      }

      _chars.add(c);
      // The next line starts right after the newline just stored.
      if (c == NEWLINE) _lineStarts.add(_chars.length);
    }

    // Free decoded characters if they aren't needed anymore: with raw bytes
    // at hand they can always be decoded again.
    if (_rawBytes != null) _rawChars = null;

    // TODO(sigmund): Don't parse the file at all if spans aren't being
    // generated.
    fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);
  }

  /// Determines [charEncodingName] and [charEncodingCertain] from the raw
  /// bytes.
  ///
  /// A BOM wins and is considered certain. Otherwise, when [parseMeta] is
  /// true, a `<meta>` declaration is used, and failing that
  /// [defaultEncoding]; both of those results are tentative. An
  /// `iso-8859-1` result is replaced by `windows-1252`.
  void detectEncoding([bool parseMeta = true]) {
    // First look for a BOM.
    charEncodingName = detectBOM();
    charEncodingCertain = true;

    // If there is no BOM need to look for meta elements with encoding
    // information
    if (charEncodingName == null && parseMeta) {
      charEncodingName = detectEncodingMeta();
      charEncodingCertain = false;
    }
    // If all else fails use the default encoding
    if (charEncodingName == null) {
      charEncodingCertain = false;
      charEncodingName = defaultEncoding;
    }

    // Substitute for equivalent encodings:
    if (charEncodingName.toLowerCase() == 'iso-8859-1') {
      charEncodingName = 'windows-1252';
    }
  }

  /// Switches the stream to [newEncoding] after it has been created.
  ///
  /// [newEncoding] is canonicalized with [codecName]; any UTF-16 variant is
  /// treated as `'utf-8'`. An unrecognized name is ignored. If the result
  /// equals the current encoding, the encoding is simply marked certain.
  /// Otherwise the bytes are re-decoded via [reset] and a [ReparseException]
  /// is thrown.
  ///
  /// Throws a [StateError] if the stream was created from a [String].
  void changeEncoding(String newEncoding) {
    if (_rawBytes == null) {
      // We should never get here -- if encoding is certain we won't try to
      // change it.
      throw new StateError('cannot change encoding when parsing a String.');
    }

    newEncoding = codecName(newEncoding);
    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
      newEncoding = 'utf-8';
    }
    if (newEncoding == null) {
      return;
    } else if (newEncoding == charEncodingName) {
      charEncodingCertain = true;
    } else {
      // Remember the outgoing name before it is overwritten so the exception
      // message reports both the old and the new encoding.
      var previousEncoding = charEncodingName;
      charEncodingName = newEncoding;
      charEncodingCertain = true;
      _rawChars = null;
      reset();
      throw new ReparseException(
          'Encoding changed from $previousEncoding to $newEncoding');
    }
  }

  /// Attempts to detect a BOM at the start of the stream. If
  /// an encoding can be determined from the BOM return the name of the
  /// encoding otherwise return null.
  String detectBOM() {
    // Try detecting the BOM using bytes from the string
    if (hasUtf8Bom(_rawBytes)) {
      return 'utf-8';
    }
    // Note: we don't need to remember whether it was big or little endian
    // because the decoder will do that later. It will also eat the BOM for us.
    if (hasUtf16Bom(_rawBytes)) {
      return 'utf-16';
    }
    if (hasUtf32Bom(_rawBytes)) {
      return 'utf-32';
    }
    return null;
  }

  /// Report the encoding declared by the meta element.
  ///
  /// Only the first [numBytesMeta] bytes are handed to the [EncodingParser].
  /// A declared UTF-16 variant is reported as `'utf-8'`.
  String detectEncodingMeta() {
    var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));
    var encoding = parser.getEncoding();

    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
      encoding = 'utf-8';
    }

    return encoding;
  }

  /// Returns the current offset in the stream, i.e. the number of codepoints
  /// since the start of the file.
  int get position => _offset;

  /// Read one character from the stream or queue if available. Return
  /// EOF when EOF is reached.
  String char() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCode(_chars[_offset++]);
  }

  /// Returns the next character without consuming it, or EOF when the end
  /// of the stream has been reached.
  String peekChar() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCode(_chars[_offset]);
  }

  /// Returns a string of characters from the stream up to but not
  /// including any character in [characters] or EOF.
  ///
  /// When [opposite] is true the test is inverted: characters are consumed
  /// for as long as they DO appear in [characters].
  String charsUntil(String characters, [bool opposite = false]) {
    int start = _offset;
    String c;
    // NOTE(review): the loop ends when peekChar() yields null, so this relies
    // on EOF being null -- confirm against constants.dart.
    while ((c = peekChar()) != null && characters.contains(c) == opposite) {
      _offset++;
    }

    return new String.fromCharCodes(_chars.sublist(start, _offset));
  }

  /// Pushes [ch] back onto the stream by stepping the read position back by
  /// one. A null [ch] is ignored.
  void unget(String ch) {
    // Only one character is allowed to be ungotten at once - it must
    // be consumed again before any further call to unget
    if (ch != null) {
      _offset--;
      // Sanity check: the character now under the cursor must be the one the
      // caller is handing back.
      assert(peekChar() == ch);
    }
  }
}

259

// TODO(jmesserly): the Python code used a regex to check for this. But
// Dart doesn't let you create a regexp with invalid characters.
/// Returns true when code point [c] is one the input stream reports as an
/// invalid codepoint.
///
/// That covers the control characters U+0001..U+0008, U+000B,
/// U+000E..U+001F and U+007F..U+009F, the surrogates U+D800..U+DFFF, the
/// block U+FDD0..U+FDEF, and the last two code points (xxFFFE, xxFFFF) of
/// every plane from U+FFFE up to U+10FFFF.
bool invalidUnicode(int c) {
  bool within(int low, int high) => low <= c && c <= high;

  if (within(0x0001, 0x0008) ||
      within(0x000E, 0x001F) ||
      within(0x007F, 0x009F) ||
      within(0xD800, 0xDFFF) ||
      within(0xFDD0, 0xFDEF)) {
    return true;
  }

  if (c == 0x000B) return true;

  // The final two code points of each plane have their low 16 bits equal to
  // 0xFFFE or 0xFFFF, i.e. bits 1..15 are all set. The range guard keeps the
  // mask from matching negative values or values beyond U+10FFFF.
  return within(0xFFFE, 0x10FFFF) && (c & 0xFFFE) == 0xFFFE;
}

308

/// Return the python codec name corresponding to an encoding or null if the
/// string doesn't correspond to a valid encoding.
///
/// [encoding] is canonicalized before the lookup in `encodings`: ASCII
/// whitespace and punctuation are removed and the rest is lower-cased.
/// A null [encoding] yields null.
String codecName(String encoding) {
  if (encoding == null) return null;

  // Matches ASCII whitespace (U+0009..U+000D, U+0020) and ASCII punctuation.
  var ignorable = new RegExp(
      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");

  var stripped = encoding.replaceAll(ignorable, '');
  return encodings[stripped.toLowerCase()];
}

OLD	NEW

« no previous file with comments | « html/lib/src/encoding_parser.dart ('k') | html/lib/src/list_proxy.dart » ('j') | no next file with comments »