pkg/third_party/html5lib/lib/src/inputstream.dart - Issue 814113004: Pull args, intl, logging, shelf, and source_maps out of the SDK.

Side by Side Diff: pkg/third_party/html5lib/lib/src/inputstream.dart

Issue 814113004: Pull args, intl, logging, shelf, and source_maps out of the SDK. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Also csslib. Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 library inputstream;

2

3 import 'dart:collection';

4 import 'package:utf/utf.dart';

5 import 'package:source_span/source_span.dart';

6 import 'char_encodings.dart';

7 import 'constants.dart';

8 import 'utils.dart';

9 import 'encoding_parser.dart';

10

/// Indirection layer that lets this library read files without importing
/// dart:io itself.
///
/// The base implementation cannot read anything; a replacement assigned to
/// [consoleSupport] may override [bytesFromFile].
class ConsoleSupport {
  /// Returns the bytes of [source], or null if [source] cannot be read.
  ///
  /// This default implementation always returns null.
  List<int> bytesFromFile(source) {
    return null;
  }
}

// TODO(jmesserly): use lazy init here when supported.
/// The active [ConsoleSupport] hooks; defaults to the no-op implementation.
ConsoleSupport consoleSupport = new ConsoleSupport();

18

/// Provides a unicode stream of characters to the HtmlTokenizer.
///
/// This class takes care of character encoding and removing or replacing
/// incorrect byte-sequences and also provides column and line tracking.
class HtmlInputStream {
  /// Number of bytes to use when looking for a meta element with
  /// encoding information.
  static const int numBytesMeta = 512;

  /// Encoding to use if no other information can be found.
  static const String defaultEncoding = 'windows-1252';

  /// The name of the character encoding.
  String charEncodingName;

  /// True if we are certain about [charEncodingName], false for tentative.
  bool charEncodingCertain = true;

  /// The `generateSpans` flag passed to the constructor.
  final bool generateSpans;

  /// Location where the contents of the stream were found.
  final String sourceUrl;

  /// The undecoded input bytes. Null when the stream was built from a
  /// [String].
  List<int> _rawBytes;

  /// Characters of the input before normalization: taken from the source
  /// [String], or decoded from [_rawBytes] by [reset] (which releases them
  /// again once they have been normalized).
  Iterable<int> _rawChars;

  /// Errors recorded by [reset] while normalizing the input.
  Queue<String> errors;

  /// Source file built by [reset] from the normalized characters and
  /// [sourceUrl].
  SourceFile fileInfo;

  /// Offsets into [_chars] at which each line starts; the first entry is 0.
  List<int> _lineStarts;

  /// The normalized characters that [char] and [peekChar] read from.
  List<int> _chars;

  /// Index into [_chars] of the next character to be read.
  int _offset;

  /// Initialises the HtmlInputStream.
  ///
  /// HtmlInputStream(source, [encoding]) -> Normalized stream from source
  /// for use by html5lib.
  ///
  /// [source] can be either a [String] or a [List<int>] containing the raw
  /// bytes, or a file if [consoleSupport] is initialized.
  ///
  /// The optional encoding parameter must be a string that indicates
  /// the encoding. If specified, that encoding will be used,
  /// regardless of any BOM or later declaration (such as in a meta
  /// element)
  ///
  /// [parseMeta] - Look for a <meta> element containing encoding information
  ///
  /// Throws an [ArgumentError] if [source] is none of the supported kinds.
  HtmlInputStream(source, [String encoding, bool parseMeta = true,
        this.generateSpans = false, this.sourceUrl])
      : charEncodingName = codecName(encoding) {

    if (source is String) {
      // A Dart String is already decoded, so the encoding is fixed and
      // certain no matter what [encoding] was passed in.
      _rawChars = toCodepoints(source);
      charEncodingName = 'utf-8';
      charEncodingCertain = true;
    } else if (source is List<int>) {
      _rawBytes = source;
    } else {
      // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
      // but it's necessary because of how the UTF decoders work.
      _rawBytes = consoleSupport.bytesFromFile(source);

      if (_rawBytes == null) {
        // TODO(jmesserly): we should accept some kind of stream API too.
        // Unfortunately dart:io InputStream is async only, which won't work.
        throw new ArgumentError("'source' must be a String or "
            "List<int> (of bytes). You can also pass a RandomAccessFile if you "
            "`import 'package:html5lib/parser_console.dart'` and call "
            "`useConsole()`.");
      }
    }

    // Detect encoding iff no explicit "transport level" encoding is supplied
    if (charEncodingName == null) {
      detectEncoding(parseMeta);
    }

    reset();
  }

  /// Rewinds the stream to offset 0 and rebuilds the normalized character
  /// buffer, the line-start table, [errors] and [fileInfo].
  ///
  /// Normalization turns "\r\n" and a lone "\r" into a single newline,
  /// replaces surrogate code units with U+FFFD, and records an
  /// 'invalid-codepoint' error for every character rejected by
  /// [invalidUnicode].
  void reset() {
    errors = new Queue<String>();

    _offset = 0;
    _lineStarts = <int>[0];
    _chars = <int>[];

    if (_rawChars == null) {
      _rawChars = decodeBytes(charEncodingName, _rawBytes);
    }

    // Set after a RETURN so that an immediately following NEWLINE is dropped.
    bool skipNewline = false;
    for (var c in _rawChars) {
      if (skipNewline) {
        skipNewline = false;
        if (c == NEWLINE) continue;
      }

      if (invalidUnicode(c)) errors.add('invalid-codepoint');

      if (0xD800 <= c && c <= 0xDFFF) {
        c = 0xFFFD;
      } else if (c == RETURN) {
        skipNewline = true;
        c = NEWLINE;
      }

      _chars.add(c);
      if (c == NEWLINE) _lineStarts.add(_chars.length);
    }

    // Free decoded characters if they aren't needed anymore.
    if (_rawBytes != null) _rawChars = null;

    // TODO(sigmund): Don't parse the file at all if spans aren't being
    // generated.
    fileInfo = new SourceFile.decoded(_chars, url: sourceUrl);
  }

  /// Determines [charEncodingName] from the raw bytes.
  ///
  /// Tries, in order: a byte order mark (certain), a meta element when
  /// [parseMeta] is true (tentative), and finally [defaultEncoding]
  /// (tentative). 'iso-8859-1' is mapped to 'windows-1252'.
  void detectEncoding([bool parseMeta = true]) {
    // First look for a BOM
    // This will also read past the BOM if present
    charEncodingName = detectBOM();
    charEncodingCertain = true;

    // If there is no BOM need to look for meta elements with encoding
    // information
    if (charEncodingName == null && parseMeta) {
      charEncodingName = detectEncodingMeta();
      charEncodingCertain = false;
    }
    // If all else fails use the default encoding
    if (charEncodingName == null) {
      charEncodingCertain = false;
      charEncodingName = defaultEncoding;
    }

    // Substitute for equivalent encodings:
    if (charEncodingName.toLowerCase() == 'iso-8859-1') {
      charEncodingName = 'windows-1252';
    }
  }

  /// Switches the stream to [newEncoding].
  ///
  /// Does nothing if [newEncoding] is not a known encoding name. If it names
  /// the encoding already in use, the encoding is simply marked as certain.
  /// Otherwise the input is re-decoded via [reset] and a [ReparseException]
  /// is thrown. UTF-16 variants are treated as 'utf-8'.
  ///
  /// Throws a [StateError] if the stream was created from a [String].
  void changeEncoding(String newEncoding) {
    if (_rawBytes == null) {
      // We should never get here -- if encoding is certain we won't try to
      // change it.
      throw new StateError('cannot change encoding when parsing a String.');
    }

    newEncoding = codecName(newEncoding);
    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(newEncoding)) {
      newEncoding = 'utf-8';
    }
    if (newEncoding == null) {
      return;
    } else if (newEncoding == charEncodingName) {
      charEncodingCertain = true;
    } else {
      // Remember the previous name before overwriting it so that the
      // exception message reports both the old and the new encoding.
      final oldEncoding = charEncodingName;
      charEncodingName = newEncoding;
      charEncodingCertain = true;
      _rawChars = null;
      reset();
      throw new ReparseException(
          'Encoding changed from $oldEncoding to $newEncoding');
    }
  }

  /// Attempts to detect at BOM at the start of the stream. If
  /// an encoding can be determined from the BOM return the name of the
  /// encoding otherwise return null.
  String detectBOM() {
    // Try detecting the BOM using the raw bytes
    if (hasUtf8Bom(_rawBytes)) {
      return 'utf-8';
    }
    // Note: we don't need to remember whether it was big or little endian
    // because the decoder will do that later. It will also eat the BOM for us.
    if (hasUtf16Bom(_rawBytes)) {
      return 'utf-16';
    }
    if (hasUtf32Bom(_rawBytes)) {
      return 'utf-32';
    }
    return null;
  }

  /// Report the encoding declared by the meta element.
  ///
  /// Only the first [numBytesMeta] bytes are examined. UTF-16 variants are
  /// reported as 'utf-8'.
  String detectEncodingMeta() {
    var parser = new EncodingParser(slice(_rawBytes, 0, numBytesMeta));
    var encoding = parser.getEncoding();

    if (const ['utf-16', 'utf-16-be', 'utf-16-le'].contains(encoding)) {
      encoding = 'utf-8';
    }

    return encoding;
  }

  /// Returns the current offset in the stream, i.e. the number of codepoints
  /// since the start of the file.
  int get position => _offset;

  /// Read one character from the stream or queue if available. Return
  /// EOF when EOF is reached.
  String char() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCodes([_chars[_offset++]]);
  }

  /// Returns the next character without consuming it, or EOF at the end of
  /// the stream.
  String peekChar() {
    if (_offset >= _chars.length) return EOF;
    return new String.fromCharCodes([_chars[_offset]]);
  }

  /// Returns a string of characters from the stream up to but not
  /// including any character in 'characters' or EOF.
  ///
  /// If [opposite] is true the test is inverted: characters are consumed for
  /// as long as they *are* contained in [characters].
  String charsUntil(String characters, [bool opposite = false]) {
    int start = _offset;
    String c;
    while ((c = peekChar()) != null && characters.contains(c) == opposite) {
      _offset++;
    }

    return new String.fromCharCodes(_chars.sublist(start, _offset));
  }

  /// Pushes [ch] back onto the stream so that it is returned by the next
  /// read. A null [ch] is ignored.
  void unget(String ch) {
    // Only one character is allowed to be ungotten at once - it must
    // be consumed again before any further call to unget
    if (ch != null) {
      _offset--;
      assert(peekChar() == ch);
    }
  }
}

261

262

// TODO(jmesserly): the Python code used a regex to check for this. But
// Dart doesn't let you create a regexp with invalid characters.
/// Returns true if [c] falls in one of the code point ranges this library
/// treats as invalid input.
///
/// Those are: the control characters U+0001-U+0008, U+000B, U+000E-U+001F
/// and U+007F-U+009F, the surrogate range U+D800-U+DFFF, U+FDD0-U+FDEF, and
/// the last two code points (xxFFFE and xxFFFF) of every plane up to
/// U+10FFFF.
bool invalidUnicode(int c) {
  final isLowControl = (c >= 0x0001 && c <= 0x0008) ||
      c == 0x000B ||
      (c >= 0x000E && c <= 0x001F);
  if (isLowControl) return true;

  final isHighControl = c >= 0x007F && c <= 0x009F;
  final isSurrogate = c >= 0xD800 && c <= 0xDFFF;
  final isFdBlock = c >= 0xFDD0 && c <= 0xFDEF;
  if (isHighControl || isSurrogate || isFdBlock) return true;

  // The low 16 bits are FFFE or FFFF exactly when masking with 0xFFFE
  // leaves 0xFFFE; the bounds keep the check inside planes 0 through 16.
  return c >= 0xFFFE && c <= 0x10FFFF && (c & 0xFFFE) == 0xFFFE;
}

285

/// Looks up the python codec name for [encoding].
///
/// ASCII whitespace and punctuation are removed from [encoding] and the
/// remainder is lower-cased before it is used as the key into [encodings].
/// Returns null if [encoding] is null or doesn't correspond to a valid
/// encoding.
String codecName(String encoding) {
  if (encoding == null) return null;

  // Matches tab through carriage return, space, and every ASCII punctuation
  // character.
  final stripPattern = new RegExp(
      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
  final lookupKey = encoding.replaceAll(stripPattern, '').toLowerCase();
  return encodings[lookupKey];
}

OLD	NEW

« no previous file with comments | « pkg/third_party/html5lib/lib/src/encoding_parser.dart ('k') | pkg/third_party/html5lib/lib/src/list_proxy.dart » ('j') | no next file with comments »