OLD | NEW |
1 /// This library has a parser for HTML5 documents, that lets you parse HTML | 1 /// This library has a parser for HTML5 documents, that lets you parse HTML |
2 /// easily from a script or server side application: | 2 /// easily from a script or server side application: |
3 /// | 3 /// |
4 /// import 'package:html5lib/parser.dart' show parse; | 4 /// import 'package:html5lib/parser.dart' show parse; |
5 /// import 'package:html5lib/dom.dart'; | 5 /// import 'package:html5lib/dom.dart'; |
6 /// main() { | 6 /// main() { |
7 /// var document = parse( | 7 /// var document = parse( |
8 /// '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!'); | 8 /// '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!'); |
9 /// print(document.outerHtml); | 9 /// print(document.outerHtml); |
10 /// } | 10 /// } |
11 /// | 11 /// |
12 /// The resulting document you get back has a DOM-like API for easy tree | 12 /// The resulting document you get back has a DOM-like API for easy tree |
13 /// traversal and manipulation. | 13 /// traversal and manipulation. |
14 library parser; | 14 library parser; |
15 | 15 |
16 import 'dart:collection'; | 16 import 'dart:collection'; |
17 import 'dart:math'; | 17 import 'dart:math'; |
18 import 'package:source_maps/span.dart' show Span, FileSpan; | 18 import 'package:source_span/source_span.dart'; |
19 | 19 |
20 import 'src/treebuilder.dart'; | 20 import 'src/treebuilder.dart'; |
21 import 'src/constants.dart'; | 21 import 'src/constants.dart'; |
22 import 'src/encoding_parser.dart'; | 22 import 'src/encoding_parser.dart'; |
23 import 'src/token.dart'; | 23 import 'src/token.dart'; |
24 import 'src/tokenizer.dart'; | 24 import 'src/tokenizer.dart'; |
25 import 'src/utils.dart'; | 25 import 'src/utils.dart'; |
26 import 'dom.dart'; | 26 import 'dom.dart'; |
27 | 27 |
28 /// Parse the [input] html5 document into a tree. The [input] can be | 28 /// Parse the [input] html5 document into a tree. The [input] can be |
29 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. | 29 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. |
30 /// | 30 /// |
31 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's | 31 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's |
32 /// [encoding], which must be a string. If specified that encoding will be | 32 /// [encoding], which must be a string. If specified that encoding will be |
33 /// used regardless of any BOM or later declaration (such as in a meta element). | 33 /// used regardless of any BOM or later declaration (such as in a meta element). |
34 /// | 34 /// |
35 /// Set [generateSpans] if you want to generate [Span]s, otherwise the | 35 /// Set [generateSpans] if you want to generate [SourceSpan]s, otherwise the |
36 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you | 36 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you |
37 /// can additionally pass [sourceUrl] to indicate where the [input] was | 37 /// can additionally pass [sourceUrl] to indicate where the [input] was |
38 /// extracted from. | 38 /// extracted from. |
39 Document parse(input, {String encoding, bool generateSpans: false, | 39 Document parse(input, {String encoding, bool generateSpans: false, |
40 String sourceUrl}) { | 40 String sourceUrl}) { |
41 var p = new HtmlParser(input, encoding: encoding, | 41 var p = new HtmlParser(input, encoding: encoding, |
42 generateSpans: generateSpans, sourceUrl: sourceUrl); | 42 generateSpans: generateSpans, sourceUrl: sourceUrl); |
43 return p.parse(); | 43 return p.parse(); |
44 } | 44 } |
45 | 45 |
46 | 46 |
47 /// Parse the [input] html5 document fragment into a tree. The [input] can be | 47 /// Parse the [input] html5 document fragment into a tree. The [input] can be |
48 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. The [container] | 48 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. The [container] |
49 /// element can optionally be specified, otherwise it defaults to "div". | 49 /// element can optionally be specified, otherwise it defaults to "div". |
50 /// | 50 /// |
51 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's | 51 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's |
52 /// [encoding], which must be a string. If specified, that encoding will be used, | 52 /// [encoding], which must be a string. If specified, that encoding will be used, |
53 /// regardless of any BOM or later declaration (such as in a meta element). | 53 /// regardless of any BOM or later declaration (such as in a meta element). |
54 /// | 54 /// |
55 /// Set [generateSpans] if you want to generate [Span]s, otherwise the | 55 /// Set [generateSpans] if you want to generate [SourceSpan]s, otherwise the |
56 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you can | 56 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you can |
57 /// additionally pass [sourceUrl] to indicate where the [input] was extracted | 57 /// additionally pass [sourceUrl] to indicate where the [input] was extracted |
58 /// from. | 58 /// from. |
59 DocumentFragment parseFragment(input, {String container: "div", | 59 DocumentFragment parseFragment(input, {String container: "div", |
60 String encoding, bool generateSpans: false, String sourceUrl}) { | 60 String encoding, bool generateSpans: false, String sourceUrl}) { |
61 var p = new HtmlParser(input, encoding: encoding, | 61 var p = new HtmlParser(input, encoding: encoding, |
62 generateSpans: generateSpans, sourceUrl: sourceUrl); | 62 generateSpans: generateSpans, sourceUrl: sourceUrl); |
63 return p.parseFragment(container); | 63 return p.parseFragment(container); |
64 } | 64 } |
65 | 65 |
66 | 66 |
67 /// Parser for HTML, which generates a tree structure from a stream of | 67 /// Parser for HTML, which generates a tree structure from a stream of |
68 /// (possibly malformed) characters. | 68 /// (possibly malformed) characters. |
69 class HtmlParser { | 69 class HtmlParser { |
70 /// Raise an exception on the first error encountered. | 70 /// Raise an exception on the first error encountered. |
71 final bool strict; | 71 final bool strict; |
72 | 72 |
73 /// True to generate [Span]s for the [Node.sourceSpan] property. | 73 /// True to generate [SourceSpan]s for the [Node.sourceSpan] property. |
74 final bool generateSpans; | 74 final bool generateSpans; |
75 | 75 |
76 final HtmlTokenizer tokenizer; | 76 final HtmlTokenizer tokenizer; |
77 | 77 |
78 final TreeBuilder tree; | 78 final TreeBuilder tree; |
79 | 79 |
80 final List<ParseError> errors = <ParseError>[]; | 80 final List<ParseError> errors = <ParseError>[]; |
81 | 81 |
82 String container; | 82 String container; |
83 | 83 |
(...skipping 272 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
356 reprocessPhases.add(phase); | 356 reprocessPhases.add(phase); |
357 reprocess = phase.processEOF(); | 357 reprocess = phase.processEOF(); |
358 if (reprocess) { | 358 if (reprocess) { |
359 assert(!reprocessPhases.contains(phase)); | 359 assert(!reprocessPhases.contains(phase)); |
360 } | 360 } |
361 } | 361 } |
362 } | 362 } |
363 | 363 |
364 /// The last span available. Used for EOF errors if we don't have something | 364 /// The last span available. Used for EOF errors if we don't have something |
365 /// better. | 365 /// better. |
366 Span get _lastSpan { | 366 SourceSpan get _lastSpan { |
| 367 if (tokenizer.stream.fileInfo == null) return null; |
367 var pos = tokenizer.stream.position; | 368 var pos = tokenizer.stream.position; |
368 return new FileSpan(tokenizer.stream.fileInfo, pos, pos); | 369 return tokenizer.stream.fileInfo.location(pos).pointSpan(); |
369 } | 370 } |
370 | 371 |
371 void parseError(Span span, String errorcode, | 372 void parseError(SourceSpan span, String errorcode, |
372 [Map datavars = const {}]) { | 373 [Map datavars = const {}]) { |
373 | 374 |
374 if (!generateSpans && span == null) { | 375 if (!generateSpans && span == null) { |
375 span = _lastSpan; | 376 span = _lastSpan; |
376 } | 377 } |
377 | 378 |
378 var err = new ParseError(errorcode, span, datavars); | 379 var err = new ParseError(errorcode, span, datavars); |
379 errors.add(err); | 380 errors.add(err); |
380 if (strict) throw err; | 381 if (strict) throw err; |
381 } | 382 } |
(...skipping 1788 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2170 super(parser); | 2171 super(parser); |
2171 | 2172 |
2172 void flushCharacters() { | 2173 void flushCharacters() { |
2173 if (characterTokens.length == 0) return; | 2174 if (characterTokens.length == 0) return; |
2174 | 2175 |
2175 // TODO(sigmund,jmesserly): remove '' (dartbug.com/8480) | 2176 // TODO(sigmund,jmesserly): remove '' (dartbug.com/8480) |
2176 var data = characterTokens.map((t) => t.data).join(''); | 2177 var data = characterTokens.map((t) => t.data).join(''); |
2177 var span = null; | 2178 var span = null; |
2178 | 2179 |
2179 if (parser.generateSpans) { | 2180 if (parser.generateSpans) { |
2180 span = new FileSpan.union( | 2181 span = characterTokens[0].span.expand(characterTokens.last.span); |
2181 characterTokens[0].span, | |
2182 characterTokens.last.span); | |
2183 } | 2182 } |
2184 | 2183 |
2185 if (!allWhitespace(data)) { | 2184 if (!allWhitespace(data)) { |
2186 parser._inTablePhase.insertText(new CharactersToken(data)..span = span); | 2185 parser._inTablePhase.insertText(new CharactersToken(data)..span = span); |
2187 } else if (data.length > 0) { | 2186 } else if (data.length > 0) { |
2188 tree.insertText(data, span); | 2187 tree.insertText(data, span); |
2189 } | 2188 } |
2190 characterTokens = <StringToken>[]; | 2189 characterTokens = <StringToken>[]; |
2191 } | 2190 } |
2192 | 2191 |
(...skipping 1133 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3326 | 3325 |
3327 Token processEndTag(EndTagToken token) { | 3326 Token processEndTag(EndTagToken token) { |
3328 parser.parseError(token.span, "expected-eof-but-got-end-tag", | 3327 parser.parseError(token.span, "expected-eof-but-got-end-tag", |
3329 {"name": token.name}); | 3328 {"name": token.name}); |
3330 return null; | 3329 return null; |
3331 } | 3330 } |
3332 } | 3331 } |
3333 | 3332 |
3334 | 3333 |
3335 /// Error in parsed document. | 3334 /// Error in parsed document. |
3336 class ParseError implements Exception { | 3335 class ParseError implements SourceSpanException { |
3337 final String errorCode; | 3336 final String errorCode; |
3338 final Span span; | 3337 final SourceSpan span; |
3339 final Map data; | 3338 final Map data; |
3340 | 3339 |
3341 ParseError(this.errorCode, this.span, this.data); | 3340 ParseError(this.errorCode, this.span, this.data); |
3342 | 3341 |
3343 int get line => span.start.line; | 3342 int get line => span.start.line; |
3344 | 3343 |
3345 int get column => span.start.column; | 3344 int get column => span.start.column; |
3346 | 3345 |
3347 /// Gets the human readable error message for this error. Use | 3346 /// Gets the human readable error message for this error. Use |
3348 /// [span.getLocationMessage] or [toString] to get a message including span | 3347 /// [span.message] or [toString] to get a message including span |
3349 /// information. If there is a file associated with the span, both | 3348 /// information. If there is a file associated with the span, both |
3350 /// [span.getLocationMessage] and [toString] are equivalent. Otherwise, | 3349 /// [span.message] and [toString] are equivalent. Otherwise, |
3351 /// [span.getLocationMessage] will not show any source url information, but | 3350 /// [span.message] will not show any source url information, but |
3352 /// [toString] will include 'ParserError:' as a prefix. | 3351 /// [toString] will include 'ParserError on' as a prefix. |
3353 String get message => formatStr(errorMessages[errorCode], data); | 3352 String get message => formatStr(errorMessages[errorCode], data); |
3354 | 3353 |
3355 String toString() { | 3354 String toString({color}) { |
3356 var res = span.getLocationMessage(message); | 3355 var res = span.message(message, color: color); |
3357 return span.sourceUrl == null ? 'ParserError on $res' : 'On $res'; | 3356 return span.sourceUrl == null ? 'ParserError on $res' : 'On $res'; |
3358 } | 3357 } |
3359 } | 3358 } |
3360 | 3359 |
3361 | 3360 |
3362 /// Convenience function to get the pair of namespace and localName. | 3361 /// Convenience function to get the pair of namespace and localName. |
3363 Pair<String, String> getElementNameTuple(Element e) { | 3362 Pair<String, String> getElementNameTuple(Element e) { |
3364 var ns = e.namespaceUri; | 3363 var ns = e.namespaceUri; |
3365 if (ns == null) ns = Namespaces.html; | 3364 if (ns == null) ns = Namespaces.html; |
3366 return new Pair(ns, e.localName); | 3365 return new Pair(ns, e.localName); |
3367 } | 3366 } |
OLD | NEW |