| OLD | NEW |
| 1 /// This library has a parser for HTML5 documents, that lets you parse HTML | 1 /// This library has a parser for HTML5 documents, that lets you parse HTML |
| 2 /// easily from a script or server side application: | 2 /// easily from a script or server side application: |
| 3 /// | 3 /// |
| 4 /// import 'package:html5lib/parser.dart' show parse; | 4 /// import 'package:html5lib/parser.dart' show parse; |
| 5 /// import 'package:html5lib/dom.dart'; | 5 /// import 'package:html5lib/dom.dart'; |
| 6 /// main() { | 6 /// main() { |
| 7 /// var document = parse( | 7 /// var document = parse( |
| 8 /// '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!'); | 8 /// '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!'); |
| 9 /// print(document.outerHtml); | 9 /// print(document.outerHtml); |
| 10 /// } | 10 /// } |
| 11 /// | 11 /// |
| 12 /// The resulting document you get back has a DOM-like API for easy tree | 12 /// The resulting document you get back has a DOM-like API for easy tree |
| 13 /// traversal and manipulation. | 13 /// traversal and manipulation. |
| 14 library parser; | 14 library parser; |
| 15 | 15 |
| 16 import 'dart:collection'; | 16 import 'dart:collection'; |
| 17 import 'dart:math'; | 17 import 'dart:math'; |
| 18 import 'package:source_maps/span.dart' show Span, FileSpan; | 18 import 'package:source_span/source_span.dart'; |
| 19 | 19 |
| 20 import 'src/treebuilder.dart'; | 20 import 'src/treebuilder.dart'; |
| 21 import 'src/constants.dart'; | 21 import 'src/constants.dart'; |
| 22 import 'src/encoding_parser.dart'; | 22 import 'src/encoding_parser.dart'; |
| 23 import 'src/token.dart'; | 23 import 'src/token.dart'; |
| 24 import 'src/tokenizer.dart'; | 24 import 'src/tokenizer.dart'; |
| 25 import 'src/utils.dart'; | 25 import 'src/utils.dart'; |
| 26 import 'dom.dart'; | 26 import 'dom.dart'; |
| 27 | 27 |
| 28 /// Parse the [input] html5 document into a tree. The [input] can be | 28 /// Parse the [input] html5 document into a tree. The [input] can be |
| 29 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. | 29 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. |
| 30 /// | 30 /// |
| 31 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's | 31 /// If [input] is not an [HtmlTokenizer], you can optionally specify the file's |
| 32 /// [encoding], which must be a string. If specified that encoding will be | 32 /// [encoding], which must be a string. If specified, that encoding will be |
| 33 /// used regardless of any BOM or later declaration (such as in a meta element). | 33 /// used regardless of any BOM or later declaration (such as in a meta element). |
| 34 /// | 34 /// |
| 35 /// Set [generateSpans] if you want to generate [Span]s, otherwise the | 35 /// Set [generateSpans] if you want to generate [SourceSpan]s, otherwise the |
| 36 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you | 36 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you |
| 37 /// can additionally pass [sourceUrl] to indicate where the [input] was | 37 /// can additionally pass [sourceUrl] to indicate where the [input] was |
| 38 /// extracted from. | 38 /// extracted from. |
// Review note: the body constructs an HtmlParser, forwarding encoding,
// generateSpans and sourceUrl unchanged, and returns the result of calling
// parse() on it. generateSpans defaults to false. The body is identical in
// OLD and NEW; only the doc reference ([Span] -> [SourceSpan]) changed.
| 39 Document parse(input, {String encoding, bool generateSpans: false, | 39 Document parse(input, {String encoding, bool generateSpans: false, |
| 40 String sourceUrl}) { | 40 String sourceUrl}) { |
| 41 var p = new HtmlParser(input, encoding: encoding, | 41 var p = new HtmlParser(input, encoding: encoding, |
| 42 generateSpans: generateSpans, sourceUrl: sourceUrl); | 42 generateSpans: generateSpans, sourceUrl: sourceUrl); |
| 43 return p.parse(); | 43 return p.parse(); |
| 44 } | 44 } |
| 45 | 45 |
| 46 | 46 |
| 47 /// Parse the [input] html5 document fragment into a tree. The [input] can be | 47 /// Parse the [input] html5 document fragment into a tree. The [input] can be |
| 48 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. The [container] | 48 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. The [container] |
| 49 /// element can optionally be specified, otherwise it defaults to "div". | 49 /// element can optionally be specified, otherwise it defaults to "div". |
| 50 /// | 50 /// |
| 51 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's | 51 /// If [input] is not an [HtmlTokenizer], you can optionally specify the file's |
| 52 /// [encoding], which must be a string. If specified, that encoding will be used
, | 52 /// [encoding], which must be a string. If specified, that encoding will be used
, |
| 53 /// regardless of any BOM or later declaration (such as in a meta element). | 53 /// regardless of any BOM or later declaration (such as in a meta element). |
| 54 /// | 54 /// |
| 55 /// Set [generateSpans] if you want to generate [Span]s, otherwise the | 55 /// Set [generateSpans] if you want to generate [SourceSpan]s, otherwise the |
| 56 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you ca
n | 56 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you ca
n |
| 57 /// additionally pass [sourceUrl] to indicate where the [input] was extracted | 57 /// additionally pass [sourceUrl] to indicate where the [input] was extracted |
| 58 /// from. | 58 /// from. |
// Review note: the body constructs an HtmlParser with the same forwarded
// arguments that parse() uses, then returns parseFragment(container) on it.
// container is not passed to the constructor, only to that final call.
// The body is identical in OLD and NEW.
| 59 DocumentFragment parseFragment(input, {String container: "div", | 59 DocumentFragment parseFragment(input, {String container: "div", |
| 60 String encoding, bool generateSpans: false, String sourceUrl}) { | 60 String encoding, bool generateSpans: false, String sourceUrl}) { |
| 61 var p = new HtmlParser(input, encoding: encoding, | 61 var p = new HtmlParser(input, encoding: encoding, |
| 62 generateSpans: generateSpans, sourceUrl: sourceUrl); | 62 generateSpans: generateSpans, sourceUrl: sourceUrl); |
| 63 return p.parseFragment(container); | 63 return p.parseFragment(container); |
| 64 } | 64 } |
| 65 | 65 |
| 66 | 66 |
| 67 /// Parser for HTML, which generates a tree structure from a stream of | 67 /// Parser for HTML, which generates a tree structure from a stream of |
| 68 /// (possibly malformed) characters. | 68 /// (possibly malformed) characters. |
| 69 class HtmlParser { | 69 class HtmlParser { |
| 70 /// Raise an exception on the first error encountered. | 70 /// Raise an exception on the first error encountered. |
| 71 final bool strict; | 71 final bool strict; |
| 72 | 72 |
| 73 /// True to generate [Span]s for the [Node.sourceSpan] property. | 73 /// True to generate [SourceSpan]s for the [Node.sourceSpan] property. |
| 74 final bool generateSpans; | 74 final bool generateSpans; |
| 75 | 75 |
| 76 final HtmlTokenizer tokenizer; | 76 final HtmlTokenizer tokenizer; |
| 77 | 77 |
| 78 final TreeBuilder tree; | 78 final TreeBuilder tree; |
| 79 | 79 |
| 80 final List<ParseError> errors = <ParseError>[]; | 80 final List<ParseError> errors = <ParseError>[]; |
| 81 | 81 |
| 82 String container; | 82 String container; |
| 83 | 83 |
| (...skipping 272 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 356 reprocessPhases.add(phase); | 356 reprocessPhases.add(phase); |
| 357 reprocess = phase.processEOF(); | 357 reprocess = phase.processEOF(); |
| 358 if (reprocess) { | 358 if (reprocess) { |
| 359 assert(!reprocessPhases.contains(phase)); | 359 assert(!reprocessPhases.contains(phase)); |
| 360 } | 360 } |
| 361 } | 361 } |
| 362 } | 362 } |
| 363 | 363 |
| 364 /// The last span available. Used for EOF errors if we don't have something | 364 /// The last span available. Used for EOF errors if we don't have something |
| 365 /// better. | 365 /// better. |
// Review note: both versions build a span from the tokenizer stream's
// current position. OLD passes `pos` as both arguments after fileInfo to the
// FileSpan constructor; NEW asks fileInfo for the location at `pos` and
// calls pointSpan() on it. Presumably both yield an empty span at `pos` —
// confirm against the source_span SourceLocation.pointSpan documentation.
// The getter's declared type changes from Span to SourceSpan.
| 366 Span get _lastSpan { | 366 SourceSpan get _lastSpan { |
| 367 var pos = tokenizer.stream.position; | 367 var pos = tokenizer.stream.position; |
| 368 return new FileSpan(tokenizer.stream.fileInfo, pos, pos); | 368 return tokenizer.stream.fileInfo.location(pos).pointSpan(); |
| 369 } | 369 } |
| 370 | 370 |
// Review note: records a parse error. errorcode, span and datavars are
// wrapped in a ParseError, which is appended to `errors` and then thrown if
// `strict` is set. datavars is optional and defaults to a const empty map.
// When spans are not being generated and no span was supplied, _lastSpan is
// substituted so the error still carries a location.
// NOTE(review): when generateSpans is true and span is null, the null span
// is stored in the ParseError as-is — confirm callers never hit that case.
// The only OLD -> NEW change is the parameter type (Span -> SourceSpan).
| 371 void parseError(Span span, String errorcode, | 371 void parseError(SourceSpan span, String errorcode, |
| 372 [Map datavars = const {}]) { | 372 [Map datavars = const {}]) { |
| 373 | 373 |
| 374 if (!generateSpans && span == null) { | 374 if (!generateSpans && span == null) { |
| 375 span = _lastSpan; | 375 span = _lastSpan; |
| 376 } | 376 } |
| 377 | 377 |
| 378 var err = new ParseError(errorcode, span, datavars); | 378 var err = new ParseError(errorcode, span, datavars); |
| 379 errors.add(err); | 379 errors.add(err); |
| 380 if (strict) throw err; | 380 if (strict) throw err; |
| 381 } | 381 } |
| (...skipping 1788 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2170 super(parser); | 2170 super(parser); |
| 2171 | 2171 |
// Review note: flushes the buffered characterTokens. It returns immediately
// when the buffer is empty. Otherwise the tokens' data strings are joined
// into one string. If allWhitespace(data) is false, the string is wrapped in
// a new CharactersToken and passed to parser._inTablePhase.insertText; if it
// is true and the string is non-empty, it goes to tree.insertText. Either
// way the buffer is then replaced with a fresh empty list.
| 2172 void flushCharacters() { | 2172 void flushCharacters() { |
| 2173 if (characterTokens.length == 0) return; | 2173 if (characterTokens.length == 0) return; |
| 2174 | 2174 |
| 2175 // TODO(sigmund,jmesserly): remove '' (dartbug.com/8480) | 2175 // TODO(sigmund,jmesserly): remove '' (dartbug.com/8480) |
| 2176 var data = characterTokens.map((t) => t.data).join(''); | 2176 var data = characterTokens.map((t) => t.data).join(''); |
| 2177 var span = null; | 2177 var span = null; |
| 2178 | 2178 |
// Review note: span stays null unless parser.generateSpans is set. When it
// is set, the span is the union of the first and last buffered tokens'
// spans. OLD uses the FileSpan.union constructor; NEW calls union() on the
// first token's span with the last token's span as the argument.
| 2179 if (parser.generateSpans) { | 2179 if (parser.generateSpans) { |
| 2180 span = new FileSpan.union( | 2180 span = characterTokens[0].span.union(characterTokens.last.span); |
| 2181 characterTokens[0].span, | |
| 2182 characterTokens.last.span); | |
| 2183 } | 2181 } |
| 2184 | 2182 |
| 2185 if (!allWhitespace(data)) { | 2183 if (!allWhitespace(data)) { |
| 2186 parser._inTablePhase.insertText(new CharactersToken(data)..span = span); | 2184 parser._inTablePhase.insertText(new CharactersToken(data)..span = span); |
| 2187 } else if (data.length > 0) { | 2185 } else if (data.length > 0) { |
| 2188 tree.insertText(data, span); | 2186 tree.insertText(data, span); |
| 2189 } | 2187 } |
| 2190 characterTokens = <StringToken>[]; | 2188 characterTokens = <StringToken>[]; |
| 2191 } | 2189 } |
| 2192 | 2190 |
| (...skipping 1133 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3326 | 3324 |
// Review note: reports the "expected-eof-but-got-end-tag" parse error at the
// token's span, passing the tag name as the "name" entry of the error data,
// and returns null. Identical in OLD and NEW.
| 3327 Token processEndTag(EndTagToken token) { | 3325 Token processEndTag(EndTagToken token) { |
| 3328 parser.parseError(token.span, "expected-eof-but-got-end-tag", | 3326 parser.parseError(token.span, "expected-eof-but-got-end-tag", |
| 3329 {"name": token.name}); | 3327 {"name": token.name}); |
| 3330 return null; | 3328 return null; |
| 3331 } | 3329 } |
| 3332 } | 3330 } |
| 3333 | 3331 |
| 3334 | 3332 |
| 3335 /// Error in parsed document. | 3333 /// Error in parsed document. |
// Review note: NEW implements SourceSpanException where OLD implemented
// Exception, and the `span` field is retyped from Span to SourceSpan.
// errorCode is the key used to look up the message template in
// errorMessages; data is passed to formatStr together with that template.
| 3336 class ParseError implements Exception { | 3334 class ParseError implements SourceSpanException { |
| 3337 final String errorCode; | 3335 final String errorCode; |
| 3338 final Span span; | 3336 final SourceSpan span; |
| 3339 final Map data; | 3337 final Map data; |
| 3340 | 3338 |
| 3341 ParseError(this.errorCode, this.span, this.data); | 3339 ParseError(this.errorCode, this.span, this.data); |
| 3342 | 3340 |
// Review note: both getters dereference span.start, so they need a non-null
// span.
| 3343 int get line => span.start.line; | 3341 int get line => span.start.line; |
| 3344 | 3342 |
| 3345 int get column => span.start.column; | 3343 int get column => span.start.column; |
| 3346 | 3344 |
| 3347 /// Gets the human readable error message for this error. Use | 3345 /// Gets the human readable error message for this error. Use |
| 3348 /// [span.getLocationMessage] or [toString] to get a message including span | 3346 /// [span.message] or [toString] to get a message including span |
| 3349 /// information. If there is a file associated with the span, both | 3347 /// information. If there is a file associated with the span, [toString] |
| 3350 /// [span.getLocationMessage] and [toString] are equivalent. Otherwise, | 3348 /// is the [span.message] output prefixed with 'On '. Otherwise, |
| 3351 /// [span.getLocationMessage] will not show any source url information, but | 3349 /// [span.message] will not show any source url information, and |
| 3352 /// [toString] will include 'ParserError:' as a prefix. | 3350 /// [toString] will use 'ParserError on ' as the prefix instead. |
| 3353 String get message => formatStr(errorMessages[errorCode], data); | 3351 String get message => formatStr(errorMessages[errorCode], data); |
| 3354 | 3352 |
// Review note: NEW adds an optional named `color` argument that is forwarded
// to span.message, which replaces OLD's span.getLocationMessage call. The
// prefix logic is unchanged: 'ParserError on ' when span.sourceUrl is null,
// 'On ' otherwise.
| 3355 String toString() { | 3353 String toString({color}) { |
| 3356 var res = span.getLocationMessage(message); | 3354 var res = span.message(message, color: color); |
| 3357 return span.sourceUrl == null ? 'ParserError on $res' : 'On $res'; | 3355 return span.sourceUrl == null ? 'ParserError on $res' : 'On $res'; |
| 3358 } | 3356 } |
| 3359 } | 3357 } |
| 3360 | 3358 |
| 3361 | 3359 |
| 3362 /// Convenience function to get the pair of namespace and localName. | 3360 /// Convenience function to get the pair of namespace and localName. |
// Review note: when the element's namespaceUri is null, Namespaces.html is
// used as the first member of the pair instead. The second member is always
// e.localName. Identical in OLD and NEW.
| 3363 Pair<String, String> getElementNameTuple(Element e) { | 3361 Pair<String, String> getElementNameTuple(Element e) { |
| 3364 var ns = e.namespaceUri; | 3362 var ns = e.namespaceUri; |
| 3365 if (ns == null) ns = Namespaces.html; | 3363 if (ns == null) ns = Namespaces.html; |
| 3366 return new Pair(ns, e.localName); | 3364 return new Pair(ns, e.localName); |
| 3367 } | 3365 } |
| OLD | NEW |