Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(68)

Side by Side Diff: pkg/third_party/html5lib/lib/parser.dart

Issue 421503004: Switch transformers over to source_span (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /// This library has a parser for HTML5 documents, that lets you parse HTML 1 /// This library has a parser for HTML5 documents, that lets you parse HTML
2 /// easily from a script or server side application: 2 /// easily from a script or server side application:
3 /// 3 ///
4 /// import 'package:html5lib/parser.dart' show parse; 4 /// import 'package:html5lib/parser.dart' show parse;
5 /// import 'package:html5lib/dom.dart'; 5 /// import 'package:html5lib/dom.dart';
6 /// main() { 6 /// main() {
7 /// var document = parse( 7 /// var document = parse(
8 /// '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!'); 8 /// '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!');
9 /// print(document.outerHtml); 9 /// print(document.outerHtml);
10 /// } 10 /// }
11 /// 11 ///
12 /// The resulting document you get back has a DOM-like API for easy tree 12 /// The resulting document you get back has a DOM-like API for easy tree
13 /// traversal and manipulation. 13 /// traversal and manipulation.
14 library parser; 14 library parser;
15 15
16 import 'dart:collection'; 16 import 'dart:collection';
17 import 'dart:math'; 17 import 'dart:math';
18 import 'package:source_maps/span.dart' show Span, FileSpan; 18 import 'package:source_span/source_span.dart';
19 19
20 import 'src/treebuilder.dart'; 20 import 'src/treebuilder.dart';
21 import 'src/constants.dart'; 21 import 'src/constants.dart';
22 import 'src/encoding_parser.dart'; 22 import 'src/encoding_parser.dart';
23 import 'src/token.dart'; 23 import 'src/token.dart';
24 import 'src/tokenizer.dart'; 24 import 'src/tokenizer.dart';
25 import 'src/utils.dart'; 25 import 'src/utils.dart';
26 import 'dom.dart'; 26 import 'dom.dart';
27 27
28 /// Parse the [input] html5 document into a tree. The [input] can be 28 /// Parse the [input] html5 document into a tree. The [input] can be
29 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. 29 /// a [String], [List<int>] of bytes or an [HtmlTokenizer].
30 /// 30 ///
31 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's 31 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's
32 /// [encoding], which must be a string. If specified that encoding will be 32 /// [encoding], which must be a string. If specified that encoding will be
33 /// used regardless of any BOM or later declaration (such as in a meta element). 33 /// used regardless of any BOM or later declaration (such as in a meta element).
34 /// 34 ///
35 /// Set [generateSpans] if you want to generate [Span]s, otherwise the 35 /// Set [generateSpans] if you want to generate [SourceSpan]s, otherwise the
36 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you 36 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you
37 /// can additionally pass [sourceUrl] to indicate where the [input] was 37 /// can additionally pass [sourceUrl] to indicate where the [input] was
38 /// extracted from. 38 /// extracted from.
39 Document parse(input, {String encoding, bool generateSpans: false, 39 Document parse(input, {String encoding, bool generateSpans: false,
40 String sourceUrl}) { 40 String sourceUrl}) {
41 var p = new HtmlParser(input, encoding: encoding, 41 var p = new HtmlParser(input, encoding: encoding,
42 generateSpans: generateSpans, sourceUrl: sourceUrl); 42 generateSpans: generateSpans, sourceUrl: sourceUrl);
43 return p.parse(); 43 return p.parse();
44 } 44 }
45 45
46 46
47 /// Parse the [input] html5 document fragment into a tree. The [input] can be 47 /// Parse the [input] html5 document fragment into a tree. The [input] can be
48 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. The [container] 48 /// a [String], [List<int>] of bytes or an [HtmlTokenizer]. The [container]
49 /// element can optionally be specified, otherwise it defaults to "div". 49 /// element can optionally be specified, otherwise it defaults to "div".
50 /// 50 ///
51 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's 51 /// If [input] is not a [HtmlTokenizer], you can optionally specify the file's
52 /// [encoding], which must be a string. If specified, that encoding will be used, 52 /// [encoding], which must be a string. If specified, that encoding will be used,
53 /// regardless of any BOM or later declaration (such as in a meta element). 53 /// regardless of any BOM or later declaration (such as in a meta element).
54 /// 54 ///
55 /// Set [generateSpans] if you want to generate [Span]s, otherwise the 55 /// Set [generateSpans] if you want to generate [SourceSpan]s, otherwise the
56 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you can 56 /// [Node.sourceSpan] property will be `null`. When using [generateSpans] you can
57 /// additionally pass [sourceUrl] to indicate where the [input] was extracted 57 /// additionally pass [sourceUrl] to indicate where the [input] was extracted
58 /// from. 58 /// from.
59 DocumentFragment parseFragment(input, {String container: "div", 59 DocumentFragment parseFragment(input, {String container: "div",
60 String encoding, bool generateSpans: false, String sourceUrl}) { 60 String encoding, bool generateSpans: false, String sourceUrl}) {
61 var p = new HtmlParser(input, encoding: encoding, 61 var p = new HtmlParser(input, encoding: encoding,
62 generateSpans: generateSpans, sourceUrl: sourceUrl); 62 generateSpans: generateSpans, sourceUrl: sourceUrl);
63 return p.parseFragment(container); 63 return p.parseFragment(container);
64 } 64 }
65 65
66 66
67 /// Parser for HTML, which generates a tree structure from a stream of 67 /// Parser for HTML, which generates a tree structure from a stream of
68 /// (possibly malformed) characters. 68 /// (possibly malformed) characters.
69 class HtmlParser { 69 class HtmlParser {
70 /// Raise an exception on the first error encountered. 70 /// Raise an exception on the first error encountered.
71 final bool strict; 71 final bool strict;
72 72
73 /// True to generate [Span]s for the [Node.sourceSpan] property. 73 /// True to generate [SourceSpan]s for the [Node.sourceSpan] property.
74 final bool generateSpans; 74 final bool generateSpans;
75 75
76 final HtmlTokenizer tokenizer; 76 final HtmlTokenizer tokenizer;
77 77
78 final TreeBuilder tree; 78 final TreeBuilder tree;
79 79
80 final List<ParseError> errors = <ParseError>[]; 80 final List<ParseError> errors = <ParseError>[];
81 81
82 String container; 82 String container;
83 83
(...skipping 272 matching lines...) Expand 10 before | Expand all | Expand 10 after
356 reprocessPhases.add(phase); 356 reprocessPhases.add(phase);
357 reprocess = phase.processEOF(); 357 reprocess = phase.processEOF();
358 if (reprocess) { 358 if (reprocess) {
359 assert(!reprocessPhases.contains(phase)); 359 assert(!reprocessPhases.contains(phase));
360 } 360 }
361 } 361 }
362 } 362 }
363 363
364 /// The last span available. Used for EOF errors if we don't have something 364 /// The last span available. Used for EOF errors if we don't have something
365 /// better. 365 /// better.
366 Span get _lastSpan { 366 SourceSpan get _lastSpan {
367 var pos = tokenizer.stream.position; 367 var pos = tokenizer.stream.position;
368 return new FileSpan(tokenizer.stream.fileInfo, pos, pos); 368 return tokenizer.stream.fileInfo.location(pos).pointSpan();
369 } 369 }
370 370
371 void parseError(Span span, String errorcode, 371 void parseError(SourceSpan span, String errorcode,
372 [Map datavars = const {}]) { 372 [Map datavars = const {}]) {
373 373
374 if (!generateSpans && span == null) { 374 if (!generateSpans && span == null) {
375 span = _lastSpan; 375 span = _lastSpan;
376 } 376 }
377 377
378 var err = new ParseError(errorcode, span, datavars); 378 var err = new ParseError(errorcode, span, datavars);
379 errors.add(err); 379 errors.add(err);
380 if (strict) throw err; 380 if (strict) throw err;
381 } 381 }
(...skipping 1788 matching lines...) Expand 10 before | Expand all | Expand 10 after
2170 super(parser); 2170 super(parser);
2171 2171
2172 void flushCharacters() { 2172 void flushCharacters() {
2173 if (characterTokens.length == 0) return; 2173 if (characterTokens.length == 0) return;
2174 2174
2175 // TODO(sigmund,jmesserly): remove '' (dartbug.com/8480) 2175 // TODO(sigmund,jmesserly): remove '' (dartbug.com/8480)
2176 var data = characterTokens.map((t) => t.data).join(''); 2176 var data = characterTokens.map((t) => t.data).join('');
2177 var span = null; 2177 var span = null;
2178 2178
2179 if (parser.generateSpans) { 2179 if (parser.generateSpans) {
2180 span = new FileSpan.union( 2180 span = characterTokens[0].span.union(characterTokens.last.span);
2181 characterTokens[0].span,
2182 characterTokens.last.span);
2183 } 2181 }
2184 2182
2185 if (!allWhitespace(data)) { 2183 if (!allWhitespace(data)) {
2186 parser._inTablePhase.insertText(new CharactersToken(data)..span = span); 2184 parser._inTablePhase.insertText(new CharactersToken(data)..span = span);
2187 } else if (data.length > 0) { 2185 } else if (data.length > 0) {
2188 tree.insertText(data, span); 2186 tree.insertText(data, span);
2189 } 2187 }
2190 characterTokens = <StringToken>[]; 2188 characterTokens = <StringToken>[];
2191 } 2189 }
2192 2190
(...skipping 1133 matching lines...) Expand 10 before | Expand all | Expand 10 after
3326 3324
3327 Token processEndTag(EndTagToken token) { 3325 Token processEndTag(EndTagToken token) {
3328 parser.parseError(token.span, "expected-eof-but-got-end-tag", 3326 parser.parseError(token.span, "expected-eof-but-got-end-tag",
3329 {"name": token.name}); 3327 {"name": token.name});
3330 return null; 3328 return null;
3331 } 3329 }
3332 } 3330 }
3333 3331
3334 3332
3335 /// Error in parsed document. 3333 /// Error in parsed document.
3336 class ParseError implements Exception { 3334 class ParseError implements SourceSpanException {
3337 final String errorCode; 3335 final String errorCode;
3338 final Span span; 3336 final SourceSpan span;
3339 final Map data; 3337 final Map data;
3340 3338
3341 ParseError(this.errorCode, this.span, this.data); 3339 ParseError(this.errorCode, this.span, this.data);
3342 3340
3343 int get line => span.start.line; 3341 int get line => span.start.line;
3344 3342
3345 int get column => span.start.column; 3343 int get column => span.start.column;
3346 3344
3347 /// Gets the human readable error message for this error. Use 3345 /// Gets the human readable error message for this error. Use
3348 /// [span.getLocationMessage] or [toString] to get a message including span 3346 /// [span.getLocationMessage] or [toString] to get a message including span
3349 /// information. If there is a file associated with the span, both 3347 /// information. If there is a file associated with the span, both
3350 /// [span.getLocationMessage] and [toString] are equivalent. Otherwise, 3348 /// [span.getLocationMessage] and [toString] are equivalent. Otherwise,
3351 /// [span.getLocationMessage] will not show any source url information, but 3349 /// [span.getLocationMessage] will not show any source url information, but
3352 /// [toString] will include 'ParserError:' as a prefix. 3350 /// [toString] will include 'ParserError:' as a prefix.
3353 String get message => formatStr(errorMessages[errorCode], data); 3351 String get message => formatStr(errorMessages[errorCode], data);
3354 3352
3355 String toString() { 3353 String toString({color}) {
3356 var res = span.getLocationMessage(message); 3354 var res = span.message(message, color: color);
3357 return span.sourceUrl == null ? 'ParserError on $res' : 'On $res'; 3355 return span.sourceUrl == null ? 'ParserError on $res' : 'On $res';
3358 } 3356 }
3359 } 3357 }
3360 3358
3361 3359
3362 /// Convenience function to get the pair of namespace and localName. 3360 /// Convenience function to get the pair of namespace and localName.
3363 Pair<String, String> getElementNameTuple(Element e) { 3361 Pair<String, String> getElementNameTuple(Element e) {
3364 var ns = e.namespaceUri; 3362 var ns = e.namespaceUri;
3365 if (ns == null) ns = Namespaces.html; 3363 if (ns == null) ns = Namespaces.html;
3366 return new Pair(ns, e.localName); 3364 return new Pair(ns, e.localName);
3367 } 3365 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698