Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(654)

Side by Side Diff: pkg/third_party/html5lib/lib/parser.dart

Issue 157983005: pkg/third_party/html5lib: lots of cleanup (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: bump version Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /** 1 /**
2 * This library has a parser for HTML5 documents, that lets you parse HTML 2 * This library has a parser for HTML5 documents, that lets you parse HTML
3 * easily from a script or server side application: 3 * easily from a script or server side application:
4 * 4 *
5 * import 'package:html5lib/parser.dart' show parse; 5 * import 'package:html5lib/parser.dart' show parse;
6 * import 'package:html5lib/dom.dart'; 6 * import 'package:html5lib/dom.dart';
7 * main() { 7 * main() {
8 * var document = parse( 8 * var document = parse(
9 * '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!'); 9 * '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!');
10 * print(document.outerHtml); 10 * print(document.outerHtml);
11 * } 11 * }
12 * 12 *
13 * The resulting document you get back has a DOM-like API for easy tree 13 * The resulting document you get back has a DOM-like API for easy tree
14 * traversal and manipulation. 14 * traversal and manipulation.
15 */ 15 */
16 library parser; 16 library parser;
17 17
18 import 'dart:collection'; 18 import 'dart:collection';
19 import 'dart:math'; 19 import 'dart:math';
20 import 'package:source_maps/span.dart' show Span, FileSpan; 20 import 'package:source_maps/span.dart' show Span, FileSpan;
21 21
22 import 'src/treebuilder.dart'; 22 import 'src/treebuilder.dart';
23 import 'src/constants.dart'; 23 import 'src/constants.dart';
24 import 'src/encoding_parser.dart'; 24 import 'src/encoding_parser.dart';
25 import 'src/token.dart'; 25 import 'src/token.dart';
26 import 'src/tokenizer.dart'; 26 import 'src/tokenizer.dart';
27 import 'src/utils.dart'; 27 import 'src/utils.dart';
28 import 'dom.dart'; 28 import 'dom.dart';
29 import 'dom_parsing.dart';
30 29
31 /** 30 /**
32 * Parse the [input] html5 document into a tree. The [input] can be 31 * Parse the [input] html5 document into a tree. The [input] can be
33 * a [String], [List<int>] of bytes or an [HtmlTokenizer]. 32 * a [String], [List<int>] of bytes or an [HtmlTokenizer].
34 * 33 *
35 * If [input] is not a [HtmlTokenizer], you can optionally specify the file's 34 * If [input] is not a [HtmlTokenizer], you can optionally specify the file's
36 * [encoding], which must be a string. If specified, that encoding will be used, 35 * [encoding], which must be a string. If specified, that encoding will be used,
37 * regardless of any BOM or later declaration (such as in a meta element). 36 * regardless of any BOM or later declaration (such as in a meta element).
38 * 37 *
39 * Set [generateSpans] if you want to generate [Span]s, otherwise the 38 * Set [generateSpans] if you want to generate [Span]s, otherwise the
(...skipping 195 matching lines...) Expand 10 before | Expand all | Expand 10 after
235 void reset() { 234 void reset() {
236 tokenizer.reset(); 235 tokenizer.reset();
237 236
238 tree.reset(); 237 tree.reset();
239 firstStartTag = false; 238 firstStartTag = false;
240 errors.clear(); 239 errors.clear();
241 // "quirks" / "limited quirks" / "no quirks" 240 // "quirks" / "limited quirks" / "no quirks"
242 compatMode = "no quirks"; 241 compatMode = "no quirks";
243 242
244 if (innerHTMLMode) { 243 if (innerHTMLMode) {
245 if (cdataElements.contains(innerHTML)) { 244 if (CDATA_ELEMENTS.contains(innerHTML)) {
246 tokenizer.state = tokenizer.rcdataState; 245 tokenizer.state = tokenizer.rcdataState;
247 } else if (rcdataElements.contains(innerHTML)) { 246 } else if (RCDATA_ELEMENTS.contains(innerHTML)) {
248 tokenizer.state = tokenizer.rawtextState; 247 tokenizer.state = tokenizer.rawtextState;
249 } else if (innerHTML == 'plaintext') { 248 } else if (innerHTML == 'plaintext') {
250 tokenizer.state = tokenizer.plaintextState; 249 tokenizer.state = tokenizer.plaintextState;
251 } else { 250 } else {
252 // state already is data state 251 // state already is data state
253 // tokenizer.state = tokenizer.dataState; 252 // tokenizer.state = tokenizer.dataState;
254 } 253 }
255 phase = _beforeHtmlPhase; 254 phase = _beforeHtmlPhase;
256 _beforeHtmlPhase.insertHtmlElement(); 255 _beforeHtmlPhase.insertHtmlElement();
257 resetInsertionMode(); 256 resetInsertionMode();
258 } else { 257 } else {
259 phase = _initialPhase; 258 phase = _initialPhase;
260 } 259 }
261 260
262 lastPhase = null; 261 lastPhase = null;
263 beforeRCDataPhase = null; 262 beforeRCDataPhase = null;
264 framesetOK = true; 263 framesetOK = true;
265 } 264 }
266 265
267 bool isHTMLIntegrationPoint(Node element) { 266 bool isHTMLIntegrationPoint(Node element) {
268 if (element.tagName == "annotation-xml" && 267 if (element.tagName == "annotation-xml" &&
269 element.namespace == Namespaces.mathml) { 268 element.namespace == Namespaces.mathml) {
270 var enc = element.attributes["encoding"]; 269 var enc = element.attributes["encoding"];
271 if (enc != null) enc = asciiUpper2Lower(enc); 270 if (enc != null) enc = asciiUpper2Lower(enc);
272 return enc == "text/html" || enc == "application/xhtml+xml"; 271 return enc == "text/html" || enc == "application/xhtml+xml";
273 } else { 272 } else {
274 return htmlIntegrationPointElements.contains( 273 return HTML_INTEGRATION_POINT_ELEMENTS.contains(
275 new Pair(element.namespace, element.tagName)); 274 new Pair(element.namespace, element.tagName));
276 } 275 }
277 } 276 }
278 277
279 bool isMathMLTextIntegrationPoint(Node element) { 278 bool isMathMLTextIntegrationPoint(Node element) {
280 return mathmlTextIntegrationPointElements.contains( 279 return MATHML_TEXT_INTEGRATION_POINT_ELEMENTS.contains(
281 new Pair(element.namespace, element.tagName)); 280 new Pair(element.namespace, element.tagName));
282 } 281 }
283 282
284 bool inForeignContent(Token token, int type) { 283 bool inForeignContent(Token token, int type) {
285 if (tree.openElements.length == 0) return false; 284 if (tree.openElements.length == 0) return false;
286 285
287 var node = tree.openElements.last; 286 var node = tree.openElements.last;
288 if (node.namespace == tree.defaultNamespace) return false; 287 if (node.namespace == tree.defaultNamespace) return false;
289 288
290 if (isMathMLTextIntegrationPoint(node)) { 289 if (isMathMLTextIntegrationPoint(node)) {
(...skipping 928 matching lines...) Expand 10 before | Expand all | Expand 10 after
1219 return true; 1218 return true;
1220 } 1219 }
1221 1220
1222 // helper 1221 // helper
1223 void addFormattingElement(token) { 1222 void addFormattingElement(token) {
1224 tree.insertElement(token); 1223 tree.insertElement(token);
1225 var element = tree.openElements.last; 1224 var element = tree.openElements.last;
1226 1225
1227 var matchingElements = []; 1226 var matchingElements = [];
1228 for (Node node in tree.activeFormattingElements.reversed) { 1227 for (Node node in tree.activeFormattingElements.reversed) {
1229 if (node == Marker) { 1228 if (node == MARKER) {
1230 break; 1229 break;
1231 } else if (isMatchingFormattingElement(node, element)) { 1230 } else if (isMatchingFormattingElement(node, element)) {
1232 matchingElements.add(node); 1231 matchingElements.add(node);
1233 } 1232 }
1234 } 1233 }
1235 1234
1236 assert(matchingElements.length <= 3); 1235 assert(matchingElements.length <= 3);
1237 if (matchingElements.length == 3) { 1236 if (matchingElements.length == 3) {
1238 tree.activeFormattingElements.remove(matchingElements.last); 1237 tree.activeFormattingElements.remove(matchingElements.last);
1239 } 1238 }
(...skipping 124 matching lines...) Expand 10 before | Expand all | Expand 10 after
1364 1363
1365 final stopNamesMap = const {"li": const ["li"], 1364 final stopNamesMap = const {"li": const ["li"],
1366 "dt": const ["dt", "dd"], 1365 "dt": const ["dt", "dd"],
1367 "dd": const ["dt", "dd"]}; 1366 "dd": const ["dt", "dd"]};
1368 var stopNames = stopNamesMap[token.name]; 1367 var stopNames = stopNamesMap[token.name];
1369 for (Node node in tree.openElements.reversed) { 1368 for (Node node in tree.openElements.reversed) {
1370 if (stopNames.contains(node.tagName)) { 1369 if (stopNames.contains(node.tagName)) {
1371 parser.phase.processEndTag(new EndTagToken(node.tagName)); 1370 parser.phase.processEndTag(new EndTagToken(node.tagName));
1372 break; 1371 break;
1373 } 1372 }
1374 if (specialElements.contains(node.nameTuple) && 1373 if (SPECIAL_ELEMENTS.contains(node.nameTuple) &&
1375 !const ["address", "div", "p"].contains(node.tagName)) { 1374 !const ["address", "div", "p"].contains(node.tagName)) {
1376 break; 1375 break;
1377 } 1376 }
1378 } 1377 }
1379 1378
1380 if (tree.elementInScope("p", variant: "button")) { 1379 if (tree.elementInScope("p", variant: "button")) {
1381 parser.phase.processEndTag(new EndTagToken("p")); 1380 parser.phase.processEndTag(new EndTagToken("p"));
1382 } 1381 }
1383 1382
1384 tree.insertElement(token); 1383 tree.insertElement(token);
1385 } 1384 }
1386 1385
1387 void startTagPlaintext(StartTagToken token) { 1386 void startTagPlaintext(StartTagToken token) {
1388 if (tree.elementInScope("p", variant: "button")) { 1387 if (tree.elementInScope("p", variant: "button")) {
1389 endTagP(new EndTagToken("p")); 1388 endTagP(new EndTagToken("p"));
1390 } 1389 }
1391 tree.insertElement(token); 1390 tree.insertElement(token);
1392 parser.tokenizer.state = parser.tokenizer.plaintextState; 1391 parser.tokenizer.state = parser.tokenizer.plaintextState;
1393 } 1392 }
1394 1393
1395 void startTagHeading(StartTagToken token) { 1394 void startTagHeading(StartTagToken token) {
1396 if (tree.elementInScope("p", variant: "button")) { 1395 if (tree.elementInScope("p", variant: "button")) {
1397 endTagP(new EndTagToken("p")); 1396 endTagP(new EndTagToken("p"));
1398 } 1397 }
1399 if (headingElements.contains(tree.openElements.last.tagName)) { 1398 if (HEADING_ELEMENTS.contains(tree.openElements.last.tagName)) {
1400 parser.parseError(token.span, "unexpected-start-tag", 1399 parser.parseError(token.span, "unexpected-start-tag",
1401 {"name": token.name}); 1400 {"name": token.name});
1402 tree.openElements.removeLast(); 1401 tree.openElements.removeLast();
1403 } 1402 }
1404 tree.insertElement(token); 1403 tree.insertElement(token);
1405 } 1404 }
1406 1405
1407 void startTagA(StartTagToken token) { 1406 void startTagA(StartTagToken token) {
1408 var afeAElement = tree.elementInActiveFormattingElements("a"); 1407 var afeAElement = tree.elementInActiveFormattingElements("a");
1409 if (afeAElement != null) { 1408 if (afeAElement != null) {
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
1443 } else { 1442 } else {
1444 tree.reconstructActiveFormattingElements(); 1443 tree.reconstructActiveFormattingElements();
1445 tree.insertElement(token); 1444 tree.insertElement(token);
1446 parser.framesetOK = false; 1445 parser.framesetOK = false;
1447 } 1446 }
1448 } 1447 }
1449 1448
1450 void startTagAppletMarqueeObject(StartTagToken token) { 1449 void startTagAppletMarqueeObject(StartTagToken token) {
1451 tree.reconstructActiveFormattingElements(); 1450 tree.reconstructActiveFormattingElements();
1452 tree.insertElement(token); 1451 tree.insertElement(token);
1453 tree.activeFormattingElements.add(Marker); 1452 tree.activeFormattingElements.add(MARKER);
1454 parser.framesetOK = false; 1453 parser.framesetOK = false;
1455 } 1454 }
1456 1455
1457 void startTagXmp(StartTagToken token) { 1456 void startTagXmp(StartTagToken token) {
1458 if (tree.elementInScope("p", variant: "button")) { 1457 if (tree.elementInScope("p", variant: "button")) {
1459 endTagP(new EndTagToken("p")); 1458 endTagP(new EndTagToken("p"));
1460 } 1459 }
1461 tree.reconstructActiveFormattingElements(); 1460 tree.reconstructActiveFormattingElements();
1462 parser.framesetOK = false; 1461 parser.framesetOK = false;
1463 parser.parseRCDataRawtext(token, "RAWTEXT"); 1462 parser.parseRCDataRawtext(token, "RAWTEXT");
(...skipping 265 matching lines...) Expand 10 before | Expand all | Expand 10 after
1729 } else { 1728 } else {
1730 tree.generateImpliedEndTags(token.name); 1729 tree.generateImpliedEndTags(token.name);
1731 if (tree.openElements.last.tagName != token.name) { 1730 if (tree.openElements.last.tagName != token.name) {
1732 parser.parseError(token.span, "end-tag-too-early", {"name": token.name}); 1731 parser.parseError(token.span, "end-tag-too-early", {"name": token.name});
1733 } 1732 }
1734 popOpenElementsUntil(token.name); 1733 popOpenElementsUntil(token.name);
1735 } 1734 }
1736 } 1735 }
1737 1736
1738 void endTagHeading(EndTagToken token) { 1737 void endTagHeading(EndTagToken token) {
1739 for (var item in headingElements) { 1738 for (var item in HEADING_ELEMENTS) {
1740 if (tree.elementInScope(item)) { 1739 if (tree.elementInScope(item)) {
1741 tree.generateImpliedEndTags(); 1740 tree.generateImpliedEndTags();
1742 break; 1741 break;
1743 } 1742 }
1744 } 1743 }
1745 if (tree.openElements.last.tagName != token.name) { 1744 if (tree.openElements.last.tagName != token.name) {
1746 parser.parseError(token.span, "end-tag-too-early", {"name": token.name}); 1745 parser.parseError(token.span, "end-tag-too-early", {"name": token.name});
1747 } 1746 }
1748 1747
1749 for (var item in headingElements) { 1748 for (var item in HEADING_ELEMENTS) {
1750 if (tree.elementInScope(item)) { 1749 if (tree.elementInScope(item)) {
1751 item = tree.openElements.removeLast(); 1750 item = tree.openElements.removeLast();
1752 while (!headingElements.contains(item.tagName)) { 1751 while (!HEADING_ELEMENTS.contains(item.tagName)) {
1753 item = tree.openElements.removeLast(); 1752 item = tree.openElements.removeLast();
1754 } 1753 }
1755 break; 1754 break;
1756 } 1755 }
1757 } 1756 }
1758 } 1757 }
1759 1758
1760 /** The much-feared adoption agency algorithm. */ 1759 /** The much-feared adoption agency algorithm. */
1761 endTagFormatting(EndTagToken token) { 1760 endTagFormatting(EndTagToken token) {
1762 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adoptionAgency 1761 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adoptionAgency
(...skipping 26 matching lines...) Expand all
1789 if (formattingElement != tree.openElements.last) { 1788 if (formattingElement != tree.openElements.last) {
1790 parser.parseError(token.span, "adoption-agency-1.3", 1789 parser.parseError(token.span, "adoption-agency-1.3",
1791 {"name": token.name}); 1790 {"name": token.name});
1792 } 1791 }
1793 1792
1794 // Step 2 1793 // Step 2
1795 // Start of the adoption agency algorithm proper 1794 // Start of the adoption agency algorithm proper
1796 var afeIndex = tree.openElements.indexOf(formattingElement); 1795 var afeIndex = tree.openElements.indexOf(formattingElement);
1797 Node furthestBlock = null; 1796 Node furthestBlock = null;
1798 for (Node element in slice(tree.openElements, afeIndex)) { 1797 for (Node element in slice(tree.openElements, afeIndex)) {
1799 if (specialElements.contains(element.nameTuple)) { 1798 if (SPECIAL_ELEMENTS.contains(element.nameTuple)) {
1800 furthestBlock = element; 1799 furthestBlock = element;
1801 break; 1800 break;
1802 } 1801 }
1803 } 1802 }
1804 // Step 3 1803 // Step 3
1805 if (furthestBlock == null) { 1804 if (furthestBlock == null) {
1806 var element = tree.openElements.removeLast(); 1805 var element = tree.openElements.removeLast();
1807 while (element != formattingElement) { 1806 while (element != formattingElement) {
1808 element = tree.openElements.removeLast(); 1807 element = tree.openElements.removeLast();
1809 } 1808 }
(...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after
1926 for (Node node in tree.openElements.reversed) { 1925 for (Node node in tree.openElements.reversed) {
1927 if (node.tagName == token.name) { 1926 if (node.tagName == token.name) {
1928 tree.generateImpliedEndTags(token.name); 1927 tree.generateImpliedEndTags(token.name);
1929 if (tree.openElements.last.tagName != token.name) { 1928 if (tree.openElements.last.tagName != token.name) {
1930 parser.parseError(token.span, "unexpected-end-tag", 1929 parser.parseError(token.span, "unexpected-end-tag",
1931 {"name": token.name}); 1930 {"name": token.name});
1932 } 1931 }
1933 while (tree.openElements.removeLast() != node); 1932 while (tree.openElements.removeLast() != node);
1934 break; 1933 break;
1935 } else { 1934 } else {
1936 if (specialElements.contains(node.nameTuple)) { 1935 if (SPECIAL_ELEMENTS.contains(node.nameTuple)) {
1937 parser.parseError(token.span, "unexpected-end-tag", 1936 parser.parseError(token.span, "unexpected-end-tag",
1938 {"name": token.name}); 1937 {"name": token.name});
1939 break; 1938 break;
1940 } 1939 }
1941 } 1940 }
1942 } 1941 }
1943 } 1942 }
1944 } 1943 }
1945 1944
1946 1945
(...skipping 106 matching lines...) Expand 10 before | Expand all | Expand 10 after
2053 void insertText(CharactersToken token) { 2052 void insertText(CharactersToken token) {
2054 // If we get here there must be at least one non-whitespace character 2053 // If we get here there must be at least one non-whitespace character
2055 // Do the table magic! 2054 // Do the table magic!
2056 tree.insertFromTable = true; 2055 tree.insertFromTable = true;
2057 parser._inBodyPhase.processCharacters(token); 2056 parser._inBodyPhase.processCharacters(token);
2058 tree.insertFromTable = false; 2057 tree.insertFromTable = false;
2059 } 2058 }
2060 2059
2061 void startTagCaption(StartTagToken token) { 2060 void startTagCaption(StartTagToken token) {
2062 clearStackToTableContext(); 2061 clearStackToTableContext();
2063 tree.activeFormattingElements.add(Marker); 2062 tree.activeFormattingElements.add(MARKER);
2064 tree.insertElement(token); 2063 tree.insertElement(token);
2065 parser.phase = parser._inCaptionPhase; 2064 parser.phase = parser._inCaptionPhase;
2066 } 2065 }
2067 2066
2068 void startTagColgroup(StartTagToken token) { 2067 void startTagColgroup(StartTagToken token) {
2069 clearStackToTableContext(); 2068 clearStackToTableContext();
2070 tree.insertElement(token); 2069 tree.insertElement(token);
2071 parser.phase = parser._inColumnGroupPhase; 2070 parser.phase = parser._inColumnGroupPhase;
2072 } 2071 }
2073 2072
(...skipping 495 matching lines...) Expand 10 before | Expand all | Expand 10 after
2569 } 2568 }
2570 2569
2571 Token processCharacters(CharactersToken token) { 2570 Token processCharacters(CharactersToken token) {
2572 return parser._inTablePhase.processCharacters(token); 2571 return parser._inTablePhase.processCharacters(token);
2573 } 2572 }
2574 2573
2575 void startTagTableCell(StartTagToken token) { 2574 void startTagTableCell(StartTagToken token) {
2576 clearStackToTableRowContext(); 2575 clearStackToTableRowContext();
2577 tree.insertElement(token); 2576 tree.insertElement(token);
2578 parser.phase = parser._inCellPhase; 2577 parser.phase = parser._inCellPhase;
2579 tree.activeFormattingElements.add(Marker); 2578 tree.activeFormattingElements.add(MARKER);
2580 } 2579 }
2581 2580
2582 Token startTagTableOther(StartTagToken token) { 2581 Token startTagTableOther(StartTagToken token) {
2583 bool ignoreEndTag = ignoreEndTagTr(); 2582 bool ignoreEndTag = ignoreEndTagTr();
2584 endTagTr(new EndTagToken("tr")); 2583 endTagTr(new EndTagToken("tr"));
2585 // XXX how are we sure it's always ignored in the innerHTML case? 2584 // XXX how are we sure it's always ignored in the innerHTML case?
2586 return ignoreEndTag ? null : token; 2585 return ignoreEndTag ? null : token;
2587 } 2586 }
2588 2587
2589 Token startTagOther(StartTagToken token) { 2588 Token startTagOther(StartTagToken token) {
(...skipping 739 matching lines...) Expand 10 before | Expand all | Expand 10 after
3329 int get column => span.start.column; 3328 int get column => span.start.column;
3330 3329
3331 /** 3330 /**
3332 * Gets the human readable error message for this error. Use 3331 * Gets the human readable error message for this error. Use
3333 * [span.getLocationMessage] or [toString] to get a message including span 3332 * [span.getLocationMessage] or [toString] to get a message including span
3334 * information. If there is a file associated with the span, both 3333 * information. If there is a file associated with the span, both
3335 * [span.getLocationMessage] and [toString] are equivalent. Otherwise, 3334 * [span.getLocationMessage] and [toString] are equivalent. Otherwise,
3336 * [span.getLocationMessage] will not show any source url information, but 3335 * [span.getLocationMessage] will not show any source url information, but
3337 * [toString] will include 'ParserError:' as a prefix. 3336 * [toString] will include 'ParserError:' as a prefix.
3338 */ 3337 */
3339 String get message => formatStr(errorMessages[errorCode], data); 3338 String get message => formatStr(ERROR_MESSAGES[errorCode], data);
3340 3339
3341 String toString() { 3340 String toString() {
3342 var res = span.getLocationMessage(message); 3341 var res = span.getLocationMessage(message);
3343 return span.sourceUrl == null ? 'ParserError$res' : res; 3342 return span.sourceUrl == null ? 'ParserError$res' : res;
3344 } 3343 }
3345 } 3344 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698