pkg/third_party/html5lib/lib/parser.dart - Issue 157983005: pkg/third_party/html5lib: lots of cleanup

Side by Side Diff: pkg/third_party/html5lib/lib/parser.dart

Issue 157983005: pkg/third_party/html5lib: lots of cleanup (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: bump version Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « pkg/third_party/html5lib/lib/dom_parsing.dart ('k') | pkg/third_party/html5lib/lib/src/constants.dart » ('j') | pkg/third_party/html5lib/lib/src/constants.dart » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /**	1 /**

2 * This library has a parser for HTML5 documents, that lets you parse HTML	2 * This library has a parser for HTML5 documents, that lets you parse HTML

3 * easily from a script or server side application:	3 * easily from a script or server side application:

4 *	4 *

5 * import 'package:html5lib/parser.dart' show parse;	5 * import 'package:html5lib/parser.dart' show parse;

6 * import 'package:html5lib/dom.dart';	6 * import 'package:html5lib/dom.dart';

7 * main() {	7 * main() {

8 * var document = parse(	8 * var document = parse(

9 * '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!');	9 * '<body>Hello world! <a href="www.html5rocks.com">HTML5 rocks!');

10 * print(document.outerHtml);	10 * print(document.outerHtml);

11 * }	11 * }

12 *	12 *

13 * The resulting document you get back has a DOM-like API for easy tree	13 * The resulting document you get back has a DOM-like API for easy tree

14 * traversal and manipulation.	14 * traversal and manipulation.

15 */	15 */

16 library parser;	16 library parser;

17	17

18 import 'dart:collection';	18 import 'dart:collection';

19 import 'dart:math';	19 import 'dart:math';

20 import 'package:source_maps/span.dart' show Span, FileSpan;	20 import 'package:source_maps/span.dart' show Span, FileSpan;

21	21

22 import 'src/treebuilder.dart';	22 import 'src/treebuilder.dart';

23 import 'src/constants.dart';	23 import 'src/constants.dart';

24 import 'src/encoding_parser.dart';	24 import 'src/encoding_parser.dart';

25 import 'src/token.dart';	25 import 'src/token.dart';

26 import 'src/tokenizer.dart';	26 import 'src/tokenizer.dart';

27 import 'src/utils.dart';	27 import 'src/utils.dart';

28 import 'dom.dart';	28 import 'dom.dart';

29 import 'dom_parsing.dart';

30	29

31 /**	30 /**

32 * Parse the [input] html5 document into a tree. The [input] can be	31 * Parse the [input] html5 document into a tree. The [input] can be

33 * a [String], [List<int>] of bytes or an [HtmlTokenizer].	32 * a [String], [List<int>] of bytes or an [HtmlTokenizer].

34 *	33 *

35 * If [input] is not a [HtmlTokenizer], you can optionally specify the file's	34 * If [input] is not a [HtmlTokenizer], you can optionally specify the file's

36 * [encoding], which must be a string. If specified, that encoding will be used,	35 * [encoding], which must be a string. If specified, that encoding will be used,

37 * regardless of any BOM or later declaration (such as in a meta element).	36 * regardless of any BOM or later declaration (such as in a meta element).

38 *	37 *

39 * Set [generateSpans] if you want to generate [Span]s, otherwise the	38 * Set [generateSpans] if you want to generate [Span]s, otherwise the

(...skipping 195 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
235 void reset() {	234 void reset() {

236 tokenizer.reset();	235 tokenizer.reset();

237	236

238 tree.reset();	237 tree.reset();

239 firstStartTag = false;	238 firstStartTag = false;

240 errors.clear();	239 errors.clear();

241 // "quirks" / "limited quirks" / "no quirks"	240 // "quirks" / "limited quirks" / "no quirks"

242 compatMode = "no quirks";	241 compatMode = "no quirks";

243	242

244 if (innerHTMLMode) {	243 if (innerHTMLMode) {

245 if (cdataElements.contains(innerHTML)) {	244 if (CDATA_ELEMENTS.contains(innerHTML)) {

246 tokenizer.state = tokenizer.rcdataState;	245 tokenizer.state = tokenizer.rcdataState;

247 } else if (rcdataElements.contains(innerHTML)) {	246 } else if (RCDATA_ELEMENTS.contains(innerHTML)) {

248 tokenizer.state = tokenizer.rawtextState;	247 tokenizer.state = tokenizer.rawtextState;

249 } else if (innerHTML == 'plaintext') {	248 } else if (innerHTML == 'plaintext') {

250 tokenizer.state = tokenizer.plaintextState;	249 tokenizer.state = tokenizer.plaintextState;

251 } else {	250 } else {

252 // state already is data state	251 // state already is data state

253 // tokenizer.state = tokenizer.dataState;	252 // tokenizer.state = tokenizer.dataState;

254 }	253 }

255 phase = _beforeHtmlPhase;	254 phase = _beforeHtmlPhase;

256 _beforeHtmlPhase.insertHtmlElement();	255 _beforeHtmlPhase.insertHtmlElement();

257 resetInsertionMode();	256 resetInsertionMode();

258 } else {	257 } else {

259 phase = _initialPhase;	258 phase = _initialPhase;

260 }	259 }

261	260

262 lastPhase = null;	261 lastPhase = null;

263 beforeRCDataPhase = null;	262 beforeRCDataPhase = null;

264 framesetOK = true;	263 framesetOK = true;

265 }	264 }

266	265

267 bool isHTMLIntegrationPoint(Node element) {	266 bool isHTMLIntegrationPoint(Node element) {

268 if (element.tagName == "annotation-xml" &&	267 if (element.tagName == "annotation-xml" &&

269 element.namespace == Namespaces.mathml) {	268 element.namespace == Namespaces.mathml) {

270 var enc = element.attributes["encoding"];	269 var enc = element.attributes["encoding"];

271 if (enc != null) enc = asciiUpper2Lower(enc);	270 if (enc != null) enc = asciiUpper2Lower(enc);

272 return enc == "text/html" || enc == "application/xhtml+xml";	271 return enc == "text/html" || enc == "application/xhtml+xml";

273 } else {	272 } else {

274 return htmlIntegrationPointElements.contains(	273 return HTML_INTEGRATION_POINT_ELEMENTS.contains(

275 new Pair(element.namespace, element.tagName));	274 new Pair(element.namespace, element.tagName));

276 }	275 }

277 }	276 }

278	277

279 bool isMathMLTextIntegrationPoint(Node element) {	278 bool isMathMLTextIntegrationPoint(Node element) {

280 return mathmlTextIntegrationPointElements.contains(	279 return MATHML_TEXT_INTEGRATION_POINT_ELEMENTS.contains(

281 new Pair(element.namespace, element.tagName));	280 new Pair(element.namespace, element.tagName));

282 }	281 }

283	282

284 bool inForeignContent(Token token, int type) {	283 bool inForeignContent(Token token, int type) {

285 if (tree.openElements.length == 0) return false;	284 if (tree.openElements.length == 0) return false;

286	285

287 var node = tree.openElements.last;	286 var node = tree.openElements.last;

288 if (node.namespace == tree.defaultNamespace) return false;	287 if (node.namespace == tree.defaultNamespace) return false;

289	288

290 if (isMathMLTextIntegrationPoint(node)) {	289 if (isMathMLTextIntegrationPoint(node)) {

(...skipping 928 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1219 return true;	1218 return true;

1220 }	1219 }

1221	1220

1222 // helper	1221 // helper

1223 void addFormattingElement(token) {	1222 void addFormattingElement(token) {

1224 tree.insertElement(token);	1223 tree.insertElement(token);

1225 var element = tree.openElements.last;	1224 var element = tree.openElements.last;

1226	1225

1227 var matchingElements = [];	1226 var matchingElements = [];

1228 for (Node node in tree.activeFormattingElements.reversed) {	1227 for (Node node in tree.activeFormattingElements.reversed) {

1229 if (node == Marker) {	1228 if (node == MARKER) {

1230 break;	1229 break;

1231 } else if (isMatchingFormattingElement(node, element)) {	1230 } else if (isMatchingFormattingElement(node, element)) {

1232 matchingElements.add(node);	1231 matchingElements.add(node);

1233 }	1232 }

1234 }	1233 }

1235	1234

1236 assert(matchingElements.length <= 3);	1235 assert(matchingElements.length <= 3);

1237 if (matchingElements.length == 3) {	1236 if (matchingElements.length == 3) {

1238 tree.activeFormattingElements.remove(matchingElements.last);	1237 tree.activeFormattingElements.remove(matchingElements.last);

1239 }	1238 }

(...skipping 124 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1364	1363

1365 final stopNamesMap = const {"li": const ["li"],	1364 final stopNamesMap = const {"li": const ["li"],

1366 "dt": const ["dt", "dd"],	1365 "dt": const ["dt", "dd"],

1367 "dd": const ["dt", "dd"]};	1366 "dd": const ["dt", "dd"]};

1368 var stopNames = stopNamesMap[token.name];	1367 var stopNames = stopNamesMap[token.name];

1369 for (Node node in tree.openElements.reversed) {	1368 for (Node node in tree.openElements.reversed) {

1370 if (stopNames.contains(node.tagName)) {	1369 if (stopNames.contains(node.tagName)) {

1371 parser.phase.processEndTag(new EndTagToken(node.tagName));	1370 parser.phase.processEndTag(new EndTagToken(node.tagName));

1372 break;	1371 break;

1373 }	1372 }

1374 if (specialElements.contains(node.nameTuple) &&	1373 if (SPECIAL_ELEMENTS.contains(node.nameTuple) &&

1375 !const ["address", "div", "p"].contains(node.tagName)) {	1374 !const ["address", "div", "p"].contains(node.tagName)) {

1376 break;	1375 break;

1377 }	1376 }

1378 }	1377 }

1379	1378

1380 if (tree.elementInScope("p", variant: "button")) {	1379 if (tree.elementInScope("p", variant: "button")) {

1381 parser.phase.processEndTag(new EndTagToken("p"));	1380 parser.phase.processEndTag(new EndTagToken("p"));

1382 }	1381 }

1383	1382

1384 tree.insertElement(token);	1383 tree.insertElement(token);

1385 }	1384 }

1386	1385

1387 void startTagPlaintext(StartTagToken token) {	1386 void startTagPlaintext(StartTagToken token) {

1388 if (tree.elementInScope("p", variant: "button")) {	1387 if (tree.elementInScope("p", variant: "button")) {

1389 endTagP(new EndTagToken("p"));	1388 endTagP(new EndTagToken("p"));

1390 }	1389 }

1391 tree.insertElement(token);	1390 tree.insertElement(token);

1392 parser.tokenizer.state = parser.tokenizer.plaintextState;	1391 parser.tokenizer.state = parser.tokenizer.plaintextState;

1393 }	1392 }

1394	1393

1395 void startTagHeading(StartTagToken token) {	1394 void startTagHeading(StartTagToken token) {

1396 if (tree.elementInScope("p", variant: "button")) {	1395 if (tree.elementInScope("p", variant: "button")) {

1397 endTagP(new EndTagToken("p"));	1396 endTagP(new EndTagToken("p"));

1398 }	1397 }

1399 if (headingElements.contains(tree.openElements.last.tagName)) {	1398 if (HEADING_ELEMENTS.contains(tree.openElements.last.tagName)) {

1400 parser.parseError(token.span, "unexpected-start-tag",	1399 parser.parseError(token.span, "unexpected-start-tag",

1401 {"name": token.name});	1400 {"name": token.name});

1402 tree.openElements.removeLast();	1401 tree.openElements.removeLast();

1403 }	1402 }

1404 tree.insertElement(token);	1403 tree.insertElement(token);

1405 }	1404 }

1406	1405

1407 void startTagA(StartTagToken token) {	1406 void startTagA(StartTagToken token) {

1408 var afeAElement = tree.elementInActiveFormattingElements("a");	1407 var afeAElement = tree.elementInActiveFormattingElements("a");

1409 if (afeAElement != null) {	1408 if (afeAElement != null) {

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1443 } else {	1442 } else {

1444 tree.reconstructActiveFormattingElements();	1443 tree.reconstructActiveFormattingElements();

1445 tree.insertElement(token);	1444 tree.insertElement(token);

1446 parser.framesetOK = false;	1445 parser.framesetOK = false;

1447 }	1446 }

1448 }	1447 }

1449	1448

1450 void startTagAppletMarqueeObject(StartTagToken token) {	1449 void startTagAppletMarqueeObject(StartTagToken token) {

1451 tree.reconstructActiveFormattingElements();	1450 tree.reconstructActiveFormattingElements();

1452 tree.insertElement(token);	1451 tree.insertElement(token);

1453 tree.activeFormattingElements.add(Marker);	1452 tree.activeFormattingElements.add(MARKER);

1454 parser.framesetOK = false;	1453 parser.framesetOK = false;

1455 }	1454 }

1456	1455

1457 void startTagXmp(StartTagToken token) {	1456 void startTagXmp(StartTagToken token) {

1458 if (tree.elementInScope("p", variant: "button")) {	1457 if (tree.elementInScope("p", variant: "button")) {

1459 endTagP(new EndTagToken("p"));	1458 endTagP(new EndTagToken("p"));

1460 }	1459 }

1461 tree.reconstructActiveFormattingElements();	1460 tree.reconstructActiveFormattingElements();

1462 parser.framesetOK = false;	1461 parser.framesetOK = false;

1463 parser.parseRCDataRawtext(token, "RAWTEXT");	1462 parser.parseRCDataRawtext(token, "RAWTEXT");

(...skipping 265 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1729 } else {	1728 } else {

1730 tree.generateImpliedEndTags(token.name);	1729 tree.generateImpliedEndTags(token.name);

1731 if (tree.openElements.last.tagName != token.name) {	1730 if (tree.openElements.last.tagName != token.name) {

1732 parser.parseError(token.span, "end-tag-too-early", {"name": token.name});	1731 parser.parseError(token.span, "end-tag-too-early", {"name": token.name});

1733 }	1732 }

1734 popOpenElementsUntil(token.name);	1733 popOpenElementsUntil(token.name);

1735 }	1734 }

1736 }	1735 }

1737	1736

1738 void endTagHeading(EndTagToken token) {	1737 void endTagHeading(EndTagToken token) {

1739 for (var item in headingElements) {	1738 for (var item in HEADING_ELEMENTS) {

1740 if (tree.elementInScope(item)) {	1739 if (tree.elementInScope(item)) {

1741 tree.generateImpliedEndTags();	1740 tree.generateImpliedEndTags();

1742 break;	1741 break;

1743 }	1742 }

1744 }	1743 }

1745 if (tree.openElements.last.tagName != token.name) {	1744 if (tree.openElements.last.tagName != token.name) {

1746 parser.parseError(token.span, "end-tag-too-early", {"name": token.name});	1745 parser.parseError(token.span, "end-tag-too-early", {"name": token.name});

1747 }	1746 }

1748	1747

1749 for (var item in headingElements) {	1748 for (var item in HEADING_ELEMENTS) {

1750 if (tree.elementInScope(item)) {	1749 if (tree.elementInScope(item)) {

1751 item = tree.openElements.removeLast();	1750 item = tree.openElements.removeLast();

1752 while (!headingElements.contains(item.tagName)) {	1751 while (!HEADING_ELEMENTS.contains(item.tagName)) {

1753 item = tree.openElements.removeLast();	1752 item = tree.openElements.removeLast();

1754 }	1753 }

1755 break;	1754 break;

1756 }	1755 }

1757 }	1756 }

1758 }	1757 }

1759	1758

1760 /** The much-feared adoption agency algorithm. */	1759 /** The much-feared adoption agency algorithm. */

1761 endTagFormatting(EndTagToken token) {	1760 endTagFormatting(EndTagToken token) {

1762 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adoptionAgency	1761 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adoptionAgency

(...skipping 26 matching lines...) Expand all Loading...
1789 if (formattingElement != tree.openElements.last) {	1788 if (formattingElement != tree.openElements.last) {

1790 parser.parseError(token.span, "adoption-agency-1.3",	1789 parser.parseError(token.span, "adoption-agency-1.3",

1791 {"name": token.name});	1790 {"name": token.name});

1792 }	1791 }

1793	1792

1794 // Step 2	1793 // Step 2

1795 // Start of the adoption agency algorithm proper	1794 // Start of the adoption agency algorithm proper

1796 var afeIndex = tree.openElements.indexOf(formattingElement);	1795 var afeIndex = tree.openElements.indexOf(formattingElement);

1797 Node furthestBlock = null;	1796 Node furthestBlock = null;

1798 for (Node element in slice(tree.openElements, afeIndex)) {	1797 for (Node element in slice(tree.openElements, afeIndex)) {

1799 if (specialElements.contains(element.nameTuple)) {	1798 if (SPECIAL_ELEMENTS.contains(element.nameTuple)) {

1800 furthestBlock = element;	1799 furthestBlock = element;

1801 break;	1800 break;

1802 }	1801 }

1803 }	1802 }

1804 // Step 3	1803 // Step 3

1805 if (furthestBlock == null) {	1804 if (furthestBlock == null) {

1806 var element = tree.openElements.removeLast();	1805 var element = tree.openElements.removeLast();

1807 while (element != formattingElement) {	1806 while (element != formattingElement) {

1808 element = tree.openElements.removeLast();	1807 element = tree.openElements.removeLast();

1809 }	1808 }

(...skipping 116 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1926 for (Node node in tree.openElements.reversed) {	1925 for (Node node in tree.openElements.reversed) {

1927 if (node.tagName == token.name) {	1926 if (node.tagName == token.name) {

1928 tree.generateImpliedEndTags(token.name);	1927 tree.generateImpliedEndTags(token.name);

1929 if (tree.openElements.last.tagName != token.name) {	1928 if (tree.openElements.last.tagName != token.name) {

1930 parser.parseError(token.span, "unexpected-end-tag",	1929 parser.parseError(token.span, "unexpected-end-tag",

1931 {"name": token.name});	1930 {"name": token.name});

1932 }	1931 }

1933 while (tree.openElements.removeLast() != node);	1932 while (tree.openElements.removeLast() != node);

1934 break;	1933 break;

1935 } else {	1934 } else {

1936 if (specialElements.contains(node.nameTuple)) {	1935 if (SPECIAL_ELEMENTS.contains(node.nameTuple)) {

1937 parser.parseError(token.span, "unexpected-end-tag",	1936 parser.parseError(token.span, "unexpected-end-tag",

1938 {"name": token.name});	1937 {"name": token.name});

1939 break;	1938 break;

1940 }	1939 }

1941 }	1940 }

1942 }	1941 }

1943 }	1942 }

1944 }	1943 }

1945	1944

1946	1945

(...skipping 106 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2053 void insertText(CharactersToken token) {	2052 void insertText(CharactersToken token) {

2054 // If we get here there must be at least one non-whitespace character	2053 // If we get here there must be at least one non-whitespace character

2055 // Do the table magic!	2054 // Do the table magic!

2056 tree.insertFromTable = true;	2055 tree.insertFromTable = true;

2057 parser._inBodyPhase.processCharacters(token);	2056 parser._inBodyPhase.processCharacters(token);

2058 tree.insertFromTable = false;	2057 tree.insertFromTable = false;

2059 }	2058 }

2060	2059

2061 void startTagCaption(StartTagToken token) {	2060 void startTagCaption(StartTagToken token) {

2062 clearStackToTableContext();	2061 clearStackToTableContext();

2063 tree.activeFormattingElements.add(Marker);	2062 tree.activeFormattingElements.add(MARKER);

2064 tree.insertElement(token);	2063 tree.insertElement(token);

2065 parser.phase = parser._inCaptionPhase;	2064 parser.phase = parser._inCaptionPhase;

2066 }	2065 }

2067	2066

2068 void startTagColgroup(StartTagToken token) {	2067 void startTagColgroup(StartTagToken token) {

2069 clearStackToTableContext();	2068 clearStackToTableContext();

2070 tree.insertElement(token);	2069 tree.insertElement(token);

2071 parser.phase = parser._inColumnGroupPhase;	2070 parser.phase = parser._inColumnGroupPhase;

2072 }	2071 }

2073	2072

(...skipping 495 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2569 }	2568 }

2570	2569

2571 Token processCharacters(CharactersToken token) {	2570 Token processCharacters(CharactersToken token) {

2572 return parser._inTablePhase.processCharacters(token);	2571 return parser._inTablePhase.processCharacters(token);

2573 }	2572 }

2574	2573

2575 void startTagTableCell(StartTagToken token) {	2574 void startTagTableCell(StartTagToken token) {

2576 clearStackToTableRowContext();	2575 clearStackToTableRowContext();

2577 tree.insertElement(token);	2576 tree.insertElement(token);

2578 parser.phase = parser._inCellPhase;	2577 parser.phase = parser._inCellPhase;

2579 tree.activeFormattingElements.add(Marker);	2578 tree.activeFormattingElements.add(MARKER);

2580 }	2579 }

2581	2580

2582 Token startTagTableOther(StartTagToken token) {	2581 Token startTagTableOther(StartTagToken token) {

2583 bool ignoreEndTag = ignoreEndTagTr();	2582 bool ignoreEndTag = ignoreEndTagTr();

2584 endTagTr(new EndTagToken("tr"));	2583 endTagTr(new EndTagToken("tr"));

2585 // XXX how are we sure it's always ignored in the innerHTML case?	2584 // XXX how are we sure it's always ignored in the innerHTML case?

2586 return ignoreEndTag ? null : token;	2585 return ignoreEndTag ? null : token;

2587 }	2586 }

2588	2587

2589 Token startTagOther(StartTagToken token) {	2588 Token startTagOther(StartTagToken token) {

(...skipping 739 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3329 int get column => span.start.column;	3328 int get column => span.start.column;

3330	3329

3331 /**	3330 /**

3332 * Gets the human readable error message for this error. Use	3331 * Gets the human readable error message for this error. Use

3333 * [span.getLocationMessage] or [toString] to get a message including span	3332 * [span.getLocationMessage] or [toString] to get a message including span

3334 * information. If there is a file associated with the span, both	3333 * information. If there is a file associated with the span, both

3335 * [span.getLocationMessage] and [toString] are equivalent. Otherwise,	3334 * [span.getLocationMessage] and [toString] are equivalent. Otherwise,

3336 * [span.getLocationMessage] will not show any source url information, but	3335 * [span.getLocationMessage] will not show any source url information, but

3337 * [toString] will include 'ParserError:' as a prefix.	3336 * [toString] will include 'ParserError:' as a prefix.

3338 */	3337 */

3339 String get message => formatStr(errorMessages[errorCode], data);	3338 String get message => formatStr(ERROR_MESSAGES[errorCode], data);

3340	3339

3341 String toString() {	3340 String toString() {

3342 var res = span.getLocationMessage(message);	3341 var res = span.getLocationMessage(message);

3343 return span.sourceUrl == null ? 'ParserError$res' : res;	3342 return span.sourceUrl == null ? 'ParserError$res' : res;

3344 }	3343 }

3345 }	3344 }

OLD	NEW