pkg/third_party/html5lib/lib/src/tokenizer.dart - Issue 157983005: pkg/third_party/html5lib: lots of cleanup

Side by Side Diff: pkg/third_party/html5lib/lib/src/tokenizer.dart

Issue 157983005: pkg/third_party/html5lib: lots of cleanup (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: bump version Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« pkg/third_party/html5lib/lib/src/constants.dart ('K') | « pkg/third_party/html5lib/lib/src/inputstream.dart ('k') | pkg/third_party/html5lib/lib/src/treebuilder.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 library tokenizer;	1 library tokenizer;

2	2

3 import 'dart:collection';	3 import 'dart:collection';

4 import 'dart:math';

5 import 'package:html5lib/parser.dart' show HtmlParser;	4 import 'package:html5lib/parser.dart' show HtmlParser;

6 import 'package:source_maps/span.dart' show Span, FileSpan;	5 import 'package:source_maps/span.dart' show Span, FileSpan;

7 import 'constants.dart';	6 import 'constants.dart';

8 import 'inputstream.dart';	7 import 'inputstream.dart';

9 import 'token.dart';	8 import 'token.dart';

10 import 'utils.dart';	9 import 'utils.dart';

11	10

12 // Group entities by their first character, for faster lookups	11 // Group entities by their first character, for faster lookups

13	12

14 // TODO(jmesserly): we could use a better data structure here like a trie, if	13 // TODO(jmesserly): we could use a better data structure here like a trie, if

15 // we had it implemented in Dart.	14 // we had it implemented in Dart.

16 Map<String, List<String>> entitiesByFirstChar = (() {	15 Map<String, List<String>> entitiesByFirstChar = (() {

17 var result = {};	16 var result = {};

18 for (var k in entities.keys) {	17 for (var k in ENTITIES.keys) {

19 result.putIfAbsent(k[0], () => []).add(k);	18 result.putIfAbsent(k[0], () => []).add(k);

20 }	19 }

21 return result;	20 return result;

22 })();	21 })();

23	22

24 // TODO(jmesserly): lots of ways to make this faster:	23 // TODO(jmesserly): lots of ways to make this faster:

25 // - use char codes everywhere instead of 1-char strings	24 // - use char codes everywhere instead of 1-char strings

26 // - use switch instead of contains, indexOf	25 // - use switch instead of contains, indexOf

27 // - use switch instead of the sequential if tests	26 // - use switch instead of the sequential if tests

28 // - avoid string concat	27 // - avoid string concat

(...skipping 167 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
196 var c = stream.char();	195 var c = stream.char();

197 while (allowed(c) && c != EOF) {	196 while (allowed(c) && c != EOF) {

198 charStack.add(c);	197 charStack.add(c);

199 c = stream.char();	198 c = stream.char();

200 }	199 }

201	200

202 // Convert the set of characters consumed to an int.	201 // Convert the set of characters consumed to an int.

203 var charAsInt = parseIntRadix(charStack.join(), radix);	202 var charAsInt = parseIntRadix(charStack.join(), radix);

204	203

205 // Certain characters get replaced with others	204 // Certain characters get replaced with others

206 var char = replacementCharacters[charAsInt];	205 var char = REPLACEMENT_CHARACTERS[charAsInt];

207 if (char != null) {	206 if (char != null) {

208 _addToken(new ParseErrorToken(	207 _addToken(new ParseErrorToken(

209 "illegal-codepoint-for-numeric-entity",	208 "illegal-codepoint-for-numeric-entity",

210 messageParams: {"charAsInt": charAsInt}));	209 messageParams: {"charAsInt": charAsInt}));

211 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF)	210 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF)

212 || (charAsInt > 0x10FFFF)) {	211 || (charAsInt > 0x10FFFF)) {

213 char = "\uFFFD";	212 char = "\uFFFD";

214 _addToken(new ParseErrorToken(	213 _addToken(new ParseErrorToken(

215 "illegal-codepoint-for-numeric-entity",	214 "illegal-codepoint-for-numeric-entity",

216 messageParams: {"charAsInt": charAsInt}));	215 messageParams: {"charAsInt": charAsInt}));

(...skipping 81 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
298 // At this point we have a string that starts with some characters	297 // At this point we have a string that starts with some characters

299 // that may match an entity	298 // that may match an entity

300 String entityName = null;	299 String entityName = null;

301	300

302 // Try to find the longest entity the string will match to take care	301 // Try to find the longest entity the string will match to take care

303 // of &noti for instance.	302 // of &noti for instance.

304	303

305 int entityLen;	304 int entityLen;

306 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) {	305 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) {

307 var possibleEntityName = charStack.sublist(0, entityLen).join();	306 var possibleEntityName = charStack.sublist(0, entityLen).join();

308 if (entities.containsKey(possibleEntityName)) {	307 if (ENTITIES.containsKey(possibleEntityName)) {

309 entityName = possibleEntityName;	308 entityName = possibleEntityName;

310 break;	309 break;

311 }	310 }

312 }	311 }

313	312

314 if (entityName != null) {	313 if (entityName != null) {

315 var lastChar = entityName[entityName.length - 1];	314 var lastChar = entityName[entityName.length - 1];

316 if (lastChar != ";") {	315 if (lastChar != ";") {

317 _addToken(new ParseErrorToken(	316 _addToken(new ParseErrorToken(

318 "named-entity-without-semicolon"));	317 "named-entity-without-semicolon"));

319 }	318 }

320 if (lastChar != ";" && fromAttribute &&	319 if (lastChar != ";" && fromAttribute &&

321 (isLetterOrDigit(charStack[entityLen]) ||	320 (isLetterOrDigit(charStack[entityLen]) ||

322 charStack[entityLen] == '=')) {	321 charStack[entityLen] == '=')) {

323 stream.unget(charStack.removeLast());	322 stream.unget(charStack.removeLast());

324 output = "&${charStack.join()}";	323 output = "&${charStack.join()}";

325 } else {	324 } else {

326 output = entities[entityName];	325 output = ENTITIES[entityName];

327 stream.unget(charStack.removeLast());	326 stream.unget(charStack.removeLast());

328 output = '${output}${slice(charStack, entityLen).join()}';	327 output = '${output}${slice(charStack, entityLen).join()}';

329 }	328 }

330 } else {	329 } else {

331 _addToken(new ParseErrorToken("expected-named-entity"));	330 _addToken(new ParseErrorToken("expected-named-entity"));

332 stream.unget(charStack.removeLast());	331 stream.unget(charStack.removeLast());

333 output = "&${charStack.join()}";	332 output = "&${charStack.join()}";

334 }	333 }

335 }	334 }

336 if (fromAttribute) {	335 if (fromAttribute) {

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
400 _addToken(new ParseErrorToken("invalid-codepoint"));	399 _addToken(new ParseErrorToken("invalid-codepoint"));

401 _addToken(new CharactersToken("\u0000"));	400 _addToken(new CharactersToken("\u0000"));

402 } else if (data == EOF) {	401 } else if (data == EOF) {

403 // Tokenization ends.	402 // Tokenization ends.

404 return false;	403 return false;

405 } else if (isWhitespace(data)) {	404 } else if (isWhitespace(data)) {

406 // Directly after emitting a token you switch back to the "data	405 // Directly after emitting a token you switch back to the "data

407 // state". At that point spaceCharacters are important so they are	406 // state". At that point spaceCharacters are important so they are

408 // emitted separately.	407 // emitted separately.

409 _addToken(new SpaceCharactersToken(	408 _addToken(new SpaceCharactersToken(

410 '${data}${stream.charsUntil(spaceCharacters, true)}'));	409 '${data}${stream.charsUntil(SPACE_CHARACTERS, true)}'));

411 // No need to update lastFourChars here, since the first space will	410 // No need to update lastFourChars here, since the first space will

412 // have already been appended to lastFourChars and will have broken	411 // have already been appended to lastFourChars and will have broken

413 // any <!-- or --> sequences	412 // any <!-- or --> sequences

414 } else {	413 } else {

415 var chars = stream.charsUntil("&<\u0000");	414 var chars = stream.charsUntil("&<\u0000");

416 _addToken(new CharactersToken('${data}${chars}'));	415 _addToken(new CharactersToken('${data}${chars}'));

417 }	416 }

418 return true;	417 return true;

419 }	418 }

420	419

(...skipping 13 matching lines...) Expand all Loading...
434 // Tokenization ends.	433 // Tokenization ends.

435 return false;	434 return false;

436 } else if (data == "\u0000") {	435 } else if (data == "\u0000") {

437 _addToken(new ParseErrorToken("invalid-codepoint"));	436 _addToken(new ParseErrorToken("invalid-codepoint"));

438 _addToken(new CharactersToken("\uFFFD"));	437 _addToken(new CharactersToken("\uFFFD"));

439 } else if (isWhitespace(data)) {	438 } else if (isWhitespace(data)) {

440 // Directly after emitting a token you switch back to the "data	439 // Directly after emitting a token you switch back to the "data

441 // state". At that point spaceCharacters are important so they are	440 // state". At that point spaceCharacters are important so they are

442 // emitted separately.	441 // emitted separately.

443 _addToken(new SpaceCharactersToken(	442 _addToken(new SpaceCharactersToken(

444 '${data}${stream.charsUntil(spaceCharacters, true)}'));	443 '${data}${stream.charsUntil(SPACE_CHARACTERS, true)}'));

445 } else {	444 } else {

446 var chars = stream.charsUntil("&<");	445 var chars = stream.charsUntil("&<");

447 _addToken(new CharactersToken('${data}${chars}'));	446 _addToken(new CharactersToken('${data}${chars}'));

448 }	447 }

449 return true;	448 return true;

450 }	449 }

451	450

452 bool characterReferenceInRcdata() {	451 bool characterReferenceInRcdata() {

453 consumeEntity();	452 consumeEntity();

454 state = rcdataState;	453 state = rcdataState;

(...skipping 536 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
991 } else {	990 } else {

992 stream.unget(data);	991 stream.unget(data);

993 state = scriptDataDoubleEscapedState;	992 state = scriptDataDoubleEscapedState;

994 }	993 }

995 return true;	994 return true;

996 }	995 }

997	996

998 bool beforeAttributeNameState() {	997 bool beforeAttributeNameState() {

999 var data = stream.char();	998 var data = stream.char();

1000 if (isWhitespace(data)) {	999 if (isWhitespace(data)) {

1001 stream.charsUntil(spaceCharacters, true);	1000 stream.charsUntil(SPACE_CHARACTERS, true);

1002 } else if (isLetter(data)) {	1001 } else if (isLetter(data)) {

1003 _addAttribute(data);	1002 _addAttribute(data);

1004 state = attributeNameState;	1003 state = attributeNameState;

1005 } else if (data == ">") {	1004 } else if (data == ">") {

1006 emitCurrentToken();	1005 emitCurrentToken();

1007 } else if (data == "/") {	1006 } else if (data == "/") {

1008 state = selfClosingStartTagState;	1007 state = selfClosingStartTagState;

1009 } else if (data == EOF) {	1008 } else if (data == EOF) {

1010 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));	1009 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));

1011 state = dataState;	1010 state = dataState;

(...skipping 13 matching lines...) Expand all Loading...
1025 }	1024 }

1026	1025

1027 bool attributeNameState() {	1026 bool attributeNameState() {

1028 var data = stream.char();	1027 var data = stream.char();

1029 bool leavingThisState = true;	1028 bool leavingThisState = true;

1030 bool emitToken = false;	1029 bool emitToken = false;

1031 if (data == "=") {	1030 if (data == "=") {

1032 state = beforeAttributeValueState;	1031 state = beforeAttributeValueState;

1033 } else if (isLetter(data)) {	1032 } else if (isLetter(data)) {

1034 _attributeName = '$_attributeName$data'	1033 _attributeName = '$_attributeName$data'

1035 '${stream.charsUntil(asciiLetters, true)}';	1034 '${stream.charsUntil(ASCII_LETTERS, true)}';

1036 leavingThisState = false;	1035 leavingThisState = false;

1037 } else if (data == ">") {	1036 } else if (data == ">") {

1038 // XXX If we emit here the attributes are converted to a dict	1037 // XXX If we emit here the attributes are converted to a dict

1039 // without being checked and when the code below runs we error	1038 // without being checked and when the code below runs we error

1040 // because data is a dict not a list	1039 // because data is a dict not a list

1041 emitToken = true;	1040 emitToken = true;

1042 } else if (isWhitespace(data)) {	1041 } else if (isWhitespace(data)) {

1043 state = afterAttributeNameState;	1042 state = afterAttributeNameState;

1044 } else if (data == "/") {	1043 } else if (data == "/") {

1045 state = selfClosingStartTagState;	1044 state = selfClosingStartTagState;

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1078 if (emitToken) {	1077 if (emitToken) {

1079 emitCurrentToken();	1078 emitCurrentToken();

1080 }	1079 }

1081 }	1080 }

1082 return true;	1081 return true;

1083 }	1082 }

1084	1083

1085 bool afterAttributeNameState() {	1084 bool afterAttributeNameState() {

1086 var data = stream.char();	1085 var data = stream.char();

1087 if (isWhitespace(data)) {	1086 if (isWhitespace(data)) {

1088 stream.charsUntil(spaceCharacters, true);	1087 stream.charsUntil(SPACE_CHARACTERS, true);

1089 } else if (data == "=") {	1088 } else if (data == "=") {

1090 state = beforeAttributeValueState;	1089 state = beforeAttributeValueState;

1091 } else if (data == ">") {	1090 } else if (data == ">") {

1092 emitCurrentToken();	1091 emitCurrentToken();

1093 } else if (isLetter(data)) {	1092 } else if (isLetter(data)) {

1094 _addAttribute(data);	1093 _addAttribute(data);

1095 state = attributeNameState;	1094 state = attributeNameState;

1096 } else if (data == "/") {	1095 } else if (data == "/") {

1097 state = selfClosingStartTagState;	1096 state = selfClosingStartTagState;

1098 } else if (data == "\u0000") {	1097 } else if (data == "\u0000") {

(...skipping 10 matching lines...) Expand all Loading...
1109 } else {	1108 } else {

1110 _addAttribute(data);	1109 _addAttribute(data);

1111 state = attributeNameState;	1110 state = attributeNameState;

1112 }	1111 }

1113 return true;	1112 return true;

1114 }	1113 }

1115	1114

1116 bool beforeAttributeValueState() {	1115 bool beforeAttributeValueState() {

1117 var data = stream.char();	1116 var data = stream.char();

1118 if (isWhitespace(data)) {	1117 if (isWhitespace(data)) {

1119 stream.charsUntil(spaceCharacters, true);	1118 stream.charsUntil(SPACE_CHARACTERS, true);

1120 } else if (data == "\"") {	1119 } else if (data == "\"") {

1121 _markAttributeValueStart(0);	1120 _markAttributeValueStart(0);

1122 state = attributeValueDoubleQuotedState;	1121 state = attributeValueDoubleQuotedState;

1123 } else if (data == "&") {	1122 } else if (data == "&") {

1124 state = attributeValueUnQuotedState;	1123 state = attributeValueUnQuotedState;

1125 stream.unget(data);	1124 stream.unget(data);

1126 _markAttributeValueStart(0);	1125 _markAttributeValueStart(0);

1127 } else if (data == "'") {	1126 } else if (data == "'") {

1128 _markAttributeValueStart(0);	1127 _markAttributeValueStart(0);

1129 state = attributeValueSingleQuotedState;	1128 state = attributeValueSingleQuotedState;

(...skipping 80 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1210 state = dataState;	1209 state = dataState;

1211 } else if ('"\'=<`'.contains(data)) {	1210 } else if ('"\'=<`'.contains(data)) {

1212 _addToken(new ParseErrorToken(	1211 _addToken(new ParseErrorToken(

1213 "unexpected-character-in-unquoted-attribute-value"));	1212 "unexpected-character-in-unquoted-attribute-value"));

1214 _attributeValue = '$_attributeValue$data';	1213 _attributeValue = '$_attributeValue$data';

1215 } else if (data == "\u0000") {	1214 } else if (data == "\u0000") {

1216 _addToken(new ParseErrorToken("invalid-codepoint"));	1215 _addToken(new ParseErrorToken("invalid-codepoint"));

1217 _attributeValue = '${_attributeValue}\uFFFD';	1216 _attributeValue = '${_attributeValue}\uFFFD';

1218 } else {	1217 } else {

1219 _attributeValue = '$_attributeValue$data'	1218 _attributeValue = '$_attributeValue$data'

1220 '${stream.charsUntil("&>\"\'=<`$spaceCharacters")}';	1219 '${stream.charsUntil("&>\"\'=<`$SPACE_CHARACTERS")}';

1221 }	1220 }

1222 return true;	1221 return true;

1223 }	1222 }

1224	1223

1225 bool afterAttributeValueState() {	1224 bool afterAttributeValueState() {

1226 var data = stream.char();	1225 var data = stream.char();

1227 if (isWhitespace(data)) {	1226 if (isWhitespace(data)) {

1228 state = beforeAttributeNameState;	1227 state = beforeAttributeNameState;

1229 } else if (data == ">") {	1228 } else if (data == ">") {

1230 emitCurrentToken();	1229 emitCurrentToken();

(...skipping 662 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1893 }	1892 }

1894	1893

1895 if (data.length > 0) {	1894 if (data.length > 0) {

1896 _addToken(new CharactersToken(data.join()));	1895 _addToken(new CharactersToken(data.join()));

1897 }	1896 }

1898 state = dataState;	1897 state = dataState;

1899 return true;	1898 return true;

1900 }	1899 }

1901 }	1900 }

1902	1901

OLD	NEW