Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(286)

Side by Side Diff: pkg/third_party/html5lib/lib/src/tokenizer.dart

Issue 157983005: pkg/third_party/html5lib: lots of cleanup (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: bump version Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD | NEW
1 library tokenizer; 1 library tokenizer;
2 2
3 import 'dart:collection'; 3 import 'dart:collection';
4 import 'dart:math';
5 import 'package:html5lib/parser.dart' show HtmlParser; 4 import 'package:html5lib/parser.dart' show HtmlParser;
6 import 'package:source_maps/span.dart' show Span, FileSpan; 5 import 'package:source_maps/span.dart' show Span, FileSpan;
7 import 'constants.dart'; 6 import 'constants.dart';
8 import 'inputstream.dart'; 7 import 'inputstream.dart';
9 import 'token.dart'; 8 import 'token.dart';
10 import 'utils.dart'; 9 import 'utils.dart';
11 10
12 // Group entities by their first character, for faster lookups 11 // Group entities by their first character, for faster lookups
13 12
14 // TODO(jmesserly): we could use a better data structure here like a trie, if 13 // TODO(jmesserly): we could use a better data structure here like a trie, if
15 // we had it implemented in Dart. 14 // we had it implemented in Dart.
16 Map<String, List<String>> entitiesByFirstChar = (() { 15 Map<String, List<String>> entitiesByFirstChar = (() {
17 var result = {}; 16 var result = {};
18 for (var k in entities.keys) { 17 for (var k in ENTITIES.keys) {
19 result.putIfAbsent(k[0], () => []).add(k); 18 result.putIfAbsent(k[0], () => []).add(k);
20 } 19 }
21 return result; 20 return result;
22 })(); 21 })();
23 22
24 // TODO(jmesserly): lots of ways to make this faster: 23 // TODO(jmesserly): lots of ways to make this faster:
25 // - use char codes everywhere instead of 1-char strings 24 // - use char codes everywhere instead of 1-char strings
26 // - use switch instead of contains, indexOf 25 // - use switch instead of contains, indexOf
27 // - use switch instead of the sequential if tests 26 // - use switch instead of the sequential if tests
28 // - avoid string concat 27 // - avoid string concat
(...skipping 167 matching lines...) Expand 10 before | Expand all | Expand 10 after
196 var c = stream.char(); 195 var c = stream.char();
197 while (allowed(c) && c != EOF) { 196 while (allowed(c) && c != EOF) {
198 charStack.add(c); 197 charStack.add(c);
199 c = stream.char(); 198 c = stream.char();
200 } 199 }
201 200
202 // Convert the set of characters consumed to an int. 201 // Convert the set of characters consumed to an int.
203 var charAsInt = parseIntRadix(charStack.join(), radix); 202 var charAsInt = parseIntRadix(charStack.join(), radix);
204 203
205 // Certain characters get replaced with others 204 // Certain characters get replaced with others
206 var char = replacementCharacters[charAsInt]; 205 var char = REPLACEMENT_CHARACTERS[charAsInt];
207 if (char != null) { 206 if (char != null) {
208 _addToken(new ParseErrorToken( 207 _addToken(new ParseErrorToken(
209 "illegal-codepoint-for-numeric-entity", 208 "illegal-codepoint-for-numeric-entity",
210 messageParams: {"charAsInt": charAsInt})); 209 messageParams: {"charAsInt": charAsInt}));
211 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF) 210 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF)
212 || (charAsInt > 0x10FFFF)) { 211 || (charAsInt > 0x10FFFF)) {
213 char = "\uFFFD"; 212 char = "\uFFFD";
214 _addToken(new ParseErrorToken( 213 _addToken(new ParseErrorToken(
215 "illegal-codepoint-for-numeric-entity", 214 "illegal-codepoint-for-numeric-entity",
216 messageParams: {"charAsInt": charAsInt})); 215 messageParams: {"charAsInt": charAsInt}));
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
298 // At this point we have a string that starts with some characters 297 // At this point we have a string that starts with some characters
299 // that may match an entity 298 // that may match an entity
300 String entityName = null; 299 String entityName = null;
301 300
302 // Try to find the longest entity the string will match to take care 301 // Try to find the longest entity the string will match to take care
303 // of &noti for instance. 302 // of &noti for instance.
304 303
305 int entityLen; 304 int entityLen;
306 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) { 305 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) {
307 var possibleEntityName = charStack.sublist(0, entityLen).join(); 306 var possibleEntityName = charStack.sublist(0, entityLen).join();
308 if (entities.containsKey(possibleEntityName)) { 307 if (ENTITIES.containsKey(possibleEntityName)) {
309 entityName = possibleEntityName; 308 entityName = possibleEntityName;
310 break; 309 break;
311 } 310 }
312 } 311 }
313 312
314 if (entityName != null) { 313 if (entityName != null) {
315 var lastChar = entityName[entityName.length - 1]; 314 var lastChar = entityName[entityName.length - 1];
316 if (lastChar != ";") { 315 if (lastChar != ";") {
317 _addToken(new ParseErrorToken( 316 _addToken(new ParseErrorToken(
318 "named-entity-without-semicolon")); 317 "named-entity-without-semicolon"));
319 } 318 }
320 if (lastChar != ";" && fromAttribute && 319 if (lastChar != ";" && fromAttribute &&
321 (isLetterOrDigit(charStack[entityLen]) || 320 (isLetterOrDigit(charStack[entityLen]) ||
322 charStack[entityLen] == '=')) { 321 charStack[entityLen] == '=')) {
323 stream.unget(charStack.removeLast()); 322 stream.unget(charStack.removeLast());
324 output = "&${charStack.join()}"; 323 output = "&${charStack.join()}";
325 } else { 324 } else {
326 output = entities[entityName]; 325 output = ENTITIES[entityName];
327 stream.unget(charStack.removeLast()); 326 stream.unget(charStack.removeLast());
328 output = '${output}${slice(charStack, entityLen).join()}'; 327 output = '${output}${slice(charStack, entityLen).join()}';
329 } 328 }
330 } else { 329 } else {
331 _addToken(new ParseErrorToken("expected-named-entity")); 330 _addToken(new ParseErrorToken("expected-named-entity"));
332 stream.unget(charStack.removeLast()); 331 stream.unget(charStack.removeLast());
333 output = "&${charStack.join()}"; 332 output = "&${charStack.join()}";
334 } 333 }
335 } 334 }
336 if (fromAttribute) { 335 if (fromAttribute) {
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
400 _addToken(new ParseErrorToken("invalid-codepoint")); 399 _addToken(new ParseErrorToken("invalid-codepoint"));
401 _addToken(new CharactersToken("\u0000")); 400 _addToken(new CharactersToken("\u0000"));
402 } else if (data == EOF) { 401 } else if (data == EOF) {
403 // Tokenization ends. 402 // Tokenization ends.
404 return false; 403 return false;
405 } else if (isWhitespace(data)) { 404 } else if (isWhitespace(data)) {
406 // Directly after emitting a token you switch back to the "data 405 // Directly after emitting a token you switch back to the "data
407 // state". At that point spaceCharacters are important so they are 406 // state". At that point spaceCharacters are important so they are
408 // emitted separately. 407 // emitted separately.
409 _addToken(new SpaceCharactersToken( 408 _addToken(new SpaceCharactersToken(
410 '${data}${stream.charsUntil(spaceCharacters, true)}')); 409 '${data}${stream.charsUntil(SPACE_CHARACTERS, true)}'));
411 // No need to update lastFourChars here, since the first space will 410 // No need to update lastFourChars here, since the first space will
412 // have already been appended to lastFourChars and will have broken 411 // have already been appended to lastFourChars and will have broken
413 // any <!-- or --> sequences 412 // any <!-- or --> sequences
414 } else { 413 } else {
415 var chars = stream.charsUntil("&<\u0000"); 414 var chars = stream.charsUntil("&<\u0000");
416 _addToken(new CharactersToken('${data}${chars}')); 415 _addToken(new CharactersToken('${data}${chars}'));
417 } 416 }
418 return true; 417 return true;
419 } 418 }
420 419
(...skipping 13 matching lines...) Expand all
434 // Tokenization ends. 433 // Tokenization ends.
435 return false; 434 return false;
436 } else if (data == "\u0000") { 435 } else if (data == "\u0000") {
437 _addToken(new ParseErrorToken("invalid-codepoint")); 436 _addToken(new ParseErrorToken("invalid-codepoint"));
438 _addToken(new CharactersToken("\uFFFD")); 437 _addToken(new CharactersToken("\uFFFD"));
439 } else if (isWhitespace(data)) { 438 } else if (isWhitespace(data)) {
440 // Directly after emitting a token you switch back to the "data 439 // Directly after emitting a token you switch back to the "data
441 // state". At that point spaceCharacters are important so they are 440 // state". At that point spaceCharacters are important so they are
442 // emitted separately. 441 // emitted separately.
443 _addToken(new SpaceCharactersToken( 442 _addToken(new SpaceCharactersToken(
444 '${data}${stream.charsUntil(spaceCharacters, true)}')); 443 '${data}${stream.charsUntil(SPACE_CHARACTERS, true)}'));
445 } else { 444 } else {
446 var chars = stream.charsUntil("&<"); 445 var chars = stream.charsUntil("&<");
447 _addToken(new CharactersToken('${data}${chars}')); 446 _addToken(new CharactersToken('${data}${chars}'));
448 } 447 }
449 return true; 448 return true;
450 } 449 }
451 450
452 bool characterReferenceInRcdata() { 451 bool characterReferenceInRcdata() {
453 consumeEntity(); 452 consumeEntity();
454 state = rcdataState; 453 state = rcdataState;
(...skipping 536 matching lines...) Expand 10 before | Expand all | Expand 10 after
991 } else { 990 } else {
992 stream.unget(data); 991 stream.unget(data);
993 state = scriptDataDoubleEscapedState; 992 state = scriptDataDoubleEscapedState;
994 } 993 }
995 return true; 994 return true;
996 } 995 }
997 996
998 bool beforeAttributeNameState() { 997 bool beforeAttributeNameState() {
999 var data = stream.char(); 998 var data = stream.char();
1000 if (isWhitespace(data)) { 999 if (isWhitespace(data)) {
1001 stream.charsUntil(spaceCharacters, true); 1000 stream.charsUntil(SPACE_CHARACTERS, true);
1002 } else if (isLetter(data)) { 1001 } else if (isLetter(data)) {
1003 _addAttribute(data); 1002 _addAttribute(data);
1004 state = attributeNameState; 1003 state = attributeNameState;
1005 } else if (data == ">") { 1004 } else if (data == ">") {
1006 emitCurrentToken(); 1005 emitCurrentToken();
1007 } else if (data == "/") { 1006 } else if (data == "/") {
1008 state = selfClosingStartTagState; 1007 state = selfClosingStartTagState;
1009 } else if (data == EOF) { 1008 } else if (data == EOF) {
1010 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof")); 1009 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));
1011 state = dataState; 1010 state = dataState;
(...skipping 13 matching lines...) Expand all
1025 } 1024 }
1026 1025
1027 bool attributeNameState() { 1026 bool attributeNameState() {
1028 var data = stream.char(); 1027 var data = stream.char();
1029 bool leavingThisState = true; 1028 bool leavingThisState = true;
1030 bool emitToken = false; 1029 bool emitToken = false;
1031 if (data == "=") { 1030 if (data == "=") {
1032 state = beforeAttributeValueState; 1031 state = beforeAttributeValueState;
1033 } else if (isLetter(data)) { 1032 } else if (isLetter(data)) {
1034 _attributeName = '$_attributeName$data' 1033 _attributeName = '$_attributeName$data'
1035 '${stream.charsUntil(asciiLetters, true)}'; 1034 '${stream.charsUntil(ASCII_LETTERS, true)}';
1036 leavingThisState = false; 1035 leavingThisState = false;
1037 } else if (data == ">") { 1036 } else if (data == ">") {
1038 // XXX If we emit here the attributes are converted to a dict 1037 // XXX If we emit here the attributes are converted to a dict
1039 // without being checked and when the code below runs we error 1038 // without being checked and when the code below runs we error
1040 // because data is a dict not a list 1039 // because data is a dict not a list
1041 emitToken = true; 1040 emitToken = true;
1042 } else if (isWhitespace(data)) { 1041 } else if (isWhitespace(data)) {
1043 state = afterAttributeNameState; 1042 state = afterAttributeNameState;
1044 } else if (data == "/") { 1043 } else if (data == "/") {
1045 state = selfClosingStartTagState; 1044 state = selfClosingStartTagState;
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
1078 if (emitToken) { 1077 if (emitToken) {
1079 emitCurrentToken(); 1078 emitCurrentToken();
1080 } 1079 }
1081 } 1080 }
1082 return true; 1081 return true;
1083 } 1082 }
1084 1083
1085 bool afterAttributeNameState() { 1084 bool afterAttributeNameState() {
1086 var data = stream.char(); 1085 var data = stream.char();
1087 if (isWhitespace(data)) { 1086 if (isWhitespace(data)) {
1088 stream.charsUntil(spaceCharacters, true); 1087 stream.charsUntil(SPACE_CHARACTERS, true);
1089 } else if (data == "=") { 1088 } else if (data == "=") {
1090 state = beforeAttributeValueState; 1089 state = beforeAttributeValueState;
1091 } else if (data == ">") { 1090 } else if (data == ">") {
1092 emitCurrentToken(); 1091 emitCurrentToken();
1093 } else if (isLetter(data)) { 1092 } else if (isLetter(data)) {
1094 _addAttribute(data); 1093 _addAttribute(data);
1095 state = attributeNameState; 1094 state = attributeNameState;
1096 } else if (data == "/") { 1095 } else if (data == "/") {
1097 state = selfClosingStartTagState; 1096 state = selfClosingStartTagState;
1098 } else if (data == "\u0000") { 1097 } else if (data == "\u0000") {
(...skipping 10 matching lines...) Expand all
1109 } else { 1108 } else {
1110 _addAttribute(data); 1109 _addAttribute(data);
1111 state = attributeNameState; 1110 state = attributeNameState;
1112 } 1111 }
1113 return true; 1112 return true;
1114 } 1113 }
1115 1114
1116 bool beforeAttributeValueState() { 1115 bool beforeAttributeValueState() {
1117 var data = stream.char(); 1116 var data = stream.char();
1118 if (isWhitespace(data)) { 1117 if (isWhitespace(data)) {
1119 stream.charsUntil(spaceCharacters, true); 1118 stream.charsUntil(SPACE_CHARACTERS, true);
1120 } else if (data == "\"") { 1119 } else if (data == "\"") {
1121 _markAttributeValueStart(0); 1120 _markAttributeValueStart(0);
1122 state = attributeValueDoubleQuotedState; 1121 state = attributeValueDoubleQuotedState;
1123 } else if (data == "&") { 1122 } else if (data == "&") {
1124 state = attributeValueUnQuotedState; 1123 state = attributeValueUnQuotedState;
1125 stream.unget(data); 1124 stream.unget(data);
1126 _markAttributeValueStart(0); 1125 _markAttributeValueStart(0);
1127 } else if (data == "'") { 1126 } else if (data == "'") {
1128 _markAttributeValueStart(0); 1127 _markAttributeValueStart(0);
1129 state = attributeValueSingleQuotedState; 1128 state = attributeValueSingleQuotedState;
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
1210 state = dataState; 1209 state = dataState;
1211 } else if ('"\'=<`'.contains(data)) { 1210 } else if ('"\'=<`'.contains(data)) {
1212 _addToken(new ParseErrorToken( 1211 _addToken(new ParseErrorToken(
1213 "unexpected-character-in-unquoted-attribute-value")); 1212 "unexpected-character-in-unquoted-attribute-value"));
1214 _attributeValue = '$_attributeValue$data'; 1213 _attributeValue = '$_attributeValue$data';
1215 } else if (data == "\u0000") { 1214 } else if (data == "\u0000") {
1216 _addToken(new ParseErrorToken("invalid-codepoint")); 1215 _addToken(new ParseErrorToken("invalid-codepoint"));
1217 _attributeValue = '${_attributeValue}\uFFFD'; 1216 _attributeValue = '${_attributeValue}\uFFFD';
1218 } else { 1217 } else {
1219 _attributeValue = '$_attributeValue$data' 1218 _attributeValue = '$_attributeValue$data'
1220 '${stream.charsUntil("&>\"\'=<`$spaceCharacters")}'; 1219 '${stream.charsUntil("&>\"\'=<`$SPACE_CHARACTERS")}';
1221 } 1220 }
1222 return true; 1221 return true;
1223 } 1222 }
1224 1223
1225 bool afterAttributeValueState() { 1224 bool afterAttributeValueState() {
1226 var data = stream.char(); 1225 var data = stream.char();
1227 if (isWhitespace(data)) { 1226 if (isWhitespace(data)) {
1228 state = beforeAttributeNameState; 1227 state = beforeAttributeNameState;
1229 } else if (data == ">") { 1228 } else if (data == ">") {
1230 emitCurrentToken(); 1229 emitCurrentToken();
(...skipping 662 matching lines...) Expand 10 before | Expand all | Expand 10 after
1893 } 1892 }
1894 1893
1895 if (data.length > 0) { 1894 if (data.length > 0) {
1896 _addToken(new CharactersToken(data.join())); 1895 _addToken(new CharactersToken(data.join()));
1897 } 1896 }
1898 state = dataState; 1897 state = dataState;
1899 return true; 1898 return true;
1900 } 1899 }
1901 } 1900 }
1902 1901
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698