| OLD | NEW |
| 1 library tokenizer; | 1 library tokenizer; |
| 2 | 2 |
| 3 import 'dart:collection'; | 3 import 'dart:collection'; |
| 4 import 'dart:math'; | |
| 5 import 'package:html5lib/parser.dart' show HtmlParser; | 4 import 'package:html5lib/parser.dart' show HtmlParser; |
| 6 import 'package:source_maps/span.dart' show Span, FileSpan; | 5 import 'package:source_maps/span.dart' show Span, FileSpan; |
| 7 import 'constants.dart'; | 6 import 'constants.dart'; |
| 8 import 'inputstream.dart'; | 7 import 'inputstream.dart'; |
| 9 import 'token.dart'; | 8 import 'token.dart'; |
| 10 import 'utils.dart'; | 9 import 'utils.dart'; |
| 11 | 10 |
| 12 // Group entities by their first character, for faster lookups | 11 // Group entities by their first character, for faster lookups |
| 13 | 12 |
| 14 // TODO(jmesserly): we could use a better data structure here like a trie, if | 13 // TODO(jmesserly): we could use a better data structure here like a trie, if |
| 15 // we had it implemented in Dart. | 14 // we had it implemented in Dart. |
| 16 Map<String, List<String>> entitiesByFirstChar = (() { | 15 Map<String, List<String>> entitiesByFirstChar = (() { |
| 17 var result = {}; | 16 var result = {}; |
| 18 for (var k in entities.keys) { | 17 for (var k in ENTITIES.keys) { |
| 19 result.putIfAbsent(k[0], () => []).add(k); | 18 result.putIfAbsent(k[0], () => []).add(k); |
| 20 } | 19 } |
| 21 return result; | 20 return result; |
| 22 })(); | 21 })(); |
| 23 | 22 |
| 24 // TODO(jmesserly): lots of ways to make this faster: | 23 // TODO(jmesserly): lots of ways to make this faster: |
| 25 // - use char codes everywhere instead of 1-char strings | 24 // - use char codes everywhere instead of 1-char strings |
| 26 // - use switch instead of contains, indexOf | 25 // - use switch instead of contains, indexOf |
| 27 // - use switch instead of the sequential if tests | 26 // - use switch instead of the sequential if tests |
| 28 // - avoid string concat | 27 // - avoid string concat |
| (...skipping 167 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 196 var c = stream.char(); | 195 var c = stream.char(); |
| 197 while (allowed(c) && c != EOF) { | 196 while (allowed(c) && c != EOF) { |
| 198 charStack.add(c); | 197 charStack.add(c); |
| 199 c = stream.char(); | 198 c = stream.char(); |
| 200 } | 199 } |
| 201 | 200 |
| 202 // Convert the set of characters consumed to an int. | 201 // Convert the set of characters consumed to an int. |
| 203 var charAsInt = parseIntRadix(charStack.join(), radix); | 202 var charAsInt = parseIntRadix(charStack.join(), radix); |
| 204 | 203 |
| 205 // Certain characters get replaced with others | 204 // Certain characters get replaced with others |
| 206 var char = replacementCharacters[charAsInt]; | 205 var char = REPLACEMENT_CHARACTERS[charAsInt]; |
| 207 if (char != null) { | 206 if (char != null) { |
| 208 _addToken(new ParseErrorToken( | 207 _addToken(new ParseErrorToken( |
| 209 "illegal-codepoint-for-numeric-entity", | 208 "illegal-codepoint-for-numeric-entity", |
| 210 messageParams: {"charAsInt": charAsInt})); | 209 messageParams: {"charAsInt": charAsInt})); |
| 211 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF) | 210 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF) |
| 212 || (charAsInt > 0x10FFFF)) { | 211 || (charAsInt > 0x10FFFF)) { |
| 213 char = "\uFFFD"; | 212 char = "\uFFFD"; |
| 214 _addToken(new ParseErrorToken( | 213 _addToken(new ParseErrorToken( |
| 215 "illegal-codepoint-for-numeric-entity", | 214 "illegal-codepoint-for-numeric-entity", |
| 216 messageParams: {"charAsInt": charAsInt})); | 215 messageParams: {"charAsInt": charAsInt})); |
| (...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 298 // At this point we have a string that starts with some characters | 297 // At this point we have a string that starts with some characters |
| 299 // that may match an entity | 298 // that may match an entity |
| 300 String entityName = null; | 299 String entityName = null; |
| 301 | 300 |
| 302 // Try to find the longest entity the string will match to take care | 301 // Try to find the longest entity the string will match to take care |
| 303 // of &noti for instance. | 302 // of &noti for instance. |
| 304 | 303 |
| 305 int entityLen; | 304 int entityLen; |
| 306 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) { | 305 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) { |
| 307 var possibleEntityName = charStack.sublist(0, entityLen).join(); | 306 var possibleEntityName = charStack.sublist(0, entityLen).join(); |
| 308 if (entities.containsKey(possibleEntityName)) { | 307 if (ENTITIES.containsKey(possibleEntityName)) { |
| 309 entityName = possibleEntityName; | 308 entityName = possibleEntityName; |
| 310 break; | 309 break; |
| 311 } | 310 } |
| 312 } | 311 } |
| 313 | 312 |
| 314 if (entityName != null) { | 313 if (entityName != null) { |
| 315 var lastChar = entityName[entityName.length - 1]; | 314 var lastChar = entityName[entityName.length - 1]; |
| 316 if (lastChar != ";") { | 315 if (lastChar != ";") { |
| 317 _addToken(new ParseErrorToken( | 316 _addToken(new ParseErrorToken( |
| 318 "named-entity-without-semicolon")); | 317 "named-entity-without-semicolon")); |
| 319 } | 318 } |
| 320 if (lastChar != ";" && fromAttribute && | 319 if (lastChar != ";" && fromAttribute && |
| 321 (isLetterOrDigit(charStack[entityLen]) || | 320 (isLetterOrDigit(charStack[entityLen]) || |
| 322 charStack[entityLen] == '=')) { | 321 charStack[entityLen] == '=')) { |
| 323 stream.unget(charStack.removeLast()); | 322 stream.unget(charStack.removeLast()); |
| 324 output = "&${charStack.join()}"; | 323 output = "&${charStack.join()}"; |
| 325 } else { | 324 } else { |
| 326 output = entities[entityName]; | 325 output = ENTITIES[entityName]; |
| 327 stream.unget(charStack.removeLast()); | 326 stream.unget(charStack.removeLast()); |
| 328 output = '${output}${slice(charStack, entityLen).join()}'; | 327 output = '${output}${slice(charStack, entityLen).join()}'; |
| 329 } | 328 } |
| 330 } else { | 329 } else { |
| 331 _addToken(new ParseErrorToken("expected-named-entity")); | 330 _addToken(new ParseErrorToken("expected-named-entity")); |
| 332 stream.unget(charStack.removeLast()); | 331 stream.unget(charStack.removeLast()); |
| 333 output = "&${charStack.join()}"; | 332 output = "&${charStack.join()}"; |
| 334 } | 333 } |
| 335 } | 334 } |
| 336 if (fromAttribute) { | 335 if (fromAttribute) { |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 400 _addToken(new ParseErrorToken("invalid-codepoint")); | 399 _addToken(new ParseErrorToken("invalid-codepoint")); |
| 401 _addToken(new CharactersToken("\u0000")); | 400 _addToken(new CharactersToken("\u0000")); |
| 402 } else if (data == EOF) { | 401 } else if (data == EOF) { |
| 403 // Tokenization ends. | 402 // Tokenization ends. |
| 404 return false; | 403 return false; |
| 405 } else if (isWhitespace(data)) { | 404 } else if (isWhitespace(data)) { |
| 406 // Directly after emitting a token you switch back to the "data | 405 // Directly after emitting a token you switch back to the "data |
| 407 // state". At that point spaceCharacters are important so they are | 406 // state". At that point spaceCharacters are important so they are |
| 408 // emitted separately. | 407 // emitted separately. |
| 409 _addToken(new SpaceCharactersToken( | 408 _addToken(new SpaceCharactersToken( |
| 410 '${data}${stream.charsUntil(spaceCharacters, true)}')); | 409 '${data}${stream.charsUntil(SPACE_CHARACTERS, true)}')); |
| 411 // No need to update lastFourChars here, since the first space will | 410 // No need to update lastFourChars here, since the first space will |
| 412 // have already been appended to lastFourChars and will have broken | 411 // have already been appended to lastFourChars and will have broken |
| 413 // any <!-- or --> sequences | 412 // any <!-- or --> sequences |
| 414 } else { | 413 } else { |
| 415 var chars = stream.charsUntil("&<\u0000"); | 414 var chars = stream.charsUntil("&<\u0000"); |
| 416 _addToken(new CharactersToken('${data}${chars}')); | 415 _addToken(new CharactersToken('${data}${chars}')); |
| 417 } | 416 } |
| 418 return true; | 417 return true; |
| 419 } | 418 } |
| 420 | 419 |
| (...skipping 13 matching lines...) Expand all Loading... |
| 434 // Tokenization ends. | 433 // Tokenization ends. |
| 435 return false; | 434 return false; |
| 436 } else if (data == "\u0000") { | 435 } else if (data == "\u0000") { |
| 437 _addToken(new ParseErrorToken("invalid-codepoint")); | 436 _addToken(new ParseErrorToken("invalid-codepoint")); |
| 438 _addToken(new CharactersToken("\uFFFD")); | 437 _addToken(new CharactersToken("\uFFFD")); |
| 439 } else if (isWhitespace(data)) { | 438 } else if (isWhitespace(data)) { |
| 440 // Directly after emitting a token you switch back to the "data | 439 // Directly after emitting a token you switch back to the "data |
| 441 // state". At that point spaceCharacters are important so they are | 440 // state". At that point spaceCharacters are important so they are |
| 442 // emitted separately. | 441 // emitted separately. |
| 443 _addToken(new SpaceCharactersToken( | 442 _addToken(new SpaceCharactersToken( |
| 444 '${data}${stream.charsUntil(spaceCharacters, true)}')); | 443 '${data}${stream.charsUntil(SPACE_CHARACTERS, true)}')); |
| 445 } else { | 444 } else { |
| 446 var chars = stream.charsUntil("&<"); | 445 var chars = stream.charsUntil("&<"); |
| 447 _addToken(new CharactersToken('${data}${chars}')); | 446 _addToken(new CharactersToken('${data}${chars}')); |
| 448 } | 447 } |
| 449 return true; | 448 return true; |
| 450 } | 449 } |
| 451 | 450 |
| 452 bool characterReferenceInRcdata() { | 451 bool characterReferenceInRcdata() { |
| 453 consumeEntity(); | 452 consumeEntity(); |
| 454 state = rcdataState; | 453 state = rcdataState; |
| (...skipping 536 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 991 } else { | 990 } else { |
| 992 stream.unget(data); | 991 stream.unget(data); |
| 993 state = scriptDataDoubleEscapedState; | 992 state = scriptDataDoubleEscapedState; |
| 994 } | 993 } |
| 995 return true; | 994 return true; |
| 996 } | 995 } |
| 997 | 996 |
| 998 bool beforeAttributeNameState() { | 997 bool beforeAttributeNameState() { |
| 999 var data = stream.char(); | 998 var data = stream.char(); |
| 1000 if (isWhitespace(data)) { | 999 if (isWhitespace(data)) { |
| 1001 stream.charsUntil(spaceCharacters, true); | 1000 stream.charsUntil(SPACE_CHARACTERS, true); |
| 1002 } else if (isLetter(data)) { | 1001 } else if (isLetter(data)) { |
| 1003 _addAttribute(data); | 1002 _addAttribute(data); |
| 1004 state = attributeNameState; | 1003 state = attributeNameState; |
| 1005 } else if (data == ">") { | 1004 } else if (data == ">") { |
| 1006 emitCurrentToken(); | 1005 emitCurrentToken(); |
| 1007 } else if (data == "/") { | 1006 } else if (data == "/") { |
| 1008 state = selfClosingStartTagState; | 1007 state = selfClosingStartTagState; |
| 1009 } else if (data == EOF) { | 1008 } else if (data == EOF) { |
| 1010 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof")); | 1009 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof")); |
| 1011 state = dataState; | 1010 state = dataState; |
| (...skipping 13 matching lines...) Expand all Loading... |
| 1025 } | 1024 } |
| 1026 | 1025 |
| 1027 bool attributeNameState() { | 1026 bool attributeNameState() { |
| 1028 var data = stream.char(); | 1027 var data = stream.char(); |
| 1029 bool leavingThisState = true; | 1028 bool leavingThisState = true; |
| 1030 bool emitToken = false; | 1029 bool emitToken = false; |
| 1031 if (data == "=") { | 1030 if (data == "=") { |
| 1032 state = beforeAttributeValueState; | 1031 state = beforeAttributeValueState; |
| 1033 } else if (isLetter(data)) { | 1032 } else if (isLetter(data)) { |
| 1034 _attributeName = '$_attributeName$data' | 1033 _attributeName = '$_attributeName$data' |
| 1035 '${stream.charsUntil(asciiLetters, true)}'; | 1034 '${stream.charsUntil(ASCII_LETTERS, true)}'; |
| 1036 leavingThisState = false; | 1035 leavingThisState = false; |
| 1037 } else if (data == ">") { | 1036 } else if (data == ">") { |
| 1038 // XXX If we emit here the attributes are converted to a dict | 1037 // XXX If we emit here the attributes are converted to a dict |
| 1039 // without being checked and when the code below runs we error | 1038 // without being checked and when the code below runs we error |
| 1040 // because data is a dict not a list | 1039 // because data is a dict not a list |
| 1041 emitToken = true; | 1040 emitToken = true; |
| 1042 } else if (isWhitespace(data)) { | 1041 } else if (isWhitespace(data)) { |
| 1043 state = afterAttributeNameState; | 1042 state = afterAttributeNameState; |
| 1044 } else if (data == "/") { | 1043 } else if (data == "/") { |
| 1045 state = selfClosingStartTagState; | 1044 state = selfClosingStartTagState; |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1078 if (emitToken) { | 1077 if (emitToken) { |
| 1079 emitCurrentToken(); | 1078 emitCurrentToken(); |
| 1080 } | 1079 } |
| 1081 } | 1080 } |
| 1082 return true; | 1081 return true; |
| 1083 } | 1082 } |
| 1084 | 1083 |
| 1085 bool afterAttributeNameState() { | 1084 bool afterAttributeNameState() { |
| 1086 var data = stream.char(); | 1085 var data = stream.char(); |
| 1087 if (isWhitespace(data)) { | 1086 if (isWhitespace(data)) { |
| 1088 stream.charsUntil(spaceCharacters, true); | 1087 stream.charsUntil(SPACE_CHARACTERS, true); |
| 1089 } else if (data == "=") { | 1088 } else if (data == "=") { |
| 1090 state = beforeAttributeValueState; | 1089 state = beforeAttributeValueState; |
| 1091 } else if (data == ">") { | 1090 } else if (data == ">") { |
| 1092 emitCurrentToken(); | 1091 emitCurrentToken(); |
| 1093 } else if (isLetter(data)) { | 1092 } else if (isLetter(data)) { |
| 1094 _addAttribute(data); | 1093 _addAttribute(data); |
| 1095 state = attributeNameState; | 1094 state = attributeNameState; |
| 1096 } else if (data == "/") { | 1095 } else if (data == "/") { |
| 1097 state = selfClosingStartTagState; | 1096 state = selfClosingStartTagState; |
| 1098 } else if (data == "\u0000") { | 1097 } else if (data == "\u0000") { |
| (...skipping 10 matching lines...) Expand all Loading... |
| 1109 } else { | 1108 } else { |
| 1110 _addAttribute(data); | 1109 _addAttribute(data); |
| 1111 state = attributeNameState; | 1110 state = attributeNameState; |
| 1112 } | 1111 } |
| 1113 return true; | 1112 return true; |
| 1114 } | 1113 } |
| 1115 | 1114 |
| 1116 bool beforeAttributeValueState() { | 1115 bool beforeAttributeValueState() { |
| 1117 var data = stream.char(); | 1116 var data = stream.char(); |
| 1118 if (isWhitespace(data)) { | 1117 if (isWhitespace(data)) { |
| 1119 stream.charsUntil(spaceCharacters, true); | 1118 stream.charsUntil(SPACE_CHARACTERS, true); |
| 1120 } else if (data == "\"") { | 1119 } else if (data == "\"") { |
| 1121 _markAttributeValueStart(0); | 1120 _markAttributeValueStart(0); |
| 1122 state = attributeValueDoubleQuotedState; | 1121 state = attributeValueDoubleQuotedState; |
| 1123 } else if (data == "&") { | 1122 } else if (data == "&") { |
| 1124 state = attributeValueUnQuotedState; | 1123 state = attributeValueUnQuotedState; |
| 1125 stream.unget(data); | 1124 stream.unget(data); |
| 1126 _markAttributeValueStart(0); | 1125 _markAttributeValueStart(0); |
| 1127 } else if (data == "'") { | 1126 } else if (data == "'") { |
| 1128 _markAttributeValueStart(0); | 1127 _markAttributeValueStart(0); |
| 1129 state = attributeValueSingleQuotedState; | 1128 state = attributeValueSingleQuotedState; |
| (...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1210 state = dataState; | 1209 state = dataState; |
| 1211 } else if ('"\'=<`'.contains(data)) { | 1210 } else if ('"\'=<`'.contains(data)) { |
| 1212 _addToken(new ParseErrorToken( | 1211 _addToken(new ParseErrorToken( |
| 1213 "unexpected-character-in-unquoted-attribute-value")); | 1212 "unexpected-character-in-unquoted-attribute-value")); |
| 1214 _attributeValue = '$_attributeValue$data'; | 1213 _attributeValue = '$_attributeValue$data'; |
| 1215 } else if (data == "\u0000") { | 1214 } else if (data == "\u0000") { |
| 1216 _addToken(new ParseErrorToken("invalid-codepoint")); | 1215 _addToken(new ParseErrorToken("invalid-codepoint")); |
| 1217 _attributeValue = '${_attributeValue}\uFFFD'; | 1216 _attributeValue = '${_attributeValue}\uFFFD'; |
| 1218 } else { | 1217 } else { |
| 1219 _attributeValue = '$_attributeValue$data' | 1218 _attributeValue = '$_attributeValue$data' |
| 1220 '${stream.charsUntil("&>\"\'=<`$spaceCharacters")}'; | 1219 '${stream.charsUntil("&>\"\'=<`$SPACE_CHARACTERS")}'; |
| 1221 } | 1220 } |
| 1222 return true; | 1221 return true; |
| 1223 } | 1222 } |
| 1224 | 1223 |
| 1225 bool afterAttributeValueState() { | 1224 bool afterAttributeValueState() { |
| 1226 var data = stream.char(); | 1225 var data = stream.char(); |
| 1227 if (isWhitespace(data)) { | 1226 if (isWhitespace(data)) { |
| 1228 state = beforeAttributeNameState; | 1227 state = beforeAttributeNameState; |
| 1229 } else if (data == ">") { | 1228 } else if (data == ">") { |
| 1230 emitCurrentToken(); | 1229 emitCurrentToken(); |
| (...skipping 662 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1893 } | 1892 } |
| 1894 | 1893 |
| 1895 if (data.length > 0) { | 1894 if (data.length > 0) { |
| 1896 _addToken(new CharactersToken(data.join())); | 1895 _addToken(new CharactersToken(data.join())); |
| 1897 } | 1896 } |
| 1898 state = dataState; | 1897 state = dataState; |
| 1899 return true; | 1898 return true; |
| 1900 } | 1899 } |
| 1901 } | 1900 } |
| 1902 | 1901 |
| OLD | NEW |