OLD | NEW |
(Empty) | |
| 1 library tokenizer_test; |
| 2 |
| 3 // Note: mirrors used to match the getattr usage in the original test |
| 4 import 'dart:async'; |
| 5 import 'dart:io'; |
| 6 import 'dart:json' as json; |
| 7 import 'dart:mirrors'; |
| 8 import 'dart:utf'; |
| 9 import 'package:path/path.dart' as pathos; |
| 10 import 'package:unittest/unittest.dart'; |
| 11 import 'package:html5lib/src/char_encodings.dart'; |
| 12 import 'package:html5lib/src/constants.dart' as constants; |
| 13 import 'package:html5lib/src/token.dart'; |
| 14 import 'package:html5lib/src/tokenizer.dart'; |
| 15 import 'package:html5lib/src/utils.dart'; |
| 16 import 'support.dart'; |
| 17 |
/// Drives an [HtmlTokenizer] over a test input string and records every
/// emitted token in the list-based format used by the html5lib tokenizer
/// test files (e.g. `["StartTag", name, attrs, selfClosing]`).
class TokenizerTestParser {
  String _state;
  var _lastStartTag;
  List outputTokens;

  TokenizerTestParser(String initialState, [lastStartTag])
      : _state = initialState,
        _lastStartTag = lastStartTag;

  List parse(String str) {
    // Feed bytes (not a string) so the tokenizer gets the chance to detect
    // and strip a byte-order mark.
    var bytes = codepointsToUtf8(toCodepoints(str));
    var tokenizer = new HtmlTokenizer(bytes, encoding: 'utf-8');
    outputTokens = [];

    // A state method can't be torn off by name, so build a closure that
    // dispatches to the named state through mirrors on every call.
    var tokenizerMirror = reflect(tokenizer);
    tokenizer.state = () => deprecatedFutureValue(
        tokenizerMirror.invokeAsync(new Symbol(_state), const [])).reflectee;

    if (_lastStartTag != null) {
      tokenizer.currentToken = new StartTagToken(_lastStartTag);
    }

    while (tokenizer.moveNext()) {
      var token = tokenizer.current;
      var kind = token.kind;
      if (kind == TokenKind.characters) {
        processCharacters(token);
      } else if (kind == TokenKind.spaceCharacters) {
        processSpaceCharacters(token);
      } else if (kind == TokenKind.startTag) {
        processStartTag(token);
      } else if (kind == TokenKind.endTag) {
        processEndTag(token);
      } else if (kind == TokenKind.comment) {
        processComment(token);
      } else if (kind == TokenKind.doctype) {
        processDoctype(token);
      } else if (kind == TokenKind.parseError) {
        processParseError(token);
      }
    }

    return outputTokens;
  }

  void processDoctype(DoctypeToken token) {
    outputTokens.add(
        ["DOCTYPE", token.name, token.publicId, token.systemId, token.correct]);
  }

  void processStartTag(StartTagToken token) {
    outputTokens.add(["StartTag", token.name, token.data, token.selfClosing]);
  }

  void processEndTag(EndTagToken token) {
    outputTokens.add(["EndTag", token.name, token.selfClosing]);
  }

  void processComment(StringToken token) {
    outputTokens.add(["Comment", token.data]);
  }

  // Whitespace is recorded exactly like any other character data.
  void processSpaceCharacters(StringToken token) {
    processCharacters(token);
  }

  void processCharacters(StringToken token) {
    outputTokens.add(["Character", token.data]);
  }

  void processEOF(token) {
  }

  void processParseError(StringToken token) {
    // TODO(jmesserly): when debugging test failures it can be useful to add
    // logging here like `print('ParseError $token');`. It would be nice to
    // use the actual logging library.
    outputTokens.add(["ParseError", token.data]);
  }
}
| 108 |
/// Merges every run of adjacent "Character" tokens into a single token.
///
/// Tokens are either lists like `["Character", data]` or the bare string
/// "ParseError"; both respond to `indexOf`, which is how non-character
/// entries are filtered out.
List concatenateCharacterTokens(List tokens) {
  // True only for ["Character", ...] list tokens. The plain "ParseError"
  // string fails the first check (its indexOf("ParseError") is 0, not -1).
  isCharacter(token) =>
      token.indexOf("ParseError") == -1 && token[0] == "Character";

  var result = [];
  for (var token in tokens) {
    if (isCharacter(token) && result.length > 0 && isCharacter(result.last)) {
      // Append this token's text onto the previous character token.
      result.last[1] = '${result.last[1]}${token[1]}';
    } else {
      result.add(token);
    }
  }
  return result;
}
| 127 |
/// Collapses every `["ParseError", ...]` entry to the bare string
/// "ParseError", mutating [tokens] in place and returning it.
List normalizeTokens(List tokens) {
  // TODO: convert tests to reflect arrays
  for (var i = 0; i < tokens.length; i++) {
    if (tokens[i][0] == 'ParseError') {
      tokens[i] = 'ParseError';
    }
  }
  return tokens;
}
| 138 |
| 139 |
/// Checks that [receivedTokens] matches [expectedTokens].
///
/// If [ignoreErrorOrder] is true we don't test the relative positions of
/// parse errors and non parse errors. If [ignoreErrors] is true, parse
/// errors are not compared at all. [message] is attached to any failure.
void expectTokensMatch(List expectedTokens, List receivedTokens,
    bool ignoreErrorOrder, [bool ignoreErrors = false, String message]) {

  // Self-closing flags are only compared when at least one expected token
  // actually carries one (4-element StartTag or 3-element EndTag).
  var checkSelfClosing = expectedTokens.any((token) =>
      token[0] == "StartTag" && token.length == 4 ||
      token[0] == "EndTag" && token.length == 3);

  if (!checkSelfClosing) {
    // Drop the trailing selfClosing flag the parser always records.
    for (var token in receivedTokens) {
      if (token[0] == "StartTag" || token[0] == "EndTag") {
        token.removeLast();
      }
    }
  }

  if (!ignoreErrorOrder && !ignoreErrors) {
    expect(receivedTokens, equals(expectedTokens), reason: message);
    return;
  }

  // Sort the tokens into two groups and compare them independently:
  // non-parse errors first, then (optionally) the parse errors.
  nonError(t) => t != "ParseError";
  expect(receivedTokens.where(nonError), equals(expectedTokens.where(nonError)),
      reason: message);
  if (!ignoreErrors) {
    isError(t) => t == "ParseError";
    expect(receivedTokens.where(isError), equals(expectedTokens.where(isError)),
        reason: message);
  }
}
| 181 |
/// Runs one tokenizer test case described by [testInfo] and compares the
/// tokens it produces against the case's expected output.
void runTokenizerTest(Map testInfo) {
  // XXX - move this out into the setup function
  // concatenate all consecutive character tokens into a single token
  if (testInfo.containsKey('doubleEscaped')) {
    testInfo = unescape(testInfo);
  }

  var expected = concatenateCharacterTokens(testInfo['output']);
  testInfo.putIfAbsent('lastStartTag', () => null);

  var parser = new TokenizerTestParser(
      testInfo['initialState'], testInfo['lastStartTag']);
  var received = normalizeTokens(
      concatenateCharacterTokens(parser.parse(testInfo['input'])));

  var errorMsg = ["\n\nInitial state:",
      testInfo['initialState'],
      "\nInput:", testInfo['input'],
      "\nExpected:", expected,
      "\nreceived:", received].map((s) => '$s').join('\n');

  var ignoreErrorOrder = testInfo['ignoreErrorOrder'];
  if (ignoreErrorOrder == null) ignoreErrorOrder = false;

  // Errors themselves are ignored here; only their ordering may matter.
  expectTokensMatch(expected, received, ignoreErrorOrder, true, errorMsg);
}
| 208 |
/// Decodes the double-escaped strings in [testInfo] (input, token data, and
/// attribute names/values), mutating and returning the map.
Map unescape(Map testInfo) {
  // TODO(sigmundch,jmesserly): we currently use json.parse to unescape the
  // unicode characters in the string, we should use a decoding that works with
  // any control characters.
  decode(inp) => inp == '\u0000' ? inp : json.parse('"$inp"');

  testInfo["input"] = decode(testInfo["input"]);
  for (var token in testInfo["output"]) {
    if (token == "ParseError") {
      continue;
    }
    token[1] = decode(token[1]);
    if (token.length > 2) {
      var attrs = token[2];
      // Fix: iterate over a snapshot of the keys. The previous code walked
      // token[2] directly while removing and re-inserting entries, which
      // mutates the collection during iteration (and treated the attribute
      // map as if it were an iterable of [key, value] pairs).
      for (var key in attrs.keys.toList()) {
        var value = attrs.remove(key);
        attrs[decode(key)] = decode(value);
      }
    }
  }
  return testInfo;
}
| 233 |
| 234 |
/// Converts a space/punctuation separated name such as "Data state" into
/// lowerCamelCase ("dataState").
///
/// The input is lowercased first; each non-word separator is removed and the
/// following word is capitalized.
String camelCase(String s) {
  s = s.toLowerCase();
  var result = new StringBuffer();
  for (var match in new RegExp(r"\W+(\w)(\w+)").allMatches(s)) {
    // The text before the first separator is kept as-is (already lowercase).
    if (result.isEmpty) result.write(s.substring(0, match.start));
    result.write('${match[1]}'.toUpperCase());
    result.write(match[2]);
  }
  // Fix: an input with no separators never matches the regexp, and the old
  // code returned "" in that case. Fall back to the lowercased input.
  return result.isEmpty ? s : result.toString();
}
| 245 |
/// Loads every tokenizer `.test` data file and registers one unittest case
/// per (test, initial state) combination.
void main() {
  for (var path in getDataFiles('tokenizer')) {
    if (!path.endsWith('.test')) continue;

    var tests = json.parse(new File(path).readAsStringSync());
    var testList = tests['tests'];
    if (testList == null) continue;
    var testName = pathos.basenameWithoutExtension(path);

    group(testName, () {
      for (final testInfo in testList) {
        // Default to the tokenizer's normal starting state.
        testInfo.putIfAbsent("initialStates", () => ["Data state"]);
        for (var initialState in testInfo["initialStates"]) {
          test(testInfo["description"], () {
            testInfo["initialState"] = camelCase(initialState);
            runTokenizerTest(testInfo);
          });
        }
      }
    });
  }
}
OLD | NEW |