| OLD | NEW |
| (Empty) |
| 1 # -*- test-case-name: twisted.web.test.test_xml -*- | |
| 2 # | |
| 3 # Copyright (c) 2001-2004 Twisted Matrix Laboratories. | |
| 4 # See LICENSE for details. | |
| 5 | |
| 6 | |
| 7 """ | |
| 8 *S*mall, *U*ncomplicated *X*ML. | |
| 9 | |
| 10 This is a very simple implementation of XML/HTML as a network | |
| 11 protocol. It is not at all clever. Its main features are that it | |
| 12 does not: | |
| 13 | |
| 14 - support namespaces | |
| 15 - mung mnemonic entity references | |
| 16 - validate | |
| 17 - perform *any* external actions (such as fetching URLs or writing files) | |
| 18 under *any* circumstances | |
| 19 - has lots and lots of horrible hacks for supporting broken HTML (as an | |
| 20 option, they're not on by default). | |
| 21 """ | |
| 22 | |
| 23 from twisted.internet.protocol import Protocol, FileWrapper | |
| 24 from twisted.python.reflect import prefixedMethodNames | |
| 25 | |
| 26 | |
| 27 | |
# Elements of the three-tuples in the state table.
# Each state maps to (begin handler, do handler, end handler); these are the
# indexes into that tuple.
BEGIN_HANDLER = 0
DO_HANDLER = 1
END_HANDLER = 2

# Punctuation accepted, alongside alphanumerics, in tag and attribute names.
identChars = '.-_:'
# Extra punctuation tolerated in attribute names and unquoted attribute
# values, only consulted when the parser's beExtremelyLenient flag is set.
lenientIdentChars = identChars + ';+#/%~'
| 35 | |
def nop(*args, **kw):
    """
    Accept any arguments, ignore them and return None.

    Used as the placeholder handler for states that lack a begin, do or
    end method.
    """
    return None
| 38 | |
| 39 | |
def unionlist(*args):
    """
    Return the distinct items found across all of the given iterables.

    Items must be hashable; duplicates (within one iterable or across
    several) are collapsed into a single entry.
    """
    seen = {}
    for sequence in args:
        for item in sequence:
            seen[item] = 1
    return seen.keys()
| 46 | |
| 47 | |
def zipfndict(*args, **kw):
    """
    Zip several dictionaries together, key by key.

    The result maps every key present in any of the given dictionaries to a
    tuple holding, in argument order, each dictionary's value for that key.
    Dictionaries lacking the key contribute the C{default} keyword argument
    instead, which itself defaults to L{nop}.
    """
    default = kw.get('default', nop)
    allKeys = unionlist(*[fndict.keys() for fndict in args])
    zipped = {}
    for key in allKeys:
        column = []
        for fndict in args:
            column.append(fndict.get(key, default))
        zipped[key] = tuple(column)
    return zipped
| 54 | |
| 55 | |
def prefixedMethodClassDict(clazz, prefix):
    """
    Map un-prefixed method names to the attributes found on a class.

    For every name reported by C{prefixedMethodNames(clazz, prefix)}, the
    result holds C{getattr(clazz, prefix + name)} under the key C{name}.
    """
    methods = {}
    for name in prefixedMethodNames(clazz, prefix):
        methods[name] = getattr(clazz, prefix + name)
    return methods
| 58 | |
| 59 | |
def prefixedMethodObjDict(obj, prefix):
    """
    Map un-prefixed method names to the attributes found on an instance.

    The names are discovered on C{obj.__class__} via C{prefixedMethodNames},
    but each value is fetched from C{obj} itself as
    C{getattr(obj, prefix + name)}.
    """
    methods = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        methods[name] = getattr(obj, prefix + name)
    return methods
| 62 | |
| 63 | |
class ParseError(Exception):
    """
    Raised when the parser meets input it cannot accept.

    @ivar filename: name of the document being parsed.
    @ivar line: line number at which the problem was detected.
    @ivar col: column number at which the problem was detected.
    @ivar message: human-readable description of the problem.
    """

    def __init__(self, filename, line, col, message):
        # Hand the values to the base class as well, so that ``args`` is
        # populated and repr(), pickling and generic exception logging keep
        # the position information instead of seeing an empty exception.
        Exception.__init__(self, filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self):
        """Format the error as C{filename:line:col: message}."""
        return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
                                 self.message)
| 75 | |
class XMLParser(Protocol):
    """
    A character-at-a-time state machine for XML and (optionally) sloppy HTML.

    Every state C{NAME} may provide up to three methods: C{begin_NAME(byte)},
    called on entry with the character that caused the transition;
    C{do_NAME(byte)}, called for each subsequent character and returning the
    name of the next state (or C{None} to stay put); and C{end_NAME()},
    called on exit.  Missing handlers are replaced by L{nop}.

    Parse results are reported through the C{got*} methods, which subclasses
    are expected to override.

    @ivar state: name of the current state, or C{None} before any data has
        been received.
    @ivar encodings: list of encodings applied, in order, to incoming data.
    @ivar filename: document name used in L{ParseError} messages.
    @ivar beExtremelyLenient: when true, tolerate a range of malformed HTML
        constructs instead of raising L{ParseError}.
    """

    state = None
    encodings = None
    filename = "<xml />"
    beExtremelyLenient = 0
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    # Per-instance cache of the state table built by _buildStateTable.
    _cachedStateTable = None

    def connectionMade(self):
        """Reset the position counters and the list of encodings."""
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        '''Get the line number and column of the last character parsed'''
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """
        Raise a L{ParseError} for the current file name and position.
        """
        line, col = self.saveMark()
        raise ParseError(self.filename, line, col, message)

    def _buildStateTable(self):
        '''Return a dictionary of begin, do, end state function tuples'''
        # Building the table means reflecting over the class three times, so
        # it is cached.  The cache lives on the instance, not the class,
        # because the table holds methods bound to this particular parser.
        # (The previous class-level cache was written through a name-mangled
        # attribute but read back un-mangled, so it never actually hit.)
        stateTable = self._cachedStateTable
        if stateTable is None:
            stateTable = self._cachedStateTable = zipfndict(
                *[prefixedMethodObjDict(self, prefix)
                  for prefix in ('begin_', 'do_', 'end_')])
        return stateTable

    def _decode(self, data):
        """
        Decode C{data} through every encoding in C{self.encodings}, in order.

        The byte-order mark stripped off by L{dataReceived} is put back in
        front of the data first, when one was seen.
        """
        if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
            assert not len(data) & 1, 'UTF-16 must come in pairs for now'
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            data = unicode(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Pick the state that follows the end of a tag in lenient mode.

        @return: C{'waitforendscript'} after an opening C{<script>} tag that
            has no C{src} attribute, C{'bodydata'} otherwise.
        """
        if self.endtag:
            return 'bodydata'

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == 'script' and 'src' not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return 'waitforendscript'
        return 'bodydata'

    def dataReceived(self, data):
        """
        Feed a chunk of the document through the state machine.

        On the very first chunk a UTF-16 byte-order mark, if present, is
        detected and stripped, and the machine enters the C{'begin'} state.
        Each character is then handed to the current state's C{do_} handler;
        when that returns the name of a different state, the old state's
        C{end_} handler runs and the new state's C{begin_} handler receives
        the same character.

        @raise ParseError: from the state handlers, on malformed input.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith('\xff\xfe'):
                self._prepend = '\xff\xfe'
                self.encodings.append('UTF-16')
                data = data[2:]
            elif data.startswith('\xfe\xff'):
                self._prepend = '\xfe\xff'
                self.encodings.append('UTF-16')
                data = data[2:]
            self.state = 'begin'
        if self.encodings:
            data = self._decode(data)
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function, so that handlers
        # asking for the position see the local counters updated below
        _saveMark = self.saveMark
        def saveMark():
            return (lineno, colno)
        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == '\n':
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.
        """
        if self.state is None:
            # No data was ever received, so no state was entered and there
            # is nothing to end.
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        """Skip leading whitespace and expect the first C{<}."""
        if byte.isspace():
            return
        if byte != '<':
            if self.beExtremelyLenient:
                # Remember the character so begin_bodydata can keep it.
                self._leadingBodyData = byte
                return 'bodydata'
            self._parseError("First char of document [%r] wasn't <" % (byte,))
        return 'tagstart'

    def begin_comment(self, byte):
        """Start collecting the text of a comment."""
        self.commentbuf = ''

    def do_comment(self, byte):
        """Collect comment text until the closing C{-->} is seen."""
        self.commentbuf += byte
        if self.commentbuf.endswith('-->'):
            self.gotComment(self.commentbuf[:-3])
            return 'bodydata'

    def begin_tagstart(self, byte):
        """Reset the per-tag bookkeeping at the start of a tag."""
        self.tagName = ''        # name of the tag
        self.tagAttributes = {}  # attributes of the tag
        self.termtag = 0         # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        """Accumulate the tag name and decide what kind of tag this is."""
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == '!--':
                return 'comment'
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return 'waitforgt'
                return 'attrs'
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == '>':
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            else:
                self.gotTagStart(self.tagName, {})
                if not self.beExtremelyLenient:
                    return 'bodydata'
                return self.maybeBodyData()
        elif byte == '/':
            if self.tagName:
                return 'afterslash'
            else:
                self.endtag = 1
        elif byte in '!?':
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == '[':
            if self.tagName == '!':
                return 'expectcdata'
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                # Treat the stray '<' as literal text.
                self.bodydata = '<'
                return 'unentity'
            self._parseError('Invalid tag character: %r' % byte)

    def begin_unentity(self, byte):
        """Keep the character that turned out not to start a tag."""
        self.bodydata += byte

    def do_unentity(self, byte):
        """Keep one more character, then resume normal body data."""
        self.bodydata += byte
        return 'bodydata'

    def end_unentity(self):
        """Report the text collected for the stray C{<}."""
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        """Start matching the C{[CDATA[} header."""
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """Match the C{[CDATA[} header one character at a time."""
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = '[CDATA['
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return 'waitforgt'
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ''
            return 'cdata'
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """Collect CDATA content until the closing C{]]>} is seen."""
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return 'bodydata'

    def end_cdata(self):
        """Report the collected CDATA section."""
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ''

    def do_attrs(self, byte):
        """Between attributes: find the next attribute or the end of the tag."""
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == '!DOCTYPE':
                return 'doctype'
            if self.tagName[0] in '!?':
                return 'waitforgt'
            return 'attrname'
        elif byte.isspace():
            return
        elif byte == '>':
            self.gotTagStart(self.tagName, self.tagAttributes)
            if not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()
        elif byte == '/':
            return 'afterslash'
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        """Start collecting the DOCTYPE contents."""
        self.doctype = byte

    def do_doctype(self, byte):
        """Collect DOCTYPE contents up to the closing C{>}."""
        if byte == '>':
            return 'bodydata'
        self.doctype += byte

    def end_doctype(self):
        """Report the collected DOCTYPE contents."""
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """Discard everything up to the next C{>}."""
        if byte == '>':
            if self.endtag or not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        """Start collecting an attribute name."""
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        """Collect an attribute name up to C{=} or whitespace."""
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return 'beforeeq'
        elif self.beExtremelyLenient:
            if byte in '"\'':
                return 'attrval'
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            if byte == '/':
                self._attrname_termtag = 1
                return
            if byte == '>':
                # A value-less attribute is recorded with the value 'True'.
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))

    def do_beforeattrval(self, byte):
        """After C{=}: expect the opening quote of the attribute value."""
        if byte in '"\'':
            return 'attrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return 'messyattr'
            if byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == '\\':
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)

    # Class-level defaults for the attribute currently being parsed.
    attrname = ''
    attrval = ''

    def begin_beforeeq(self, byte):
        """Whitespace followed the attribute name; now expect C{=}."""
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """Skip whitespace between an attribute name and its C{=}."""
        if byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                # The previous attribute had no value; record it as 'True'
                # and start on the next attribute name.
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                return 'attrname'
            elif byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            elif byte == '/':
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        """Remember the opening quote and start collecting the value."""
        self.quotetype = byte
        self.attrval = ''

    def do_attrval(self, byte):
        """Collect a quoted attribute value up to the matching quote."""
        if byte == self.quotetype:
            return 'attrs'
        self.attrval += byte

    def end_attrval(self):
        """Store the finished attribute and clear the scratch fields."""
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ''

    def begin_messyattr(self, byte):
        """Start collecting an unquoted attribute value (lenient mode)."""
        self.attrval = byte

    def do_messyattr(self, byte):
        """Collect an unquoted attribute value up to whitespace or C{>}."""
        if byte.isspace():
            return 'attrs'
        elif byte == '>':
            endTag = 0
            if self.attrval.endswith('/'):
                # A trailing slash means the tag closes itself; it is not
                # part of the value.
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        """Store the unquoted attribute value, if it is non-empty."""
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        """A C{/} was seen inside a tag; expect the closing C{>}."""
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        """Finish a self-terminating tag."""
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != '>':
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return 'bodydata'

    def begin_bodydata(self, byte):
        """Start a run of text, seeded with any character read ahead."""
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # Drop the instance attribute so the class default (None) shows
            # through again.
            del self._leadingBodyData
        else:
            self.bodydata = ''

    def do_bodydata(self, byte):
        """Collect text until a tag or entity reference begins."""
        if byte == '<':
            return 'tagstart'
        if byte == '&':
            return 'entityref'
        self.bodydata += byte

    def end_bodydata(self):
        """Report the collected text."""
        self.gotText(self.bodydata)
        self.bodydata = ''

    def do_waitforendscript(self, byte):
        """Collect script text until something that may be C{</script>}."""
        if byte == '<':
            return 'waitscriptendtag'
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        """A C{<} was seen inside a script; start checking for C{</script>}."""
        self.temptagdata = ''
        self.tagName = ''
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        """Decide whether the C{<} seen inside a script begins C{</script>}."""
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and
        #      gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        #   (a space before the '/' is caught by check 1 and treated as
        #   script text)
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == '/':
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not 'script'.startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return 'waitforendscript'
            elif self.tagName == 'script':
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return 'waitforgt'
        # 3
        elif byte.isspace():
            return 'waitscriptendtag'
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'

    def begin_entityref(self, byte):
        """Start collecting the name of an entity reference."""
        self.erefbuf = ''
        self.erefextra = ''  # extra bit for lenient mode

    def do_entityref(self, byte):
        """Collect an entity reference name up to the closing C{;}."""
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return 'spacebodydata'
            self._parseError("Bad entity reference")
        elif byte != ';':
            self.erefbuf += byte
        else:
            return 'bodydata'

    def end_entityref(self):
        """Report the collected entity reference."""
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        """Resume body data, seeded with the text that followed a bare C{&}."""
        self.bodydata = self.erefextra
        self.erefextra = None
    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        '''Encountered an opening tag.

        Default behaviour is to print.'''
        print('begin %s %s' % (name, attributes))

    def gotText(self, data):
        '''Encountered text

        Default behaviour is to print.'''
        print('text: %r' % (data,))

    def gotEntityReference(self, entityRef):
        '''Encountered mnemonic entity reference

        Default behaviour is to print.'''
        print('entityRef: &%s;' % entityRef)

    def gotComment(self, comment):
        '''Encountered comment.

        Default behaviour is to ignore.'''
        pass

    def gotCData(self, cdata):
        '''Encountered CDATA

        Default behaviour is to call the gotText method'''
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print('!DOCTYPE %r' % (doctype,))

    def gotTagEnd(self, name):
        '''Encountered closing tag

        Default behaviour is to print.'''
        print('end %s' % (name,))
| 638 | |
if __name__ == '__main__':
    # Ad-hoc demonstration: push a small sample document through a plain
    # XMLParser, whose default got* callbacks print what they encounter.
    from cStringIO import StringIO
    testDocument = '''

<!DOCTYPE ignore all this shit, hah its malformed!!!!@$>
<?xml version="suck it"?>
<foo>
A
<bar />
<baz boz="buz">boz &zop;</baz>
<![CDATA[ foo bar baz ]]>
</foo>
'''
    x = XMLParser()
    # The parser only needs some transport to be connected to; an empty
    # in-memory file wrapped as a transport serves that purpose.
    x.makeConnection(FileWrapper(StringIO()))
    # fn = "/home/glyph/Projects/Twisted/doc/howto/ipc10paper.html"
    # NOTE(review): fn is only read by the commented-out line below.
    fn = "/home/glyph/gruesome.xml"
    # testDocument = open(fn).read()
    x.dataReceived(testDocument)
| OLD | NEW |