third_party/twisted_8_1/twisted/web/sux.py - Issue 12261012: Remove third_party/twisted_8_1

Side by Side Diff: third_party/twisted_8_1/twisted/web/sux.py

Issue 12261012: Remove third_party/twisted_8_1 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/build

Patch Set: Created 7 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 # -*- test-case-name: twisted.web.test.test_xml -*-

2 #

3 # Copyright (c) 2001-2004 Twisted Matrix Laboratories.

4 # See LICENSE for details.

5

6

7 """

8 Small, Uncomplicated XML.

9

10 This is a very simple implementation of XML/HTML as a network

11 protocol. It is not at all clever. Its main features are that it

12 does not:

13

14 - support namespaces

15 - mung mnemonic entity references

16 - validate

17 - perform any external actions (such as fetching URLs or writing files)

18 under any circumstances

19 - has lots and lots of horrible hacks for supporting broken HTML (as an

20 option, they're not on by default).

21 """

22

23 from twisted.internet.protocol import Protocol, FileWrapper

24 from twisted.python.reflect import prefixedMethodNames

25

26

27

# Positions of the begin/do/end handlers inside each three-tuple stored in
# the parser's state table.
BEGIN_HANDLER, DO_HANDLER, END_HANDLER = 0, 1, 2

# Non-alphanumeric characters accepted in tag and attribute names.
identChars = '.-_:'
# Extra characters tolerated by the lenient attribute-handling states.
lenientIdentChars = identChars + ';+#/%~'

35

def nop(*args, **kw):
    """
    Do nothing.

    Accepts and ignores any positional and keyword arguments so that it can
    stand in for a missing begin/do/end state handler, which are called with
    one byte or with no arguments at all.
    """

38

39

def unionlist(*args):
    """
    Return the distinct items found across all of the given sequences.

    Duplicates are collapsed by using the items as dictionary keys, so every
    item must be hashable; the result is whatever C{dict.keys()} yields.
    """
    seen = {}
    for sequence in args:
        for item in sequence:
            seen[item] = 1
    return seen.keys()

46

47

def zipfndict(*args, **kw):
    """
    Zip several dictionaries together key by key.

    @param args: any number of dictionaries.
    @param kw: may contain C{default}, the value substituted for a key that
        a particular dictionary lacks (defaults to L{nop}).
    @return: a dictionary mapping every key present in at least one input
        to a tuple holding, in argument order, each input's value for that
        key (or the default).
    """
    default = kw.get('default', nop)
    d = {}
    for key in unionlist(*[fndict.keys() for fndict in args]):
        d[key] = tuple([x.get(key, default) for x in args])
    return d

54

55

def prefixedMethodClassDict(clazz, prefix):
    """
    Map un-prefixed method names to the matching attributes of a class.

    @param clazz: the class to inspect.
    @param prefix: the method-name prefix to look for, e.g. C{'do_'}.
    @return: a dictionary whose keys are the names reported by
        C{prefixedMethodNames(clazz, prefix)} and whose values are the
        corresponding C{prefix + name} attributes fetched from C{clazz}.
    """
    return dict([(name, getattr(clazz, prefix + name))
                 for name in prefixedMethodNames(clazz, prefix)])

58

59

def prefixedMethodObjDict(obj, prefix):
    """
    Map un-prefixed method names to the matching attributes of an instance.

    @param obj: the instance to inspect; names are discovered on its class.
    @param prefix: the method-name prefix to look for, e.g. C{'do_'}.
    @return: a dictionary whose keys are the names reported by
        C{prefixedMethodNames(obj.__class__, prefix)} and whose values are
        the corresponding C{prefix + name} attributes fetched from C{obj}.
    """
    return dict([(name, getattr(obj, prefix + name))
                 for name in prefixedMethodNames(obj.__class__, prefix)])

62

63

class ParseError(Exception):
    """
    Raised when the parser meets input it cannot accept.

    @ivar filename: name of the document being parsed.
    @ivar line: line number at which the problem was detected.
    @ivar col: column number at which the problem was detected.
    @ivar message: human-readable description of the problem.
    """

    def __init__(self, filename, line, col, message):
        # Populate Exception.args as well, so that repr() and pickling carry
        # the details instead of an empty tuple.
        Exception.__init__(self, filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self):
        """Format the error as C{filename:line:col: message}."""
        return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
                                 self.message)

75

class XMLParser(Protocol):
    """
    Character-at-a-time XML/HTML tokenizer driven by a table of state methods.

    A parser state C{name} is implemented by up to three methods:
    C{begin_name(byte)} runs when the state is entered and receives the byte
    that caused the transition; C{do_name(byte)} runs for every byte read
    while in the state and returns the name of the next state (or C{None} to
    stay); C{end_name()} runs when the state is left. Handlers a state does
    not define default to L{nop}.

    Parse events are reported through the C{got*} methods at the bottom of
    the class; most default implementations simply print what they were
    given.
    """

    state = None
    encodings = None
    filename = "<xml />"
    # When true, the lenient recovery paths for malformed HTML are enabled.
    beExtremelyLenient = 0
    # Byte-order mark re-attached to every chunk before UTF-16 decoding.
    _prepend = None
    # Per-instance cache filled in by _buildStateTable.
    _stateTable = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    attrname = ''
    attrval = ''

    def connectionMade(self):
        """Reset the position counters and the list of detected encodings."""
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        '''Get the line number and column of the last character parsed'''
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """Raise a L{ParseError} tagged with the filename and current mark."""
        lineno, colno = self.saveMark()
        raise ParseError(self.filename, lineno, colno, message)

    def _buildStateTable(self):
        '''Return a dictionary of begin, do, end state function tuples'''
        # The table holds methods bound to this instance, so it is cached on
        # the instance. The previous per-class cache was read through the
        # literal name '__stateTable' but written through a name-mangled
        # attribute, so it never hit (and sharing bound methods between
        # instances would have been wrong if it had).
        stateTable = self._stateTable
        if stateTable is None:
            stateTable = self._stateTable = zipfndict(
                *[prefixedMethodObjDict(self, prefix)
                  for prefix in ('begin_', 'do_', 'end_')])
        return stateTable

    def _decode(self, data):
        """
        Decode a raw chunk using the encodings detected so far.

        The saved byte-order mark, if any, is prepended first.
        """
        if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
            # NOTE(review): this assert vanishes under -O; a chunk that
            # splits a UTF-16 code unit would then fail inside the codec.
            assert not len(data) & 1, 'UTF-16 must come in pairs for now'
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            data = unicode(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Pick the state that follows a just-completed tag.

        @return: C{'waitforendscript'} after an opening C{<script>} tag with
            no C{src} attribute, otherwise C{'bodydata'}.
        """
        if self.endtag:
            return 'bodydata'

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == 'script' and 'src' not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return 'waitforendscript'
        return 'bodydata'

    def _afterStartTag(self):
        """Return the state following a start tag's closing C{'>'}."""
        if self.beExtremelyLenient:
            return self.maybeBodyData()
        return 'bodydata'

    def dataReceived(self, data):
        """
        Run a chunk of the document through the state machine.

        On the first chunk a UTF-16 byte-order mark is detected and stripped.
        Line and column counters are kept in locals while the chunk is being
        processed and written back afterwards; C{saveMark} is temporarily
        replaced so that handlers still see the live position.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with one of these strings
            for bom in ('\xff\xfe', '\xfe\xff'):
                if data.startswith(bom):
                    self._prepend = bom
                    self.encodings.append('UTF-16')
                    data = data[2:]
                    break
            self.state = 'begin'
        if self.encodings:
            data = self._decode(data)
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function
        _saveMark = self.saveMark
        def saveMark():
            return (lineno, colno)
        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == '\n':
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.
        """
        if not self.state:
            # No data was ever received, so there is no state to finish.
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        """Skip leading whitespace and wait for the document's first '<'."""
        if byte.isspace():
            return
        if byte != '<':
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return 'bodydata'
            self._parseError("First char of document [%r] wasn't <" % (byte,))
        return 'tagstart'

    def begin_comment(self, byte):
        self.commentbuf = ''

    def do_comment(self, byte):
        """Accumulate comment text until the closing '-->' is seen."""
        self.commentbuf += byte
        if self.commentbuf.endswith('-->'):
            self.gotComment(self.commentbuf[:-3])
            return 'bodydata'

    def begin_tagstart(self, byte):
        self.tagName = ''                       # name of the tag
        self.tagAttributes = {}                 # attributes of the tag
        self.termtag = 0                        # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        """Read a tag name and work out what kind of construct follows '<'."""
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == '!--':
                return 'comment'
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return 'waitforgt'
                return 'attrs'
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == '>':
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            else:
                self.gotTagStart(self.tagName, {})
                return self._afterStartTag()
        elif byte == '/':
            if self.tagName:
                return 'afterslash'
            else:
                self.endtag = 1
        elif byte in '!?':
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == '[':
            if self.tagName == '!':
                return 'expectcdata'
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                self.bodydata = '<'
                return 'unentity'
            self._parseError('Invalid tag character: %r'% byte)

    def begin_unentity(self, byte):
        self.bodydata += byte

    def do_unentity(self, byte):
        self.bodydata += byte
        return 'bodydata'

    def end_unentity(self):
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """Match the '[CDATA[' header one byte at a time."""
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = '[CDATA['
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return 'waitforgt'
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ''
            return 'cdata'
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """Accumulate CDATA content until the closing ']]>' is seen."""
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return 'bodydata'

    def end_cdata(self):
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ''

    def do_attrs(self, byte):
        """Between attributes: find the next name or the end of the tag."""
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == '!DOCTYPE':
                return 'doctype'
            if self.tagName[0] in '!?':
                return 'waitforgt'
            return 'attrname'
        elif byte.isspace():
            return
        elif byte == '>':
            self.gotTagStart(self.tagName, self.tagAttributes)
            return self._afterStartTag()
        elif byte == '/':
            return 'afterslash'
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        self.doctype = byte

    def do_doctype(self, byte):
        if byte == '>':
            return 'bodydata'
        self.doctype += byte

    def end_doctype(self):
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """Discard everything up to and including the next '>'."""
        if byte == '>':
            if self.endtag or not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        """Accumulate an attribute name."""
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return 'beforeeq'
        elif self.beExtremelyLenient:
            if byte in '"\'':
                return 'attrval'
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            # NOTE(review): '/' is part of lenientIdentChars, so the branch
            # above swallows it and this one never runs; left as-is to keep
            # the existing behaviour.
            if byte == '/':
                self._attrname_termtag = 1
                return
            if byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))

    def do_beforeattrval(self, byte):
        """After '=': wait for the quote that opens the attribute value."""
        if byte in '"\'':
            return 'attrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return 'messyattr'
            if byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == '\\':
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)

    def begin_beforeeq(self,byte):
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """After an attribute name and whitespace: expect '='."""
        if byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                return 'attrname'
            elif byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            elif byte == '/':
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        self.quotetype = byte
        self.attrval = ''

    def do_attrval(self, byte):
        if byte == self.quotetype:
            return 'attrs'
        self.attrval += byte

    def end_attrval(self):
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ''

    def begin_messyattr(self, byte):
        self.attrval = byte

    def do_messyattr(self, byte):
        """Accumulate an unquoted attribute value (lenient mode only)."""
        if byte.isspace():
            return 'attrs'
        elif byte == '>':
            endTag = 0
            if self.attrval.endswith('/'):
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")#XXX When does this happen??
        if byte != '>':
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return 'bodydata'

    def begin_bodydata(self, byte):
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # Removes the instance attribute; the class-level None shows
            # through again afterwards.
            del self._leadingBodyData
        else:
            self.bodydata = ''

    def do_bodydata(self, byte):
        if byte == '<':
            return 'tagstart'
        if byte == '&':
            return 'entityref'
        self.bodydata += byte

    def end_bodydata(self):
        self.gotText(self.bodydata)
        self.bodydata = ''

    def do_waitforendscript(self, byte):
        if byte == '<':
            return 'waitscriptendtag'
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        self.temptagdata = ''
        self.tagName = ''
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and
        #      gotTagEnd(self.tagName)
        # 3 spaces can happen once the '/' has been read; they're ignored
        #   e.g. </ script >
        #   NOTE(review): the original comment gave '< / script >' as the
        #   example, but a space before the '/' is caught by rule 1 and sent
        #   back to the script body.
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == '/':
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not 'script'.startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return 'waitforendscript'
            elif self.tagName == 'script':
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return 'waitforgt'
        # 3
        elif byte.isspace():
            return 'waitscriptendtag'
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'

    def begin_entityref(self, byte):
        self.erefbuf = ''
        self.erefextra = '' # extra bit for lenient mode

    def do_entityref(self, byte):
        """Accumulate an entity name up to the terminating ';'."""
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return 'spacebodydata'
            self._parseError("Bad entity reference")
        elif byte != ';':
            self.erefbuf += byte
        else:
            return 'bodydata'

    def end_entityref(self):
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        self.bodydata = self.erefextra
        self.erefextra = None
    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        '''Encountered an opening tag.

        Default behaviour is to print.'''
        print('begin %s %s' % (name, attributes))

    def gotText(self, data):
        '''Encountered text

        Default behaviour is to print.'''
        print('text: %s' % (repr(data),))

    def gotEntityReference(self, entityRef):
        '''Encountered mnemonic entity reference

        Default behaviour is to print.'''
        print('entityRef: &%s;' % entityRef)

    def gotComment(self, comment):
        '''Encountered comment.

        Default behaviour is to ignore.'''
        pass

    def gotCData(self, cdata):
        '''Encountered CDATA

        Default behaviour is to call the gotText method'''
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print('!DOCTYPE %s' % (repr(doctype),))

    def gotTagEnd(self, name):
        '''Encountered closing tag

        Default behaviour is to print.'''
        print('end %s' % (name,))

638

if __name__ == '__main__':
    # Smoke test: push a small, deliberately odd document through a parser
    # wired to an in-memory transport and let the default got* handlers
    # print what they see.
    from cStringIO import StringIO
    testDocument = '''

<!DOCTYPE ignore all this shit, hah its malformed!!!!@$>
<?xml version="suck it"?>
<foo>
A
<bar />
<baz boz="buz">boz &zop;</baz>
<![CDATA[ foo bar baz ]]>
</foo>
'''
    x = XMLParser()
    x.makeConnection(FileWrapper(StringIO()))
    x.dataReceived(testDocument)

OLD	NEW

« no previous file with comments | « third_party/twisted_8_1/twisted/web/static.py ('k') | third_party/twisted_8_1/twisted/web/tap.py » ('j') | no next file with comments »