| OLD | NEW |
| (Empty) | |
| 1 """A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser. |
| 2 |
| 3 Examples |
| 4 |
| 5 This program extracts all links from a document. It will print one |
| 6 line for each link, containing the URL and the textual description |
| 7 between the <A>...</A> tags: |
| 8 |
| 9 import pullparser, sys |
| 10 f = file(sys.argv[1]) |
| 11 p = pullparser.PullParser(f) |
| 12 for token in p.tags("a"): |
| 13 if token.type == "endtag": continue |
| 14 url = dict(token.attrs).get("href", "-") |
| 15 text = p.get_compressed_text(endat=("endtag", "a")) |
| 16 print "%s\t%s" % (url, text) |
| 17 |
| 18 This program extracts the <TITLE> from the document: |
| 19 |
| 20 import pullparser, sys |
| 21 f = file(sys.argv[1]) |
| 22 p = pullparser.PullParser(f) |
| 23 if p.get_tag("title"): |
| 24 title = p.get_compressed_text() |
| 25 print "Title: %s" % title |
| 26 |
| 27 |
| 28 Copyright 2003-2006 John J. Lee <jjl@pobox.com> |
| 29 Copyright 1998-2001 Gisle Aas (original libwww-perl code) |
| 30 |
| 31 This code is free software; you can redistribute it and/or modify it |
| 32 under the terms of the BSD or ZPL 2.1 licenses. |
| 33 |
| 34 """ |
| 35 |
| 36 import re, htmlentitydefs |
| 37 import _sgmllib_copy as sgmllib |
| 38 import HTMLParser |
| 39 from xml.sax import saxutils |
| 40 |
| 41 from _html import unescape, unescape_charref |
| 42 |
| 43 |
class NoMoreTokensError(Exception):
    """Raised when the parser's token stream has been exhausted."""
| 45 |
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        # Tuple-style unpacking: type, data, attrs = token
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        # Compare against any 3-sequence (e.g. a plain tuple).
        type, data, attrs = other
        return (self.type == type and
                self.data == data and
                self.attrs == attrs)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r, %r, %r)" % (
            self.__class__.__name__, self.type, self.data, self.attrs)

    def __str__(self):
        """
        >>> print Token("starttag", "br")
        <br>
        >>> print Token("starttag", "a",
        ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
        <a href="http://www.python.org/" alt='"foo"'>
        >>> print Token("startendtag", "br")
        <br />
        >>> print Token("startendtag", "br", [("spam", "eggs")])
        <br spam="eggs" />
        >>> print Token("endtag", "p")
        </p>
        >>> print Token("charref", "38")
        &#38;
        >>> print Token("entityref", "amp")
        &amp;
        >>> print Token("data", "foo\\nbar")
        foo
        bar
        >>> print Token("comment", "Life is a bowl\\nof cherries.")
        <!--Life is a bowl
        of cherries.-->
        >>> print Token("decl", "decl")
        <!decl>
        >>> print Token("pi", "pi")
        <?pi>
        """
        if self.attrs is None:
            attrs = ""
        else:
            # quoteattr chooses appropriate quoting for attribute values.
            attrs = "".join([" %s=%s" % (key, saxutils.quoteattr(value))
                             for key, value in self.attrs])
        # Map token type to its serialized form; only the tag types use
        # the attribute string.
        templates = {
            "starttag": "<%(data)s%(attrs)s>",
            "startendtag": "<%(data)s%(attrs)s />",
            "endtag": "</%(data)s>",
            "charref": "&#%(data)s;",
            "entityref": "&%(data)s;",
            "data": "%(data)s",
            "comment": "<!--%(data)s-->",
            "decl": "<!%(data)s>",
            "pi": "<?%(data)s>",
            }
        assert self.type in templates
        return templates[self.type] % {"data": self.data, "attrs": attrs}
| 142 |
| 143 |
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) repeatedly until it raises `exception`.

    The exception is swallowed and simply terminates the iteration.

    Note: we must end the generator with ``return`` rather than
    ``raise StopIteration`` -- under PEP 479 (Python 3.7+) a StopIteration
    raised inside a generator body is converted to RuntimeError.
    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            return
| 150 |
| 151 |
class _AbstractParser:
    """Shared pull-parsing machinery for PullParser and TolerantPullParser.

    The mixed-in HTML parser class calls the handle_* methods below, which
    push Token objects onto an internal FIFO; the public get_* methods pop
    tokens off, feeding more data from the underlying file as required.
    """
    # Number of bytes read from fh each time the token stack runs dry.
    chunk = 1024
    # Used by get_compressed_text() to collapse runs of whitespace.
    compress_re = re.compile(r"\s+")
    # Default textify mapping.  A fresh copy is taken per instance in
    # __init__ so that mutating one parser's public .textify attribute
    # cannot leak into the default seen by later instances (this used to
    # be a shared mutable default argument).
    _DEFAULT_TEXTIFY = {"img": "alt", "applet": "alt"}

    def __init__(self, fh, textify=None, encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
        read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
        to represent opening tags as text; defaults to
        {"img": "alt", "applet": "alt"}
        encoding: encoding used to encode numeric character references by
        .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
        definitions (a sensible default is used). This is used to unescape
        entities in .get_text() (and .get_compressed_text()) and attribute
        values. If the encoding can not represent the character, the entity
        reference is left unescaped. Note that entity references (both
        numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
        unescaped in attribute values and the return value of .get_text(), but
        not in data outside of tags. Instead, entity references outside of
        tags are represented as tokens. This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text. The corresponding value is
        used to specify which tag attribute to obtain the text from. textify
        maps from element names to either:

        - an HTML attribute name, in which case the HTML attribute value is
          used as its text value along with the element name in square
          brackets (e.g. "alt text goes here[IMG]", or, if the alt attribute
          were missing, just "[IMG]")
        - a callable object (e.g. a function) which takes a Token and returns
          the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        self._fh = fh
        self._tokenstack = []  # FIFO of parsed-but-unconsumed Tokens
        if textify is None:
            # Copy so instances never share (and never mutate) the default.
            textify = self._DEFAULT_TEXTIFY.copy()
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        """Iterate over tag Tokens, optionally restricted to given names."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Iterate over Tokens, optionally restricted to given token types."""
        return iter_until_exception(self.get_token, NoMoreTokensError,
                                    *tokentypes)

    def next(self):
        # Python 2 iterator protocol: translate token exhaustion into
        # end-of-iteration.
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens representing other elements will be
        skipped.  Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                # No tokentypes given means "any token type".
                if not tokentypes or token.type in tokentypes:
                    return token
            # Stack exhausted: feed the parser another chunk of input.
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which
        the caller is interested: tags representing other elements will be
        skipped.  Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ("starttag", "endtag", "startendtag"):
                continue
            if not names or tok.data in names:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
        returned text); endtag is a tuple (type, name) where type is
        "starttag", "endtag" or "startendtag", and name is the element name of
        the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is
        raised).  Note that .get_text() includes the text representation (if
        any) of the opening tag, but pushes the opening tag back onto the
        stack.  As a result, if you want to call .get_text() again, you need
        to call .get_tag() first (unless you want an empty string returned
        when you next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text:
        see the class docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs,
                             self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ("starttag", "endtag", "startendtag"):
                tag_name = tok.data
                if tok.type in ("starttag", "startendtag"):
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            # Use the named attribute's value (if present),
                            # then the element name in square brackets.
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                # Stop at any tag when endat is None, else only at the
                # requested (type, name) pair; the tag is pushed back so
                # the caller can still retrieve it.
                if endat is None or endat == (tok.type, tag_name):
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to
        a single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # Parser callbacks: each one wraps the event in a Token and appends it
    # to the FIFO for later consumption by get_token() and friends.
    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))
    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))
    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))
    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))
    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))
    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))
    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))
    def unknown_decl(self, data):
        # XXX should this call self.error instead of silently treating the
        # unknown declaration as an ordinary "decl" token?
        self._tokenstack.append(Token("decl", data))
    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape entity/char references in a single attribute value."""
        return unescape(name, self._entitydefs, self.encoding)

    def unescape_attrs(self, attrs):
        """Return attrs with every value passed through unescape_attr()."""
        unescaped_attrs = []
        for key, val in attrs:
            unescaped_attrs.append((key, self.unescape_attr(val)))
        return unescaped_attrs
| 365 |
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    # Pull parser built on the stdlib HTMLParser.HTMLParser.
    # NOTE(review): presumably stricter about malformed markup than the
    # sgmllib-based TolerantPullParser below -- confirm against callers.
    def __init__(self, *args, **kwds):
        # Initialise the HTML parser machinery first, then the shared
        # token-stack state; both base-class __init__s must run.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unescape(self, name):
        # Use the entitydefs passed into constructor, not
        # HTMLParser.HTMLParser's entitydefs.
        return self.unescape_attr(name)
| 374 |
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on sgmllib.SGMLParser."""

    def __init__(self, *args, **kwds):
        # Both base-class initialisers must run: SGMLParser sets up the
        # parsing machinery, _AbstractParser the token stack.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        # SGMLParser reports unrecognised tags here; unescape attribute
        # values before stacking the token.
        self._tokenstack.append(
            Token("starttag", tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
| 384 |
| 385 |
def _test():
    """Run the doctests embedded in this module's docstrings."""
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)


if __name__ == "__main__":
    _test()
| OLD | NEW |