| Index: Tools/Scripts/webkitpy/thirdparty/mechanize/_pullparser.py
|
| diff --git a/Tools/Scripts/webkitpy/thirdparty/mechanize/_pullparser.py b/Tools/Scripts/webkitpy/thirdparty/mechanize/_pullparser.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..1f212c1512bacd2d3ef95e51ba9d578ce0adfbf5
|
| --- /dev/null
|
| +++ b/Tools/Scripts/webkitpy/thirdparty/mechanize/_pullparser.py
|
| @@ -0,0 +1,391 @@
|
| +"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
|
| +
|
| +Examples
|
| +
|
| +This program extracts all links from a document. It will print one
|
| +line for each link, containing the URL and the textual description
|
| +between the <A>...</A> tags:
|
| +
|
| +import pullparser, sys
|
| +f = file(sys.argv[1])
|
| +p = pullparser.PullParser(f)
|
| +for token in p.tags("a"):
|
| + if token.type == "endtag": continue
|
| + url = dict(token.attrs).get("href", "-")
|
| + text = p.get_compressed_text(endat=("endtag", "a"))
|
| + print "%s\t%s" % (url, text)
|
| +
|
| +This program extracts the <TITLE> from the document:
|
| +
|
| +import pullparser, sys
|
| +f = file(sys.argv[1])
|
| +p = pullparser.PullParser(f)
|
| +if p.get_tag("title"):
|
| + title = p.get_compressed_text()
|
| + print "Title: %s" % title
|
| +
|
| +
|
| +Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
| +Copyright 1998-2001 Gisle Aas (original libwww-perl code)
|
| +
|
| +This code is free software; you can redistribute it and/or modify it
|
| +under the terms of the BSD or ZPL 2.1 licenses.
|
| +
|
| +"""
|
| +
|
| +import re, htmlentitydefs
|
| +import _sgmllib_copy as sgmllib
|
| +import HTMLParser
|
| +from xml.sax import saxutils
|
| +
|
| +from _html import unescape, unescape_charref
|
| +
|
| +
|
| +class NoMoreTokensError(Exception): pass
|
| +
|
| +class Token:
|
| + """Represents an HTML tag, declaration, processing instruction etc.
|
| +
|
| + Behaves as a tuple-like object (i.e. it is iterable) and also exposes the
|
| + attributes .type, .data and .attrs.
|
| +
|
| + >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
|
| + >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
|
| + True
|
| + >>> (t.type, t.data) == ("starttag", "a")
|
| + True
|
| + >>> t.attrs == [("href", "http://www.python.org/")]
|
| + True
|
| +
|
| + Public attributes
|
| +
|
| + type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
|
| + "data", "comment", "decl", "pi", after the corresponding methods of
|
| + HTMLParser.HTMLParser
|
| + data: For a tag, the tag name; otherwise, the relevant data carried by the
|
| + token, as a string
|
| + attrs: list of (name, value) pairs representing HTML attributes
|
| + (or None if token does not represent an opening tag)
|
| +
|
| + """
|
| + def __init__(self, type, data, attrs=None):
|
| + self.type = type
|
| + self.data = data
|
| + self.attrs = attrs
|
| + def __iter__(self):
|
| + return iter((self.type, self.data, self.attrs))
|
| + def __eq__(self, other):
|
| + type, data, attrs = other
|
| + if (self.type == type and
|
| + self.data == data and
|
| + self.attrs == attrs):
|
| + return True
|
| + else:
|
| + return False
|
| + def __ne__(self, other): return not self.__eq__(other)
|
| + def __repr__(self):
|
| + args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
|
| + return self.__class__.__name__+"(%s)" % args
|
| +
|
| + def __str__(self):
|
| + """
|
| + >>> print Token("starttag", "br")
|
| + <br>
|
| + >>> print Token("starttag", "a",
|
| + ... [("href", "http://www.python.org/"), ("alt", '"foo"')])
|
| + <a href="http://www.python.org/" alt='"foo"'>
|
| + >>> print Token("startendtag", "br")
|
| + <br />
|
| + >>> print Token("startendtag", "br", [("spam", "eggs")])
|
| + <br spam="eggs" />
|
| + >>> print Token("endtag", "p")
|
| + </p>
|
| + >>> print Token("charref", "38")
|
| + &#38;
|
| + >>> print Token("entityref", "amp")
|
| + &amp;
|
| + >>> print Token("data", "foo\\nbar")
|
| + foo
|
| + bar
|
| + >>> print Token("comment", "Life is a bowl\\nof cherries.")
|
| + <!--Life is a bowl
|
| + of cherries.-->
|
| + >>> print Token("decl", "decl")
|
| + <!decl>
|
| + >>> print Token("pi", "pi")
|
| + <?pi>
|
| + """
|
| + if self.attrs is not None:
|
| + attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
|
| + k, v in self.attrs])
|
| + else:
|
| + attrs = ""
|
| + if self.type == "starttag":
|
| + return "<%s%s>" % (self.data, attrs)
|
| + elif self.type == "startendtag":
|
| + return "<%s%s />" % (self.data, attrs)
|
| + elif self.type == "endtag":
|
| + return "</%s>" % self.data
|
| + elif self.type == "charref":
|
| + return "&#%s;" % self.data
|
| + elif self.type == "entityref":
|
| + return "&%s;" % self.data
|
| + elif self.type == "data":
|
| + return self.data
|
| + elif self.type == "comment":
|
| + return "<!--%s-->" % self.data
|
| + elif self.type == "decl":
|
| + return "<!%s>" % self.data
|
| + elif self.type == "pi":
|
| + return "<?%s>" % self.data
|
| + assert False
|
| +
|
| +
|
| +def iter_until_exception(fn, exception, *args, **kwds):
|
| + while 1:
|
| + try:
|
| + yield fn(*args, **kwds)
|
| + except exception:
|
| + raise StopIteration
|
| +
|
| +
|
| +class _AbstractParser:
|
| + chunk = 1024
|
| + compress_re = re.compile(r"\s+")
|
| + def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
|
| + encoding="ascii", entitydefs=None):
|
| + """
|
| + fh: file-like object (only a .read() method is required) from which to
|
| + read HTML to be parsed
|
| + textify: mapping used by .get_text() and .get_compressed_text() methods
|
| + to represent opening tags as text
|
| + encoding: encoding used to encode numeric character references by
|
| + .get_text() and .get_compressed_text() ("ascii" by default)
|
| +
|
| + entitydefs: mapping like {"amp": "&", ...} containing HTML entity
|
| + definitions (a sensible default is used). This is used to unescape
|
| + entities in .get_text() (and .get_compressed_text()) and attribute
|
| + values. If the encoding can not represent the character, the entity
|
| + reference is left unescaped. Note that entity references (both
|
| + numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
|
| + unescaped in attribute values and the return value of .get_text(), but
|
| + not in data outside of tags. Instead, entity references outside of
|
| + tags are represented as tokens. This is a bit odd, it's true :-/
|
| +
|
| + If the element name of an opening tag matches a key in the textify
|
| + mapping then that tag is converted to text. The corresponding value is
|
| + used to specify which tag attribute to obtain the text from. textify
|
| + maps from element names to either:
|
| +
|
| + - an HTML attribute name, in which case the HTML attribute value is
|
| + used as its text value along with the element name in square
|
| + brackets (e.g. "alt text goes here[IMG]", or, if the alt attribute
|
| + were missing, just "[IMG]")
|
| + - a callable object (e.g. a function) which takes a Token and returns
|
| + the string to be used as its text value
|
| +
|
| + If textify has no key for an element name, nothing is substituted for
|
| + the opening tag.
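|
| +
|
| + For example, this sketch (StringIO stands in here for a real file-like
|
| + object; the applet entry demonstrates the callable form) substitutes alt
|
| + text for images and a fixed marker for applets:
|
| +
|
| + import StringIO
|
| + html = '<p><img alt="logo"> spam <applet></applet></p>'
|
| + p = PullParser(StringIO.StringIO(html),
|
| +               textify={"img": "alt",
|
| +                        "applet": lambda token: "[APPLET]"})
|
| + p.get_tag("p")
|
| + print p.get_compressed_text(endat=("endtag", "p"))
|
| + # prints: logo[IMG] spam [APPLET]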
|
| +
|
| + Public attributes:
|
| +
|
| + encoding and textify: see above
|
| +
|
| + """
|
| + self._fh = fh
|
| + self._tokenstack = [] # FIFO
|
| + self.textify = textify
|
| + self.encoding = encoding
|
| + if entitydefs is None:
|
| + entitydefs = htmlentitydefs.name2codepoint
|
| + self._entitydefs = entitydefs
|
| +
|
| + def __iter__(self): return self
|
| +
|
| + def tags(self, *names):
|
| + return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
|
| +
|
| + def tokens(self, *tokentypes):
|
| + return iter_until_exception(self.get_token, NoMoreTokensError,
|
| + *tokentypes)
|
| +
|
| + def next(self):
|
| + try:
|
| + return self.get_token()
|
| + except NoMoreTokensError:
|
| + raise StopIteration()
|
| +
|
| + def get_token(self, *tokentypes):
|
| + """Pop the next Token object from the stack of parsed tokens.
|
| +
|
| + If arguments are given, they are taken to be token types in which the
|
| + caller is interested: tokens of other types will be skipped.  Token types
|
| + are lower-case strings such as "starttag", "endtag" or "data".
|
| +
|
| + Raises NoMoreTokensError.
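|
| +
|
| + A sketch (StringIO stands in here for a real file-like object):
|
| +
|
| + import StringIO
|
| + p = PullParser(StringIO.StringIO("<p>spam</p>"))
|
| + p.get_token("starttag").data   # -> "p"
|
| + p.get_token("data").data       # -> "spam"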
|
| +
|
| + """
|
| + while 1:
|
| + while self._tokenstack:
|
| + token = self._tokenstack.pop(0)
|
| + if tokentypes:
|
| + if token.type in tokentypes:
|
| + return token
|
| + else:
|
| + return token
|
| + data = self._fh.read(self.chunk)
|
| + if not data:
|
| + raise NoMoreTokensError()
|
| + self.feed(data)
|
| +
|
| + def unget_token(self, token):
|
| + """Push a Token back onto the stack."""
|
| + self._tokenstack.insert(0, token)
|
| +
|
| + def get_tag(self, *names):
|
| + """Return the next Token that represents an opening or closing tag.
|
| +
|
| + If arguments are given, they are taken to be element names in which the
|
| + caller is interested: tags representing other elements will be skipped.
|
| + Element names must be given in lower case.
|
| +
|
| + Raises NoMoreTokensError.
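|
| +
|
| + A sketch (again using StringIO as a stand-in for a real file-like
|
| + object):
|
| +
|
| + import StringIO
|
| + p = PullParser(StringIO.StringIO("<h1>Hi</h1><p>spam</p>"))
|
| + tok = p.get_tag("p")    # skips past the whole <h1> element
|
| + (tok.type, tok.data)    # -> ("starttag", "p")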
|
| +
|
| + """
|
| + while 1:
|
| + tok = self.get_token()
|
| + if tok.type not in ["starttag", "endtag", "startendtag"]:
|
| + continue
|
| + if names:
|
| + if tok.data in names:
|
| + return tok
|
| + else:
|
| + return tok
|
| +
|
| + def get_text(self, endat=None):
|
| + """Get some text.
|
| +
|
| + endat: stop reading text at this tag (the tag is included in the
|
| + returned text); endat is a tuple (type, name) where type is
|
| + "starttag", "endtag" or "startendtag", and name is the element name of
|
| + the tag (element names must be given in lower case)
|
| +
|
| + If endat is not given, .get_text() will stop at the next opening or
|
| + closing tag, or when there are no more tokens (no exception is raised).
|
| + Note that .get_text() includes the text representation (if any) of the
|
| + opening tag, but pushes the opening tag back onto the stack. As a
|
| + result, if you want to call .get_text() again, you need to call
|
| + .get_tag() first (unless you want an empty string returned when you
|
| + next call .get_text()).
|
| +
|
| + Entity references are translated using the value of the entitydefs
|
| + constructor argument (a mapping from names to characters like that
|
| + provided by the standard module htmlentitydefs). Named entity
|
| + references that are not in this mapping are left unchanged.
|
| +
|
| + The textify attribute is used to translate opening tags into text: see
|
| + the class docstring.
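|
| +
|
| + A sketch (StringIO stands in here for a real file-like object):
|
| +
|
| + import StringIO
|
| + p = PullParser(StringIO.StringIO("<p>spam &amp; eggs</p>"))
|
| + p.get_tag("p")
|
| + p.get_text(endat=("endtag", "p"))   # -> "spam & eggs"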
|
| +
|
| + """
|
| + text = []
|
| + tok = None
|
| + while 1:
|
| + try:
|
| + tok = self.get_token()
|
| + except NoMoreTokensError:
|
| + # unget last token (not the one we just failed to get)
|
| + if tok: self.unget_token(tok)
|
| + break
|
| + if tok.type == "data":
|
| + text.append(tok.data)
|
| + elif tok.type == "entityref":
|
| + t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
|
| + text.append(t)
|
| + elif tok.type == "charref":
|
| + t = unescape_charref(tok.data, self.encoding)
|
| + text.append(t)
|
| + elif tok.type in ["starttag", "endtag", "startendtag"]:
|
| + tag_name = tok.data
|
| + if tok.type in ["starttag", "startendtag"]:
|
| + alt = self.textify.get(tag_name)
|
| + if alt is not None:
|
| + if callable(alt):
|
| + text.append(alt(tok))
|
| + elif tok.attrs is not None:
|
| + for k, v in tok.attrs:
|
| + if k == alt:
|
| + text.append(v)
|
| + text.append("[%s]" % tag_name.upper())
|
| + if endat is None or endat == (tok.type, tag_name):
|
| + self.unget_token(tok)
|
| + break
|
| + return "".join(text)
|
| +
|
| + def get_compressed_text(self, *args, **kwds):
|
| + """
|
| + As .get_text(), but collapses each group of contiguous whitespace to a
|
| + single space character, and removes all initial and trailing
|
| + whitespace.
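|
| +
|
| + For example (a sketch): for the markup "<p>  spam   eggs  </p>", calling
|
| + .get_tag("p") followed by .get_compressed_text(endat=("endtag", "p"))
|
| + returns "spam eggs".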
|
| +
|
| + """
|
| + text = self.get_text(*args, **kwds)
|
| + text = text.strip()
|
| + return self.compress_re.sub(" ", text)
|
| +
|
| + def handle_startendtag(self, tag, attrs):
|
| + self._tokenstack.append(Token("startendtag", tag, attrs))
|
| + def handle_starttag(self, tag, attrs):
|
| + self._tokenstack.append(Token("starttag", tag, attrs))
|
| + def handle_endtag(self, tag):
|
| + self._tokenstack.append(Token("endtag", tag))
|
| + def handle_charref(self, name):
|
| + self._tokenstack.append(Token("charref", name))
|
| + def handle_entityref(self, name):
|
| + self._tokenstack.append(Token("entityref", name))
|
| + def handle_data(self, data):
|
| + self._tokenstack.append(Token("data", data))
|
| + def handle_comment(self, data):
|
| + self._tokenstack.append(Token("comment", data))
|
| + def handle_decl(self, decl):
|
| + self._tokenstack.append(Token("decl", decl))
|
| + def unknown_decl(self, data):
|
| + # XXX should this call self.error instead?
|
| + #self.error("unknown declaration: " + `data`)
|
| + self._tokenstack.append(Token("decl", data))
|
| + def handle_pi(self, data):
|
| + self._tokenstack.append(Token("pi", data))
|
| +
|
| + def unescape_attr(self, name):
|
| + return unescape(name, self._entitydefs, self.encoding)
|
| + def unescape_attrs(self, attrs):
|
| + escaped_attrs = []
|
| + for key, val in attrs:
|
| + escaped_attrs.append((key, self.unescape_attr(val)))
|
| + return escaped_attrs
|
| +
|
| +class PullParser(_AbstractParser, HTMLParser.HTMLParser):
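|
| + """Pull parser based on the strict HTMLParser.HTMLParser.
|
| +
|
| + May raise HTMLParser.HTMLParseError on malformed markup; see
|
| + TolerantPullParser below for a more forgiving alternative.
|
| +
|
| + """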
|
| + def __init__(self, *args, **kwds):
|
| + HTMLParser.HTMLParser.__init__(self)
|
| + _AbstractParser.__init__(self, *args, **kwds)
|
| + def unescape(self, name):
|
| + # Use the entitydefs passed into constructor, not
|
| + # HTMLParser.HTMLParser's entitydefs.
|
| + return self.unescape_attr(name)
|
| +
|
| +class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
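|
| + """Pull parser based on sgmllib.SGMLParser: tolerant of sloppy HTML.
|
| +
|
| + Unlike PullParser, it recovers from most malformed markup rather than
|
| + raising an exception.  A sketch (StringIO stands in here for a real
|
| + file-like object):
|
| +
|
| + import StringIO
|
| + p = TolerantPullParser(StringIO.StringIO("<p>unclosed <b>bold"))
|
| + p.get_tag("p")
|
| + print p.get_compressed_text(endat=("endtag", "p"))
|
| + # prints: unclosed bold
|
| +
|
| + """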
|
| + def __init__(self, *args, **kwds):
|
| + sgmllib.SGMLParser.__init__(self)
|
| + _AbstractParser.__init__(self, *args, **kwds)
|
| + def unknown_starttag(self, tag, attrs):
|
| + attrs = self.unescape_attrs(attrs)
|
| + self._tokenstack.append(Token("starttag", tag, attrs))
|
| + def unknown_endtag(self, tag):
|
| + self._tokenstack.append(Token("endtag", tag))
|
| +
|
| +
|
| +def _test():
|
| + import doctest, _pullparser
|
| + return doctest.testmod(_pullparser)
|
| +
|
| +if __name__ == "__main__":
|
| + _test()
|
|
|