| Index: Tools/Scripts/webkitpy/thirdparty/mechanize/_pullparser.py
|
| diff --git a/Tools/Scripts/webkitpy/thirdparty/mechanize/_pullparser.py b/Tools/Scripts/webkitpy/thirdparty/mechanize/_pullparser.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..1f212c1512bacd2d3ef95e51ba9d578ce0adfbf5
|
| --- /dev/null
|
| +++ b/Tools/Scripts/webkitpy/thirdparty/mechanize/_pullparser.py
|
| @@ -0,0 +1,391 @@
|
| +"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
|
| +
|
| +Examples
|
| +
|
| +This program extracts all links from a document. It will print one
|
| +line for each link, containing the URL and the textual description
|
| +between the <A>...</A> tags:
|
| +
|
| +import pullparser, sys
|
| +f = file(sys.argv[1])
|
| +p = pullparser.PullParser(f)
|
| +for token in p.tags("a"):
|
| + if token.type == "endtag": continue
|
| + url = dict(token.attrs).get("href", "-")
|
| + text = p.get_compressed_text(endat=("endtag", "a"))
|
| + print "%s\t%s" % (url, text)
|
| +
|
| +This program extracts the <TITLE> from the document:
|
| +
|
| +import pullparser, sys
|
| +f = file(sys.argv[1])
|
| +p = pullparser.PullParser(f)
|
| +if p.get_tag("title"):
|
| + title = p.get_compressed_text()
|
| + print "Title: %s" % title
|
| +
|
| +
|
| +Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
| +Copyright 1998-2001 Gisle Aas (original libwww-perl code)
|
| +
|
| +This code is free software; you can redistribute it and/or modify it
|
| +under the terms of the BSD or ZPL 2.1 licenses.
|
| +
|
| +"""
|
| +
|
| +import re, htmlentitydefs
|
| +import _sgmllib_copy as sgmllib
|
| +import HTMLParser
|
| +from xml.sax import saxutils
|
| +
|
| +from _html import unescape, unescape_charref
|
| +
|
| +
|
| +class NoMoreTokensError(Exception): pass
|
| +
|
| +class Token:
|
| + """Represents an HTML tag, declaration, processing instruction etc.
|
| +
|
| + Behaves as a tuple-like object (i.e. it is iterable) and also exposes the
|
| + attributes .type, .data and .attrs.
|
| +
|
| + >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
|
| + >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
|
| + True
|
| + >>> (t.type, t.data) == ("starttag", "a")
|
| + True
|
| + >>> t.attrs == [("href", "http://www.python.org/")]
|
| + True
|
| +
|
| + Public attributes
|
| +
|
| + type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
|
| + "data", "comment", "decl", "pi", after the corresponding methods of
|
| + HTMLParser.HTMLParser
|
| + data: For a tag, the tag name; otherwise, the relevant data carried by the
|
| + token, as a string
|
| + attrs: list of (name, value) pairs representing HTML attributes
|
| + (or None if token does not represent an opening tag)
|
| +
|
| + """
|
| + def __init__(self, type, data, attrs=None):
|
| + self.type = type
|
| + self.data = data
|
| + self.attrs = attrs
|
| + def __iter__(self):
|
| + return iter((self.type, self.data, self.attrs))
|
| + def __eq__(self, other):
|
| + type, data, attrs = other
|
| + if (self.type == type and
|
| + self.data == data and
|
| + self.attrs == attrs):
|
| + return True
|
| + else:
|
| + return False
|
| + def __ne__(self, other): return not self.__eq__(other)
|
| + def __repr__(self):
|
| + args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
|
| + return self.__class__.__name__+"(%s)" % args
|
| +
|
| + def __str__(self):
|
| + """
|
| + >>> print Token("starttag", "br")
|
| + <br>
|
| + >>> print Token("starttag", "a",
|
| + ... [("href", "http://www.python.org/"), ("alt", '"foo"')])
|
| + <a href="http://www.python.org/" alt='"foo"'>
|
| + >>> print Token("startendtag", "br")
|
| + <br />
|
| + >>> print Token("startendtag", "br", [("spam", "eggs")])
|
| + <br spam="eggs" />
|
| + >>> print Token("endtag", "p")
|
| + </p>
|
| + >>> print Token("charref", "38")
|
| + &#38;
|
| + >>> print Token("entityref", "amp")
|
| + &amp;
|
| + >>> print Token("data", "foo\\nbar")
|
| + foo
|
| + bar
|
| + >>> print Token("comment", "Life is a bowl\\nof cherries.")
|
| + <!--Life is a bowl
|
| + of cherries.-->
|
| + >>> print Token("decl", "decl")
|
| + <!decl>
|
| + >>> print Token("pi", "pi")
|
| + <?pi>
|
| + """
|
| + if self.attrs is not None:
|
| + attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
|
| + k, v in self.attrs])
|
| + else:
|
| + attrs = ""
|
| + if self.type == "starttag":
|
| + return "<%s%s>" % (self.data, attrs)
|
| + elif self.type == "startendtag":
|
| + return "<%s%s />" % (self.data, attrs)
|
| + elif self.type == "endtag":
|
| + return "</%s>" % self.data
|
| + elif self.type == "charref":
|
| + return "&#%s;" % self.data
|
| + elif self.type == "entityref":
|
| + return "&%s;" % self.data
|
| + elif self.type == "data":
|
| + return self.data
|
| + elif self.type == "comment":
|
| + return "<!--%s-->" % self.data
|
| + elif self.type == "decl":
|
| + return "<!%s>" % self.data
|
| + elif self.type == "pi":
|
| + return "<?%s>" % self.data
|
| + assert False
|
| +
|
| +
|
| +def iter_until_exception(fn, exception, *args, **kwds):
|
| + while 1:
|
| + try:
|
| + yield fn(*args, **kwds)
|
| + except exception:
|
| + raise StopIteration
|
| +
|
| +
|
| +class _AbstractParser:
|
| + chunk = 1024
|
| + compress_re = re.compile(r"\s+")
|
| + def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
|
| + encoding="ascii", entitydefs=None):
|
| + """
|
| + fh: file-like object (only a .read() method is required) from which to
|
| + read HTML to be parsed
|
| + textify: mapping used by .get_text() and .get_compressed_text() methods
|
| + to represent opening tags as text
|
| + encoding: encoding used to encode numeric character references by
|
| + .get_text() and .get_compressed_text() ("ascii" by default)
|
| +
|
| + entitydefs: mapping like {"amp": "&", ...} containing HTML entity
|
| + definitions (a sensible default is used). This is used to unescape
|
| + entities in .get_text() (and .get_compressed_text()) and attribute
|
| + values. If the encoding can not represent the character, the entity
|
| + reference is left unescaped. Note that entity references (both
|
| + numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
|
| + unescaped in attribute values and the return value of .get_text(), but
|
| + not in data outside of tags. Instead, entity references outside of
|
| + tags are represented as tokens. This is a bit odd, it's true :-/
|
| +
|
| + If the element name of an opening tag matches a key in the textify
|
| + mapping then that tag is converted to text. The corresponding value is
|
| + used to specify which tag attribute to obtain the text from. textify
|
| + maps from element names to either:
|
| +
|
| + - an HTML attribute name, in which case the HTML attribute value is
|
| + used as its text value along with the element name in square
|
| + brackets (e.g. "alt text goes here[IMG]", or, if the alt attribute
|
| + were missing, just "[IMG]")
|
| + - a callable object (e.g. a function) which takes a Token and returns
|
| + the string to be used as its text value
|
| +
|
| + If textify has no key for an element name, nothing is substituted for
|
| + the opening tag.
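|
| +
|
| + For example, this sketch (StringIO stands in here for a real file-like
|
| + object; the applet entry demonstrates the callable form) substitutes alt
|
| + text for images and a fixed marker for applets:
|
| +
|
| + import StringIO
|
| + html = '<p><img alt="logo"> spam <applet></applet></p>'
|
| + p = PullParser(StringIO.StringIO(html),
|
| +               textify={"img": "alt",
|
| +                        "applet": lambda token: "[APPLET]"})
|
| + p.get_tag("p")
|
| + print p.get_compressed_text(endat=("endtag", "p"))
|
| + # prints: logo[IMG] spam [APPLET]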
|
| +
|
| + Public attributes:
|
| +
|
| + encoding and textify: see above
|
| +
|
| + """
|
| + self._fh = fh
|
| + self._tokenstack = [] # FIFO
|
| + self.textify = textify
|
| + self.encoding = encoding
|
| + if entitydefs is None:
|
| + entitydefs = htmlentitydefs.name2codepoint
|
| + self._entitydefs = entitydefs
|
| +
|
| + def __iter__(self): return self
|
| +
|
| + def tags(self, *names):
|
| + return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
|
| +
|
| + def tokens(self, *tokentypes):
|
| + return iter_until_exception(self.get_token, NoMoreTokensError,
|
| + *tokentypes)
|
| +
|
| + def next(self):
|
| + try:
|
| + return self.get_token()
|
| + except NoMoreTokensError:
|
| + raise StopIteration()
|
| +
|
| + def get_token(self, *tokentypes):
|
| + """Pop the next Token object from the stack of parsed tokens.
|
| +
|
| + If arguments are given, they are taken to be token types in which the
|
| + caller is interested: tokens of other types will be skipped.  Token types
|
| + are lower-case strings such as "starttag", "endtag" or "data".
|
| +
|
| + Raises NoMoreTokensError.
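|
| +
|
| + A sketch (StringIO stands in here for a real file-like object):
|
| +
|
| + import StringIO
|
| + p = PullParser(StringIO.StringIO("<p>spam</p>"))
|
| + p.get_token("starttag").data   # -> "p"
|
| + p.get_token("data").data       # -> "spam"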
|
| +
|
| + """
|
| + while 1:
|
| + while self._tokenstack:
|
| + token = self._tokenstack.pop(0)
|
| + if tokentypes:
|
| + if token.type in tokentypes:
|
| + return token
|
| + else:
|
| + return token
|
| + data = self._fh.read(self.chunk)
|
| + if not data:
|
| + raise NoMoreTokensError()
|
| + self.feed(data)
|
| +
|
| + def unget_token(self, token):
|
| + """Push a Token back onto the stack."""
|
| + self._tokenstack.insert(0, token)
|
| +
|
| + def get_tag(self, *names):
|
| + """Return the next Token that represents an opening or closing tag.
|
| +
|
| + If arguments are given, they are taken to be element names in which the
|
| + caller is interested: tags representing other elements will be skipped.
|
| + Element names must be given in lower case.
|
| +
|
| + Raises NoMoreTokensError.
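|
| +
|
| + A sketch (again using StringIO as a stand-in for a real file-like
|
| + object):
|
| +
|
| + import StringIO
|
| + p = PullParser(StringIO.StringIO("<h1>Hi</h1><p>spam</p>"))
|
| + tok = p.get_tag("p")    # skips past the whole <h1> element
|
| + (tok.type, tok.data)    # -> ("starttag", "p")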
|
| +
|
| + """
|
| + while 1:
|
| + tok = self.get_token()
|
| + if tok.type not in ["starttag", "endtag", "startendtag"]:
|
| + continue
|
| + if names:
|
| + if tok.data in names:
|
| + return tok
|
| + else:
|
| + return tok
|
| +
|
| + def get_text(self, endat=None):
|
| + """Get some text.
|
| +
|
| + endat: stop reading text at this tag (the tag is included in the
|
| + returned text); endat is a tuple (type, name) where type is
|
| + "starttag", "endtag" or "startendtag", and name is the element name of
|
| + the tag (element names must be given in lower case)
|
| +
|
| + If endat is not given, .get_text() will stop at the next opening or
|
| + closing tag, or when there are no more tokens (no exception is raised).
|
| + Note that .get_text() includes the text representation (if any) of the
|
| + opening tag, but pushes the opening tag back onto the stack. As a
|
| + result, if you want to call .get_text() again, you need to call
|
| + .get_tag() first (unless you want an empty string returned when you
|
| + next call .get_text()).
|
| +
|
| + Entity references are translated using the value of the entitydefs
|
| + constructor argument (a mapping from names to characters like that
|
| + provided by the standard module htmlentitydefs). Named entity
|
| + references that are not in this mapping are left unchanged.
|
| +
|
| + The textify attribute is used to translate opening tags into text: see
|
| + the class docstring.
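|
| +
|
| + A sketch (StringIO stands in here for a real file-like object):
|
| +
|
| + import StringIO
|
| + p = PullParser(StringIO.StringIO("<p>spam &amp; eggs</p>"))
|
| + p.get_tag("p")
|
| + p.get_text(endat=("endtag", "p"))   # -> "spam & eggs"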
|
| +
|
| + """
|
| + text = []
|
| + tok = None
|
| + while 1:
|
| + try:
|
| + tok = self.get_token()
|
| + except NoMoreTokensError:
|
| + # unget last token (not the one we just failed to get)
|
| + if tok: self.unget_token(tok)
|
| + break
|
| + if tok.type == "data":
|
| + text.append(tok.data)
|
| + elif tok.type == "entityref":
|
| + t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
|
| + text.append(t)
|
| + elif tok.type == "charref":
|
| + t = unescape_charref(tok.data, self.encoding)
|
| + text.append(t)
|
| + elif tok.type in ["starttag", "endtag", "startendtag"]:
|
| + tag_name = tok.data
|
| + if tok.type in ["starttag", "startendtag"]:
|
| + alt = self.textify.get(tag_name)
|
| + if alt is not None:
|
| + if callable(alt):
|
| + text.append(alt(tok))
|
| + elif tok.attrs is not None:
|
| + for k, v in tok.attrs:
|
| + if k == alt:
|
| + text.append(v)
|
| + text.append("[%s]" % tag_name.upper())
|
| + if endat is None or endat == (tok.type, tag_name):
|
| + self.unget_token(tok)
|
| + break
|
| + return "".join(text)
|
| +
|
| + def get_compressed_text(self, *args, **kwds):
|
| + """
|
| + As .get_text(), but collapses each group of contiguous whitespace to a
|
| + single space character, and removes all initial and trailing
|
| + whitespace.
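|
| +
|
| + For example (a sketch): for the markup "<p>  spam   eggs  </p>", calling
|
| + .get_tag("p") followed by .get_compressed_text(endat=("endtag", "p"))
|
| + returns "spam eggs".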
|
| +
|
| + """
|
| + text = self.get_text(*args, **kwds)
|
| + text = text.strip()
|
| + return self.compress_re.sub(" ", text)
|
| +
|
| + def handle_startendtag(self, tag, attrs):
|
| + self._tokenstack.append(Token("startendtag", tag, attrs))
|
| + def handle_starttag(self, tag, attrs):
|
| + self._tokenstack.append(Token("starttag", tag, attrs))
|
| + def handle_endtag(self, tag):
|
| + self._tokenstack.append(Token("endtag", tag))
|
| + def handle_charref(self, name):
|
| + self._tokenstack.append(Token("charref", name))
|
| + def handle_entityref(self, name):
|
| + self._tokenstack.append(Token("entityref", name))
|
| + def handle_data(self, data):
|
| + self._tokenstack.append(Token("data", data))
|
| + def handle_comment(self, data):
|
| + self._tokenstack.append(Token("comment", data))
|
| + def handle_decl(self, decl):
|
| + self._tokenstack.append(Token("decl", decl))
|
| + def unknown_decl(self, data):
|
| + # XXX should this call self.error instead?
|
| + #self.error("unknown declaration: " + `data`)
|
| + self._tokenstack.append(Token("decl", data))
|
| + def handle_pi(self, data):
|
| + self._tokenstack.append(Token("pi", data))
|
| +
|
| + def unescape_attr(self, name):
|
| + return unescape(name, self._entitydefs, self.encoding)
|
| + def unescape_attrs(self, attrs):
|
| + escaped_attrs = []
|
| + for key, val in attrs:
|
| + escaped_attrs.append((key, self.unescape_attr(val)))
|
| + return escaped_attrs
|
| +
|
| +class PullParser(_AbstractParser, HTMLParser.HTMLParser):
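|
| + """Pull parser based on the strict HTMLParser.HTMLParser.
|
| +
|
| + May raise HTMLParser.HTMLParseError on malformed markup; see
|
| + TolerantPullParser below for a more forgiving alternative.
|
| +
|
| + """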
|
| + def __init__(self, *args, **kwds):
|
| + HTMLParser.HTMLParser.__init__(self)
|
| + _AbstractParser.__init__(self, *args, **kwds)
|
| + def unescape(self, name):
|
| + # Use the entitydefs passed into constructor, not
|
| + # HTMLParser.HTMLParser's entitydefs.
|
| + return self.unescape_attr(name)
|
| +
|
| +class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
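|
| + """Pull parser based on sgmllib.SGMLParser: tolerant of sloppy HTML.
|
| +
|
| + Unlike PullParser, it recovers from most malformed markup rather than
|
| + raising an exception.  A sketch (StringIO stands in here for a real
|
| + file-like object):
|
| +
|
| + import StringIO
|
| + p = TolerantPullParser(StringIO.StringIO("<p>unclosed <b>bold"))
|
| + p.get_tag("p")
|
| + print p.get_compressed_text(endat=("endtag", "p"))
|
| + # prints: unclosed bold
|
| +
|
| + """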
|
| + def __init__(self, *args, **kwds):
|
| + sgmllib.SGMLParser.__init__(self)
|
| + _AbstractParser.__init__(self, *args, **kwds)
|
| + def unknown_starttag(self, tag, attrs):
|
| + attrs = self.unescape_attrs(attrs)
|
| + self._tokenstack.append(Token("starttag", tag, attrs))
|
| + def unknown_endtag(self, tag):
|
| + self._tokenstack.append(Token("endtag", tag))
|
| +
|
| +
|
| +def _test():
|
| + import doctest, _pullparser
|
| + return doctest.testmod(_pullparser)
|
| +
|
| +if __name__ == "__main__":
|
| + _test()
|
|
|