| Index: Tools/Scripts/webkitpy/thirdparty/mechanize/_html.py
|
| diff --git a/Tools/Scripts/webkitpy/thirdparty/mechanize/_html.py b/Tools/Scripts/webkitpy/thirdparty/mechanize/_html.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..1a4e2c0281571b8c7bbc7da8dcb377835adbb091
|
| --- /dev/null
|
| +++ b/Tools/Scripts/webkitpy/thirdparty/mechanize/_html.py
|
| @@ -0,0 +1,629 @@
|
| +"""HTML handling.
|
| +
|
| +Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
| +
|
| +This code is free software; you can redistribute it and/or modify it under
|
| +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
| +included with the distribution).
|
| +
|
| +"""
|
| +
|
| +import codecs
|
| +import copy
|
| +import htmlentitydefs
|
| +import re
|
| +
|
| +import _sgmllib_copy as sgmllib
|
| +
|
| +import _beautifulsoup
|
| +import _form
|
| +from _headersutil import split_header_words, is_html as _is_html
|
| +import _request
|
| +import _rfc3986
|
| +
|
| +DEFAULT_ENCODING = "latin-1"
|
| +
|
| +COMPRESS_RE = re.compile(r"\s+")
|
| +
|
| +
|
class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    Each call returns a generator that first replays every item seen so
    far, then continues consuming the shared underlying iterator, so the
    wrapped iterable is exhausted at most once while every call observes
    the full sequence.
    """

    def __init__(self, iterable):
        self._cache = []
        # Hold one shared iterator: a restartable iterable would make
        # repeated __call__ results incorrect (items seen twice).
        self._iterator = iter(iterable)

    def __call__(self):
        seen = self._cache
        # Replay previously consumed items first...
        for cached_item in seen:
            yield cached_item
        # ...then pull fresh ones, remembering each for later calls.
        for fresh_item in self._iterator:
            seen.append(fresh_item)
            yield fresh_item
|
| +
|
| +
|
class EncodingFinder:
    """Determines the character encoding declared by a response.

    Falls back to a fixed default encoding when no usable charset is
    declared in any content-type header.
    """

    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for content_type in response.info().getheaders("content-type"):
            for name, value in split_header_words([content_type])[0]:
                if name != "charset":
                    continue
                try:
                    codecs.lookup(value)
                except LookupError:
                    # Not a codec Python knows about; keep scanning for
                    # a later, usable charset declaration.
                    continue
                return value
        return self._default_encoding
|
| +
|
| +
|
class ResponseTypeFinder:
    """Decides whether a response should be treated as HTML."""

    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        # XXX encoding
        content_types = response.info().getheaders("content-type")
        return _is_html(content_types, response.geturl(), self._allow_xhtml)
|
| +
|
| +
|
class Args(object):
    """Attribute-style access over a dict of keyword arguments.

    (The idea for this argument-processing trick is from Peter Otten.)
    Reads fall back to class attributes; writes go into the backing
    dict, which itself can never be rebound.
    """

    def __init__(self, args_map):
        # Write straight into the instance dict to sidestep our own
        # __setattr__, which forbids assigning to "dictionary".
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        dictionary = self.__dict__["dictionary"]
        if key in dictionary:
            return dictionary[key]
        # Mirror normal lookup by falling back to class attributes
        # (raises AttributeError if absent, as usual).
        return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        if key == "dictionary":
            # The backing dict must stay the one created in __init__.
            raise AttributeError()
        self.dictionary[key] = value
|
| +
|
| +
|
def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    """Bundle FormsFactory constructor arguments into an Args object."""
    return Args({
        "select_default": select_default,
        "form_parser_class": form_parser_class,
        "request_class": request_class,
        "backwards_compat": backwards_compat,
        })
|
| +
|
| +
|
class Link:
    """A hyperlink extracted from an HTML document."""

    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        # Resolve the (possibly relative) URL against the document base.
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url = url
        self.text = text
        self.tag = tag
        self.attrs = attrs

    def __cmp__(self, other):
        # Links compare equal iff url, text, tag and attrs all match; any
        # mismatch -- or a missing attribute on other -- orders self first.
        try:
            for name in "url", "text", "tag", "attrs":
                if getattr(self, name) != getattr(other, name):
                    return -1
        except AttributeError:
            return -1
        return 0

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
|
| +
|
| +
|
| +class LinksFactory:
|
| +
|
| + def __init__(self,
|
| + link_parser_class=None,
|
| + link_class=Link,
|
| + urltags=None,
|
| + ):
|
| + import _pullparser
|
| + if link_parser_class is None:
|
| + link_parser_class = _pullparser.TolerantPullParser
|
| + self.link_parser_class = link_parser_class
|
| + self.link_class = link_class
|
| + if urltags is None:
|
| + urltags = {
|
| + "a": "href",
|
| + "area": "href",
|
| + "frame": "src",
|
| + "iframe": "src",
|
| + }
|
| + self.urltags = urltags
|
| + self._response = None
|
| + self._encoding = None
|
| +
|
| + def set_response(self, response, base_url, encoding):
|
| + self._response = response
|
| + self._encoding = encoding
|
| + self._base_url = base_url
|
| +
|
| + def links(self):
|
| + """Return an iterator that provides links of the document."""
|
| + response = self._response
|
| + encoding = self._encoding
|
| + base_url = self._base_url
|
| + p = self.link_parser_class(response, encoding=encoding)
|
| +
|
| + try:
|
| + for token in p.tags(*(self.urltags.keys()+["base"])):
|
| + if token.type == "endtag":
|
| + continue
|
| + if token.data == "base":
|
| + base_href = dict(token.attrs).get("href")
|
| + if base_href is not None:
|
| + base_url = base_href
|
| + continue
|
| + attrs = dict(token.attrs)
|
| + tag = token.data
|
| + text = None
|
| + # XXX use attr_encoding for ref'd doc if that doc does not
|
| + # provide one by other means
|
| + #attr_encoding = attrs.get("charset")
|
| + url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
|
| + if not url:
|
| + # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
|
| + # For our purposes a link is something with a URL, so
|
| + # ignore this.
|
| + continue
|
| +
|
| + url = _rfc3986.clean_url(url, encoding)
|
| + if tag == "a":
|
| + if token.type != "startendtag":
|
| + # hmm, this'd break if end tag is missing
|
| + text = p.get_compressed_text(("endtag", tag))
|
| + # but this doesn't work for e.g.
|
| + # <a href="blah"><b>Andy</b></a>
|
| + #text = p.get_compressed_text()
|
| +
|
| + yield Link(base_url, url, text, tag, token.attrs)
|
| + except sgmllib.SGMLParseError, exc:
|
| + raise _form.ParseError(exc)
|
| +
|
class FormsFactory:

    """Makes a sequence of objects satisfying HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ParseResponse argument docs.
    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        if form_parser_class is None:
            form_parser_class = _form.FormParser
        if request_class is None:
            request_class = _request.Request
        self.select_default = select_default
        self.form_parser_class = form_parser_class
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding
        # Forget any global form computed for a previous response.
        self.global_form = None

    def forms(self):
        """Parse the response and return its forms.

        Side effect: stores the synthetic "global form" (controls not
        inside any FORM element) on .global_form; the returned sequence
        contains only the real forms.
        """
        all_forms = _form.ParseResponseEx(
            self._response,
            select_default=self.select_default,
            form_parser_class=self.form_parser_class,
            request_class=self.request_class,
            encoding=self.encoding,
            _urljoin=_rfc3986.urljoin,
            _urlparse=_rfc3986.urlsplit,
            _urlunparse=_rfc3986.urlunsplit,
            )
        # ParseResponseEx puts the global form first.
        self.global_form = all_forms[0]
        return all_forms[1:]
|
| +
|
class TitleFactory:
    """Extracts the page title (text content of TITLE) from a response."""

    def __init__(self):
        # Both are supplied later via set_response().
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        # Collect tokens up to the matching </title>.  Entity and
        # character references are unescaped; any other markup tokens
        # encountered are kept verbatim in the result.
        import _pullparser
        text = []
        tok = None
        while 1:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                # Document ended before </title>; use what we have.
                break
            if tok.type == "data":
                text.append(str(tok))
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data,
                             parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type == "endtag" and tag_name == "title":
                    break
                text.append(str(tok))
        # Collapse whitespace runs to single spaces.
        return COMPRESS_RE.sub(" ", "".join(text).strip())

    def title(self):
        # Returns the title text, or None if there is no TITLE element.
        # Raises _form.ParseError on malformed HTML.
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                p.get_tag("title")
            except _pullparser.NoMoreTokensError:
                # No TITLE tag anywhere in the document.
                return None
            else:
                return self._get_title_text(p)
        except sgmllib.SGMLParseError, exc:
            raise _form.ParseError(exc)
|
| +
|
| +
|
def unescape(data, entities, encoding):
    """Replace HTML entity and character references in data.

    entities maps entity names (e.g. "amp") to code points.  Resolved
    characters are encoded to encoding where possible; references that
    cannot be resolved or encoded are left in place unchanged.  Returns
    data as-is when it is None or contains no "&".
    """
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            # Numeric character reference, e.g. &#65; or &#x41;.
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            # If unichr gave a unicode object (i.e. not a byte string),
            # try to encode it down to the target encoding.
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    # Not representable in this encoding; keep the
                    # original reference text.
                    repl = ent
        else:
            # Unknown entity name; keep the original reference text.
            repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
|
| +
|
def unescape_charref(data, encoding):
    """Resolve a numeric character reference body (e.g. "65" or "x41").

    Returns the referenced character encoded to encoding; if it cannot
    be represented there, returns the original "&#...;" reference text.
    With encoding None the unicode character itself is returned.
    """
    if data.startswith("x"):
        # Hexadecimal form: &#xNN;
        codepoint = int(data[1:], 16)
    else:
        codepoint = int(data, 10)
    uc = unichr(codepoint)
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
|
| +
|
| +
|
class MechanizeBs(_beautifulsoup.BeautifulSoup):
    """BeautifulSoup subclass aware of the document's character encoding.

    Entity and character references are unescaped via unescape() using
    that encoding, and attribute values can be unescaped on demand.
    """

    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [
        (re.compile('(<[^<>]*)/>'),
         lambda match: match.group(1) + ' />'),
        (re.compile('<!\s+([^<>]*)>'),
         lambda match: '<!' + match.group(1) + '>'),
        ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        # Numeric reference: resolve and feed through as character data.
        self.handle_data(
            unescape("&#%s;" % ref, self._entitydefs, self._encoding))

    def handle_entityref(self, ref):
        # Named reference: resolve and feed through as character data.
        self.handle_data(
            unescape("&%s;" % ref, self._entitydefs, self._encoding))

    def unescape_attrs(self, attrs):
        unescaped = []
        for key, val in attrs:
            unescaped.append(
                (key, unescape(val, self._entitydefs, self._encoding)))
        return unescaped
|
| +
|
class RobustLinksFactory:
    """Extracts Link objects from a BeautifulSoup parse tree.

    BeautifulSoup-based counterpart of LinksFactory.
    """

    # Used to collapse whitespace runs in link text.
    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            # Maps link-bearing tag name -> attribute holding its URL.
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        # Generator over Link objects found anywhere in the soup.
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    # A BASE element rebases all subsequent links.
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    # No URL (e.g. <A NAME="blah">): not a link for our
                    # purposes, so skip it.
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                yield Link(base_url, url, text, link.name, attrs)
|
| +
|
| +
|
class RobustFormsFactory(FormsFactory):
    """FormsFactory variant defaulting to the robust (BS-based) parser."""

    def __init__(self, *args, **kwds):
        parsed = form_parser_args(*args, **kwds)
        if parsed.form_parser_class is None:
            parsed.form_parser_class = _form.RobustFormParser
        FormsFactory.__init__(self, **parsed.dictionary)

    def set_response(self, response, encoding):
        # Note: unlike the base class, does not reset .global_form here.
        self._response = response
        self.encoding = encoding
|
| +
|
| +
|
class RobustTitleFactory:
    """Extracts the page title from a BeautifulSoup parse tree."""

    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        title = self._bs.first("title")
        if title == _beautifulsoup.Null:
            # No TITLE element in the document.
            return None
        # Serialize the element's children and collapse whitespace runs.
        inner_html = "".join(map(str, title.contents))
        return COMPRESS_RE.sub(" ", inner_html.strip())
|
| +
|
| +
|
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    # Attributes computed on demand by __getattr__ and then cached on
    # the instance; set_response() deletes them so they get recomputed.
    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
        determined (or guessed) from the response.  You should turn on
        HTTP-EQUIV handling if you want the best chance of getting this right
        without resorting to this default.  The default value of this
        parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set request class (mechanize.Request by default).

        HTMLForm instances returned by .forms() will return instances of this
        class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by mechanize.urlopen().

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        # Drop any cached lazy attributes so __getattr__ recomputes them
        # for the new response.
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        # Only the lazy attributes are computed here; anything else is
        # resolved against the class, mirroring normal lookup.
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        # Each branch stores the computed value as an instance attribute,
        # so __getattr__ runs at most once per attribute per response.
        # The response is copied before being consumed so the caller's
        # response object keeps its read position.
        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            # forms() sets .global_form as a side effect.
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                # Reset to a clean state before propagating the error.
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                # Reset to a clean state before propagating the error.
                self.set_response(self._response)
                raise
        return self._links_genf()
|
| +
|
class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is None:
            return
        # Hand each sub-factory its own copy of the response so their
        # reads don't disturb one another's positions.
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_response(
            copy.copy(response), response.geturl(), self.encoding)
        self._title_factory.set_response(
            copy.copy(response), self.encoding)
|
| +
|
class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        self._soup_class = MechanizeBs if soup_class is None else soup_class

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is None:
            return
        # Parse the whole document once and share the soup between the
        # link and title factories; forms get a fresh response copy.
        data = response.read()
        soup = self._soup_class(self.encoding, data)
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_soup(
            soup, response.geturl(), self.encoding)
        self._title_factory.set_soup(soup, self.encoding)
|
|
|