Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1427)

Unified Diff: Tools/Scripts/webkitpy/thirdparty/mechanize/_html.py

Issue 18418010: Check in the thirdparty libs needed for webkitpy. (Closed) Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: Tools/Scripts/webkitpy/thirdparty/mechanize/_html.py
diff --git a/Tools/Scripts/webkitpy/thirdparty/mechanize/_html.py b/Tools/Scripts/webkitpy/thirdparty/mechanize/_html.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a4e2c0281571b8c7bbc7da8dcb377835adbb091
--- /dev/null
+++ b/Tools/Scripts/webkitpy/thirdparty/mechanize/_html.py
@@ -0,0 +1,629 @@
+"""HTML handling.
+
+Copyright 2003-2006 John J. Lee <jjl@pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+import codecs
+import copy
+import htmlentitydefs
+import re
+
+import _sgmllib_copy as sgmllib
+
+import _beautifulsoup
+import _form
+from _headersutil import split_header_words, is_html as _is_html
+import _request
+import _rfc3986
+
+DEFAULT_ENCODING = "latin-1"
+
+COMPRESS_RE = re.compile(r"\s+")
+
+
class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    Each call returns a generator producing the same item sequence: items
    seen before are replayed from an internal cache, and further items are
    pulled (exactly once) from the underlying iterator, being cached as
    they appear.
    """

    def __init__(self, iterable):
        self._cache = []
        # Drain through a single shared iterator so that repeated
        # __call__s cannot restart a restartable iterable and thereby
        # yield duplicate items.
        self._iterator = iter(iterable)

    def __call__(self):
        # First replay everything cached so far...
        for cached_item in self._cache:
            yield cached_item
        # ...then continue consuming the shared iterator, caching each new
        # item for later callers.
        for fresh_item in self._iterator:
            self._cache.append(fresh_item)
            yield fresh_item
+
+
class EncodingFinder:
    """Determines the character encoding of a response from its
    Content-Type headers, falling back to a configured default.
    """

    def __init__(self, default_encoding):
        # Used when no header supplies a charset Python recognises.
        self._default_encoding = default_encoding

    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for content_type in response.info().getheaders("content-type"):
            for key, value in split_header_words([content_type])[0]:
                if key != "charset":
                    continue
                # Only accept charsets Python actually knows how to decode.
                try:
                    codecs.lookup(value)
                except LookupError:
                    continue
                return value
        return self._default_encoding
+
+
class ResponseTypeFinder:
    """Decides whether a response contains an HTML document, based on its
    Content-Type headers and URL.
    """

    def __init__(self, allow_xhtml):
        # When true, XHTML content types also count as HTML.
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        # XXX encoding is currently unused by the header/URL-based check.
        content_types = response.info().getheaders("content-type")
        return _is_html(content_types, response.geturl(), self._allow_xhtml)
+
+
class Args(object):
    """Attribute-style view over a mutable dict of arguments.

    Reads and writes of ordinary attributes go through the underlying
    "dictionary" dict; the dict itself cannot be rebound.  (The
    argument-processing trick is borrowed from Peter Otten.)
    """

    def __init__(self, args_map):
        # Write via __dict__ directly: our own __setattr__ forbids
        # rebinding "dictionary".
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        # Names absent from the dict fall back to class attributes (and so
        # raise AttributeError for genuinely unknown names).
        store = self.dictionary
        if key in store:
            return store[key]
        return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        # "dictionary" itself is read-only; every other assignment is
        # stored in it.
        if key == "dictionary":
            raise AttributeError()
        self.dictionary[key] = value
+
+
def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    """Bundle the FormsFactory constructor arguments into an Args object,
    so subclasses can adjust defaults before forwarding them on.
    """
    return Args({
        "select_default": select_default,
        "form_parser_class": form_parser_class,
        "request_class": request_class,
        "backwards_compat": backwards_compat,
        })
+
+
class Link:
    """A hyperlink extracted from an HTML document.

    Holds the raw URL, the link text, the tag name it came from, its
    attribute list, and the absolute URL resolved against base_url.
    """

    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url = url
        self.text = text
        self.tag = tag
        self.attrs = attrs

    def __cmp__(self, other):
        # Equal (0) only when url, text, tag and attrs all match; any
        # mismatch -- or a non-Link other lacking those attributes --
        # compares unequal (-1).
        try:
            if (self.url == other.url and
                self.text == other.text and
                self.tag == other.tag and
                self.attrs == other.attrs):
                return 0
        except AttributeError:
            pass
        return -1

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
+
+
class LinksFactory:
    """Extracts links (anchors, areas, frames, iframes) from an HTML
    response by pull-parsing it with _pullparser.
    """

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        # link_parser_class: parser over the response; defaults to the
        #   error-tolerant pull parser.
        # link_class: NOTE(review): stored but never used -- .links()
        #   constructs Link directly.
        # urltags: maps tag name -> name of the attribute holding its URL.
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None

    def set_response(self, response, base_url, encoding):
        # base_url seeds relative-URL resolution; a <base href> in the
        # document overrides it during iteration.
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)

        try:
            # Also watch "base" tags, which rebind base_url mid-stream.
            # NOTE: dict.keys() + list concatenation is Python 2 only.
            for token in p.tags(*(self.urltags.keys()+["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                # provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for e.g.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError, exc:
            # Normalise parser failures to mechanize's ParseError.
            raise _form.ParseError(exc)
+
class FormsFactory:

    """Makes a sequence of objects satisfying HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ParseResponse argument docs.
    """

    def __init__(self, select_default=False, form_parser_class=None,
                 request_class=None, backwards_compat=False):
        # Resolve defaults lazily so _form/_request are only touched when
        # the caller did not supply replacements.
        if form_parser_class is None:
            form_parser_class = _form.FormParser
        if request_class is None:
            request_class = _request.Request
        self.select_default = select_default
        self.form_parser_class = form_parser_class
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        """Install a new response; forgets any previous global form."""
        self._response = response
        self.encoding = encoding
        self.global_form = None

    def forms(self):
        """Parse the response and return its forms.

        Sets .global_form as a side effect; the returned sequence holds
        the remaining (real FORM element) forms.
        """
        all_forms = _form.ParseResponseEx(
            self._response,
            select_default=self.select_default,
            form_parser_class=self.form_parser_class,
            request_class=self.request_class,
            encoding=self.encoding,
            _urljoin=_rfc3986.urljoin,
            _urlparse=_rfc3986.urlsplit,
            _urlunparse=_rfc3986.urlunsplit,
            )
        # ParseResponseEx always puts the synthetic "global form" first.
        self.global_form = all_forms[0]
        return all_forms[1:]
+
class TitleFactory:
    """Extracts the <title> text of an HTML response using _pullparser."""

    def __init__(self):
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        # Collects the title's content up to </title>, entity-decoded and
        # whitespace-compressed.  `parser` must be positioned just after
        # the <title> start tag (see .title()).
        import _pullparser
        text = []
        tok = None
        while 1:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                # document ended without </title>; use what we have
                break
            if tok.type == "data":
                text.append(str(tok))
            elif tok.type == "entityref":
                # decode named entities (&amp; etc.)
                t = unescape("&%s;" % tok.data,
                    parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == "charref":
                # decode numeric character references (&#NN; / &#xNN;)
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type == "endtag" and tag_name == "title":
                    break
                # nested markup inside <title> is kept verbatim
                text.append(str(tok))
        return COMPRESS_RE.sub(" ", "".join(text).strip())

    def title(self):
        """Return the page title, or None if there is no <title> element.

        Raises mechanize's ParseError on malformed HTML.
        """
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                p.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            else:
                return self._get_title_text(p)
        except sgmllib.SGMLParseError, exc:
            raise _form.ParseError(exc)
+
+
def unescape(data, entities, encoding):
    """Replace HTML entity and numeric character references in *data*.

    entities maps entity names to Unicode codepoints; encoding is the
    byte encoding used to re-encode decoded characters.  References that
    are unknown or cannot be encoded are left verbatim.  None passes
    through unchanged.
    """
    if data is None or "&" not in data:
        return data

    def _decode(match):
        reference = match.group()
        if reference[1] == "#":
            # numeric reference: &#NN; or &#xNN;
            return unescape_charref(reference[2:-1], encoding)
        codepoint = entities.get(reference[1:-1])
        if codepoint is None:
            # unknown entity name: leave untouched
            return reference
        replacement = unichr(codepoint)
        if not isinstance(replacement, str):
            try:
                replacement = replacement.encode(encoding)
            except UnicodeError:
                # not representable in target encoding: keep the reference
                replacement = reference
        return replacement

    return re.sub(r"&#?[A-Za-z0-9]+?;", _decode, data)
+
def unescape_charref(data, encoding):
    """Decode the body of a numeric character reference -- the NN of
    "&#NN;" or xNN of "&#xNN;" -- to a character.

    When *encoding* is given the character is re-encoded to bytes; if it
    cannot be represented, the literal "&#NN;" text is returned instead.
    """
    if data.startswith("x"):
        # hexadecimal form: strip the leading "x"
        codepoint = int(data[1:], 16)
    else:
        codepoint = int(data, 10)
    decoded = unichr(codepoint)
    if encoding is None:
        return decoded
    try:
        return decoded.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
+
+
class MechanizeBs(_beautifulsoup.BeautifulSoup):
    """BeautifulSoup subclass that decodes entity and character references
    to text in a known target encoding as it parses.
    """

    # entity name -> Unicode codepoint table used by unescape()
    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda(x):x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda(x):'<!' + x.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        # encoding: byte encoding used when re-encoding decoded references
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        # numeric reference (&#NN;) decoded to text in self._encoding
        t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
        self.handle_data(t)
    def handle_entityref(self, ref):
        # named reference (&name;) decoded to text in self._encoding
        t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
        self.handle_data(t)
    def unescape_attrs(self, attrs):
        # Returns (key, value) pairs with entity references in the values
        # decoded.  NOTE(review): the local name "escaped_attrs" is
        # misleading -- the values it holds are unescaped.
        escaped_attrs = []
        for key, val in attrs:
            val = unescape(val, self._entitydefs, self._encoding)
            escaped_attrs.append((key, val))
        return escaped_attrs
+
class RobustLinksFactory:
    """BeautifulSoup-based counterpart of LinksFactory: extracts links
    from an already-parsed soup tree rather than a raw response.
    """

    # collapses runs of whitespace in link text
    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        # link_parser_class: soup class (defaults to MechanizeBs).
        # link_class: NOTE(review): stored but never used -- .links()
        #   constructs Link directly.
        # urltags: maps tag name -> name of the attribute holding its URL.
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        # Takes a parsed soup tree; base_url seeds relative-URL state and
        # may be overridden by a <base href> during traversal.
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        """Yield a Link for each linking tag in document order."""
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        for ch in bs.recursiveChildGenerator():
            # NOTE: dict.keys() + list concatenation is Python 2 only.
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    # <base href> rebinds base_url for subsequent links
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    # tag without a URL (e.g. <a name=...>): not a link
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                yield Link(base_url, url, text, link.name, attrs)
+
+
class RobustFormsFactory(FormsFactory):
    """FormsFactory variant whose default parser is the BeautifulSoup-based
    RobustFormParser.
    """

    def __init__(self, *args, **kwds):
        parsed = form_parser_args(*args, **kwds)
        if parsed.form_parser_class is None:
            parsed.form_parser_class = _form.RobustFormParser
        FormsFactory.__init__(self, **parsed.dictionary)

    def set_response(self, response, encoding):
        # Unlike FormsFactory.set_response, does not reset .global_form.
        self._response = response
        self.encoding = encoding
+
+
class RobustTitleFactory:
    """Extracts the page title from a parsed BeautifulSoup tree."""

    def __init__(self):
        self._bs = None
        self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        """Return the whitespace-compressed inner HTML of the first
        <title> element, or None if the document has none.
        """
        node = self._bs.first("title")
        if node == _beautifulsoup.Null:
            return None
        # Serialise all child nodes (markup kept verbatim) and collapse
        # whitespace runs.
        inner_html = "".join([str(child) for child in node.contents])
        return COMPRESS_RE.sub(" ", inner_html.strip())
+
+
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
    document (this value is left unspecified for documents that do not have
    an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
    regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
    of any FORM element, or None if the forms_factory does not support
    supplying a global form

    """

    # Attributes computed on first access by __getattr__, then cached as
    # ordinary instance attributes; cleared again by set_response().
    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
        determined (or guessed) from the response.  You should turn on
        HTTP-EQUIV handling if you want the best chance of getting this right
        without resorting to this default.  The default value of this
        parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        # initialise the per-response state to "no response"
        self.set_response(None)

    def set_request_class(self, request_class):
        """Set request class (mechanize.Request by default).

        HTMLForm instances returned by .forms() will return instances of this
        class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by mechanize.urlopen().

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        # Drop any cached lazy attributes so the next access recomputes
        # them via __getattr__.
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                # attribute was never computed; nothing to clear
                pass

    def __getattr__(self, name):
        # Only called for attributes not currently set on the instance.
        # Lazy attributes are computed once and cached by plain assignment
        # (which bypasses this method on later reads); everything else is
        # delegated to the class.
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            # the response is copied, presumably so that reading it here
            # does not disturb the stored original -- TODO confirm
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                # non-HTML documents have no title
                self.title = None
            return self.title
        elif name == "global_form":
            # .forms() sets .global_form as a side effect
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except: # XXXX define exception!
                # reset cached state so a later call can retry cleanly
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except: # XXXX define exception!
                # reset cached state so a later call can retry cleanly
                self.set_response(self._response)
                raise
        return self._links_genf()
+
class DefaultFactory(Factory):
    """Factory built on the sgmllib-based parsers (pull-parser links and
    titles, FormParser forms).
    """

    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is None:
            return
        # Each sub-factory receives its own shallow copy of the response,
        # presumably so each can read it independently -- TODO confirm.
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_response(
            copy.copy(response), response.geturl(), self.encoding)
        self._title_factory.set_response(
            copy.copy(response), self.encoding)
+
class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than
    is DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        self._soup_class = soup_class if soup_class is not None else MechanizeBs

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is None:
            return
        # Parse the whole body once; links and title share the soup tree,
        # while forms re-read a copy of the response themselves.
        data = response.read()
        soup = self._soup_class(self.encoding, data)
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_soup(
            soup, response.geturl(), self.encoding)
        self._title_factory.set_soup(soup, self.encoding)
« no previous file with comments | « Tools/Scripts/webkitpy/thirdparty/mechanize/_headersutil.py ('k') | Tools/Scripts/webkitpy/thirdparty/mechanize/_http.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698