| Index: Tools/Scripts/webkitpy/thirdparty/mechanize/_rfc3986.py
|
| diff --git a/Tools/Scripts/webkitpy/thirdparty/mechanize/_rfc3986.py b/Tools/Scripts/webkitpy/thirdparty/mechanize/_rfc3986.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..91fcd197f4ec173ef00c5e05555c7936d7fbc4ed
|
| --- /dev/null
|
| +++ b/Tools/Scripts/webkitpy/thirdparty/mechanize/_rfc3986.py
|
| @@ -0,0 +1,245 @@
|
| +"""RFC 3986 URI parsing and relative reference resolution / absolutization.
|
| +
|
| +(aka splitting and joining)
|
| +
|
| +Copyright 2006 John J. Lee <jjl@pobox.com>
|
| +
|
| +This code is free software; you can redistribute it and/or modify it under
|
| +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
| +included with the distribution).
|
| +
|
| +"""
|
| +
|
| +# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
|
| +
|
| +import re, urllib
|
| +
|
| +## def chr_range(a, b):
|
| +## return "".join(map(chr, range(ord(a), ord(b)+1)))
|
| +
|
| +## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
| +## "abcdefghijklmnopqrstuvwxyz"
|
| +## "0123456789"
|
| +## "-_.~")
|
| +## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
|
| +## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
|
# Matches any single character that is NOT legal in a URI, i.e. anything
# outside the RFC 3986 reserved set, the unreserved set, and "%".
# Written as a raw string so that "\-" and "\]" reach the regex engine
# verbatim instead of relying on Python passing unknown string escapes
# through unchanged (which newer interpreters warn about).
BAD_URI_CHARS_RE = re.compile(r"[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
|
| +
|
| +
|
def clean_url(url, encoding):
    """Percent-encode the characters of url that are illegal in a URI.

    url is either a bytestring or a unicode string.  A bytestring is first
    decoded using encoding (undecodable bytes are replaced); the text is then
    stripped of surrounding whitespace, encoded again with encoding and
    quoted.

    Characters in the RFC 3986 reserved and unreserved sets, and "%", are
    left untouched, so percent-escapes already present are not re-encoded.
    """
    # Trying to come up with test cases for this gave me a headache, revisit
    # when we do switch to unicode.
    # Somebody else's comments (lost the attribution):
##   - IE will return you the url in the encoding you send it
##   - Mozilla/Firefox will send you latin-1 if there's no non latin-1
##   characters in your link. It will send you utf-8 however if there are...

    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3;
    # import it locally so this function works on either.
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    # isinstance() rather than an exact type comparison, so that subclasses
    # of the bytestring type are decoded too ("bytes" is "str" on Python 2).
    if isinstance(url, bytes):
        url = url.decode(encoding, "replace")
    url = url.strip()
    # for second param to quote(), we want URI_CHARS, minus the
    # 'always_safe' characters that quote() never percent-encodes
    return quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
|
| +
|
def is_clean_uri(uri):
    """Return True if uri contains only characters that are legal in a URI.

    >>> is_clean_uri("ABC!")
    True
    >>> is_clean_uri(u"ABC!")
    True
    >>> is_clean_uri("ABC|")
    False
    >>> is_clean_uri(u"ABC|")
    False
    >>> is_clean_uri("http://example.com/0")
    True
    >>> is_clean_uri(u"http://example.com/0")
    True
    """
    # note module re treats bytestrings as though they were decoded as latin-1
    # so this function accepts both unicode and bytestrings

    # BAD_URI_CHARS_RE matches a single illegal character; finding none
    # means the whole string is clean.
    return BAD_URI_CHARS_RE.search(uri) is None
|
| +
|
| +
|
# Bound match() of the URI-splitting regular expression.  The groups that
# carry the five components are 2 (scheme), 4 (authority), 5 (path),
# 7 (query) and 9 (fragment); the other groups include the delimiters.
SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match


def urlsplit(absolute_uri):
    """Split a URI into (scheme, authority, path, query, fragment).

    A component that is absent from the input is returned as None, except
    for the path, which is always a (possibly empty) string.
    """
    found = SPLIT_MATCH(absolute_uri)
    if not found:
        return None
    # group() with several indices returns the selected groups as a tuple.
    return found.group(2, 4, 5, 7, 9)
|
| +
|
def urlunsplit(parts):
    """Reassemble a URI from its five components.

    parts is a (scheme, authority, path, query, fragment) sequence.  The
    path is always emitted; each of the other components, together with its
    delimiter, is emitted only when it is not None.
    """
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces += [scheme, ":"]
    if authority is not None:
        pieces += ["//", authority]
    pieces.append(path)
    if query is not None:
        pieces += ["?", query]
    if fragment is not None:
        pieces += ["#", fragment]
    return "".join(pieces)
|
| +
|
def urljoin(base_uri, uri_reference):
    """Resolve uri_reference against base_uri and return the resulting URI.

    Both arguments are split into components, the components are combined
    by urljoin_parts(), and the result is reassembled into a string.

    See RFC 3986.
    """
    base_parts = urlsplit(base_uri)
    reference_parts = urlsplit(uri_reference)
    target_parts = urljoin_parts(base_parts, reference_parts)
    return urlunsplit(target_parts)
|
| +
|
| +# oops, this doesn't do the same thing as the literal translation
|
| +# from the RFC below
|
| +## import posixpath
|
| +## def urljoin_parts(base_parts, reference_parts):
|
| +## scheme, authority, path, query, fragment = base_parts
|
| +## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
|
| +
|
| +## # compute target URI path
|
| +## if rpath == "":
|
| +## tpath = path
|
| +## else:
|
| +## tpath = rpath
|
| +## if not tpath.startswith("/"):
|
| +## tpath = merge(authority, path, tpath)
|
| +## tpath = posixpath.normpath(tpath)
|
| +
|
| +## if rscheme is not None:
|
| +## return (rscheme, rauthority, tpath, rquery, rfragment)
|
| +## elif rauthority is not None:
|
| +## return (scheme, rauthority, tpath, rquery, rfragment)
|
| +## elif rpath == "":
|
| +## if rquery is not None:
|
| +## tquery = rquery
|
| +## else:
|
| +## tquery = query
|
| +## return (scheme, authority, tpath, tquery, rfragment)
|
| +## else:
|
| +## return (scheme, authority, tpath, rquery, rfragment)
|
| +
|
def urljoin_parts(base_parts, reference_parts):
    """Resolve split reference components against split base components.

    Both arguments are (scheme, authority, path, query, fragment) sequences.
    Returns the target URI as a tuple of the same shape.  The fragment of
    the result always comes from the reference; the base fragment is unused.
    """
    scheme, authority, path, query, _base_fragment = base_parts
    rscheme, rauthority, rpath, rquery, rfragment = reference_parts

    # A reference whose scheme equals the base scheme is treated as if it
    # had no scheme at all, so only a *different* scheme makes the
    # reference stand entirely on its own.
    if rscheme is not None and rscheme != scheme:
        return (rscheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # Reference supplies its own authority: only the scheme is inherited.
    if rauthority is not None:
        return (scheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # Empty reference path: keep the base path, and the base query too
    # unless the reference carries a query of its own.
    if rpath == "":
        if rquery is not None:
            tquery = rquery
        else:
            tquery = query
        return (scheme, authority, path, tquery, rfragment)

    # Non-empty reference path: absolute paths are used as they are,
    # relative ones are first merged with the base path.
    if rpath.startswith("/"):
        tpath = remove_dot_segments(rpath)
    else:
        tpath = remove_dot_segments(merge(authority, path, rpath))
    return (scheme, authority, tpath, rquery, rfragment)
|
| +
|
| +# um, something *vaguely* like this is what I want, but I have to generate
|
| +# lots of test cases first, if only to understand what it is that
|
| +# remove_dot_segments really does...
|
| +## def remove_dot_segments(path):
|
| +## if path == '':
|
| +## return ''
|
| +## comps = path.split('/')
|
| +## new_comps = []
|
| +## for comp in comps:
|
| +## if comp in ['.', '']:
|
| +## if not new_comps or new_comps[-1]:
|
| +## new_comps.append('')
|
| +## continue
|
| +## if comp != '..':
|
| +## new_comps.append(comp)
|
| +## elif new_comps:
|
| +## new_comps.pop()
|
| +## return '/'.join(new_comps)
|
| +
|
| +
|
def remove_dot_segments(path):
    """Return path with its "." and ".." segments interpreted and removed.

    The input is consumed from the left; complete segments are moved to an
    output list, and a ".." segment discards the most recently moved one.
    The letters in the comments below label the individual cases.
    """
    output = []
    remaining = path
    while remaining:
        if remaining.startswith("../"):
            # A: drop a leading "../"
            remaining = remaining[3:]
        elif remaining.startswith("./"):
            # A: drop a leading "./"
            remaining = remaining[2:]
        elif remaining.startswith("/./"):
            # B: "/./" collapses to "/"
            remaining = remaining[2:]
        elif remaining == "/.":
            # B: a trailing "/." collapses to "/"
            remaining = "/"
        elif remaining.startswith("/../") or remaining == "/..":
            # C: "/../" (or a trailing "/..") collapses to "/" and cancels
            # the last segment moved to the output, if there is one
            if remaining == "/..":
                remaining = "/"
            else:
                remaining = remaining[3:]
            if output:
                output.pop()
        elif remaining == "." or remaining == "..":
            # D: a lone "." or ".." is dropped, leaving nothing to process
            break
        else:
            # E: move the first segment (with its leading "/", if any) to
            # the output.  The input is non-empty here, so searching from
            # index 1 finds the slash that ends the first segment whether
            # or not the segment itself starts with one.
            cut = remaining.find("/", 1)
            if cut < 0:
                output.append(remaining)
                break
            output.append(remaining[:cut])
            remaining = remaining[cut:]
    return "".join(output)
|
| +
|
def merge(base_authority, base_path, ref_path):
    """Combine a relative reference path with the base path.

    An empty base path yields "/" + ref_path.  Otherwise ref_path replaces
    everything after the last "/" of base_path; if base_path contains no
    "/", ref_path is returned unchanged.  base_authority is accepted but
    not consulted.
    """
    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
    # doesn't even take base_authority as a parameter, despite the wording in
    # the RFC suggesting otherwise.  Perhaps I'm missing some obvious identity.
    #if base_authority is not None and base_path == "":
    if base_path == "":
        return "/" + ref_path
    last_slash = base_path.rfind("/")
    if last_slash < 0:
        # No directory part in the base path to keep.
        return ref_path
    directory = base_path[:last_slash + 1]
    return directory + ref_path
|
| +
|
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod
    testmod()
|
|
|