| OLD | NEW |
| (Empty) | |
| 1 """HTTP cookie handling for web clients. |
| 2 |
| 3 This module originally developed from my port of Gisle Aas' Perl module |
| 4 HTTP::Cookies, from the libwww-perl library. |
| 5 |
| 6 Docstrings, comments and debug strings in this code refer to the |
| 7 attributes of the HTTP cookie system as cookie-attributes, to distinguish |
| 8 them clearly from Python attributes. |
| 9 |
| 10 CookieJar____ |
| 11 / \ \ |
| 12 FileCookieJar \ \ |
| 13 / | \ \ \ |
| 14 MozillaCookieJar | LWPCookieJar \ \ |
| 15 | | \ |
| 16 | ---MSIEBase | \ |
| 17 | / | | \ |
| 18 | / MSIEDBCookieJar BSDDBCookieJar |
| 19 |/ |
| 20 MSIECookieJar |
| 21 |
| 22 Comments to John J Lee <jjl@pobox.com>. |
| 23 |
| 24 |
| 25 Copyright 2002-2006 John J Lee <jjl@pobox.com> |
| 26 Copyright 1997-1999 Gisle Aas (original libwww-perl code) |
| 27 Copyright 2002-2003 Johnny Lee (original MSIE Perl code) |
| 28 |
| 29 This code is free software; you can redistribute it and/or modify it |
| 30 under the terms of the BSD or ZPL 2.1 licenses (see the file |
| 31 COPYING.txt included with the distribution). |
| 32 |
| 33 """ |
| 34 |
| 35 import sys, re, copy, time, urllib, types, logging |
| 36 try: |
| 37 import threading |
| 38 _threading = threading; del threading |
| 39 except ImportError: |
| 40 import dummy_threading |
| 41 _threading = dummy_threading; del dummy_threading |
| 42 |
| 43 MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " |
| 44 "instance initialised with one)") |
| 45 DEFAULT_HTTP_PORT = "80" |
| 46 |
| 47 from _headersutil import split_header_words, parse_ns_headers |
| 48 from _util import isstringlike |
| 49 import _rfc3986 |
| 50 |
| 51 debug = logging.getLogger("mechanize.cookies").debug |
| 52 |
| 53 |
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled unless it may be swallowed.

    There are a few catch-all except: statements in this module, for
    catching input that's bad in unexpected ways.  This function is meant to
    be called from inside such a handler, and re-raises the exceptions we
    don't want to trap.

    unmasked: tuple of extra exception classes that must always be
      re-raised; KeyboardInterrupt, SystemExit and MemoryError are always
      added to it.

    If mechanize.USE_BARE_EXCEPT is false, the current exception is always
    re-raised.  Otherwise, an exception that is not a subclass of one of the
    unmasked classes is swallowed, and a warning carrying its formatted
    traceback is issued instead.

    """
    import mechanize, warnings
    if not mechanize.USE_BARE_EXCEPT:
        raise
    unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    etype = sys.exc_info()[0]
    if issubclass(etype, unmasked):
        raise
    # swallowed an exception: report it as a warning rather than losing it
    import traceback
    msg = traceback.format_exc()
    # stacklevel=2 attributes the warning to the caller of this function
    warnings.warn("mechanize bug!\n%s" % msg, stacklevel=2)
| 71 |
| 72 |
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name.

    Text is rejected if it is empty, starts or ends with a dot, or ends in a
    dot followed by digits (the test used here to spot IPv4 addresses).

    """
    # XXX
    #  This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    #  For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    return not (IPV4_RE.search(text) or
                text == "" or
                text[0] == "." or text[-1] == ".")

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means A *ends* with B and something non-empty
    # precedes it.  Merely finding B somewhere inside A is not enough:
    # www.acme.com.evil.org must not domain-match .acme.com.
    has_form_nb = len(A) > len(B) and A.endswith(B)
    return (
        has_form_nb and
        B.startswith(".") and
        is_HDN(B[1:])
        )
| 121 |
def liberal_is_HDN(text):
    """Return True if text looks roughly like a host domain name.

    Used when accepting/blocking domains: anything that does not end in a
    dot followed by digits (the IPV4_RE test) is treated as a domain name.

    """
    return IPV4_RE.search(text) is None
| 129 |
def user_domain_match(A, B):
    """Return True if A matches the user-supplied domain B.

    For blocking/accepting domains.  A and B may be host domain names or IP
    addresses; the comparison is case-insensitive.

    If B starts with a dot and both values look like domain names, A matches
    when it ends with B.  In every other case the two must compare equal.

    """
    A = A.lower()
    B = B.lower()
    if liberal_is_HDN(A) and liberal_is_HDN(B):
        if B.startswith("."):
            # dotted entry: matched by any name ending with it
            return A.endswith(B)
        return A == B
    # at least one looks like an IP address: exact match only
    return A == B
| 149 |
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is the second field of _rfc3986.urlsplit() applied to the
    request's full URL; if that field is None, the request's Host header
    (default "") is used instead.  A trailing ":<digits>" port is removed.

    The case of the host is left untouched; call request_host_lc for the
    lowercased form that is convenient for comparison.

    """
    url = request.get_full_url()
    host = _rfc3986.urlsplit(url)[1]
    if host is None:
        host = request.get_header("Host", "")
    # remove port, if present
    return cut_port_re.sub("", host, 1)
| 164 |
def request_host_lc(request):
    """Return the request-host (see request_host), lowercased."""
    host = request_host(request)
    return host.lower()
| 167 |
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    The effective name is the request-host with ".local" appended when the
    request-host contains no dot and does not match IPV4_RE; otherwise it is
    the request-host itself.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
| 174 |
def eff_request_host_lc(request):
    """Return eff_request_host(request) with both strings lowercased."""
    pair = eff_request_host(request)
    return pair[0].lower(), pair[1].lower()
| 178 |
def effective_request_host(request):
    """Return the effective request-host, as defined by RFC 2965."""
    req_host, erhn = eff_request_host(request)
    return erhn
| 182 |
def request_path(request):
    """Return path component of request-URI, as defined by RFC 2965.

    The path is escaped with escape_path, and a leading "/" is added if it
    does not already start with one.

    """
    raw_path = _rfc3986.urlsplit(request.get_full_url())[2]
    path = escape_path(raw_path)
    if path.startswith("/"):
        return path
    return "/" + path
| 190 |
def request_port(request):
    """Return the port of the request's host, as a string.

    Everything after the first ':' in request.get_host() is taken as the
    port.  Returns DEFAULT_HTTP_PORT if there is no ':', and None if the
    port text is not an integer.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
| 204 |
def request_is_unverifiable(request):
    """Return whether the request is unverifiable.

    Uses request.is_unverifiable() when that call succeeds.  If it raises
    AttributeError, the request's "unverifiable" attribute is returned when
    present; otherwise the AttributeError propagates.

    """
    try:
        return request.is_unverifiable()
    except AttributeError:
        if not hasattr(request, "unverifiable"):
            raise
        return request.unverifiable
| 213 |
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# Matches one %-escape; group 1 is the pair of hex digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the matched %-escape with its hex digits uppercased."""
    hex_digits = match.group(1)
    return "%" + hex_digits.upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes.

    Unicode paths are encoded as UTF-8 before quoting.  Characters listed in
    HTTP_PATH_SAFE are left unquoted.

    """
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, types.UnicodeType):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
| 235 |
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        # no dot at all: H cannot have the form A.B
        return h
    # everything after the first dot is B (what precedes it is A)
    remainder = h[first_dot+1:]
    if is_HDN(h) and (remainder.find(".") >= 0 or remainder == "local"):
        return "." + remainder
    return h
| 270 |
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    # the origin request's request-host was stuffed into request by
    # _urllib2_support.AbstractHTTPHandler
    origin_reach = reach(request.origin_req_host)
    return not domain_match(request_host_lc(request), origin_reach)
| 285 |
| 286 |
try:
    all
except NameError:
    # python 2.4: no builtin all(), so supply an equivalent
    def all(iterable):
        """Return True if no element of iterable is false."""
        for item in iterable:
            if item:
                continue
            return False
        return True
| 296 |
| 297 |
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    version: integer;
    name: string;
    value: string (may be None);
    port: string; None indicates no attribute was supplied (e.g. "Port", rather
     than e.g. "Port=80"); otherwise, a port string (e.g. "80") or a port list
     string (e.g. "80,8080")
    port_specified: boolean; true if a value was supplied with the Port
     cookie-attribute
    domain: string;
    domain_specified: boolean; true if Domain was explicitly set
    domain_initial_dot: boolean; true if Domain as set in HTTP header by server
     started with a dot (yes, this really is necessary!)
    path: string;
    path_specified: boolean; true if Path was explicitly set
    secure:  boolean; true if should only be returned over secure connection
    expires: integer; seconds since epoch (RFC 2965 cookies should calculate
     this value from the Max-Age attribute)
    discard: boolean, true if this is a session cookie; (if no expires value,
     this should be true)
    comment: string;
    comment_url: string;
    rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not
     Set-Cookie2:) header, but had a version cookie-attribute of 1
    rest: mapping of other cookie-attributes

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """


    # attributes compared by __eq__
    _attrs = ("version", "name", "value",
              "port", "port_specified",
              "domain", "domain_specified", "domain_initial_dot",
              "path", "path_specified",
              "secure", "expires", "discard", "comment", "comment_url",
              "rfc2109", "_rest")

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version and expires are converted with int() unless they are None;
        domain is lowercased; rest is shallow-copied.

        Raises ValueError if port is None while port_specified is True.

        """

        if version is not None: version = int(version)
        if expires is not None: expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # copied so later changes to the caller's mapping don't leak in
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if a non-standard cookie-attribute called name is set."""
        return name in self._rest
    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)
    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value
    def nonstandard_attr_keys(self):
        """Return the names of the non-standard cookie-attributes."""
        return self._rest.keys()

    def is_expired(self, now=None):
        """Return true if the cookie has an expiry time that is <= now.

        now: seconds since epoch; defaults to time.time().

        """
        if now is None: now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __eq__(self, other):
        """Compare every attribute named in _attrs.

        Returns NotImplemented (so that Python falls back to its default
        comparison) when other lacks one of those attributes, instead of
        letting AttributeError escape from e.g. ``cookie == None``.

        """
        missing = object()
        for attr_name in self._attrs:
            theirs = getattr(other, attr_name, missing)
            if theirs is missing:
                return NotImplemented
            if not (getattr(self, attr_name) == theirs):
                return False
        return True

    def __ne__(self, other):
        return not (self == other)

    def __str__(self):
        if self.port is None: p = ""
        else: p = ":"+self.port
        limit = self.domain + p + self.path
        if self.value is not None:
            namevalue = "%s=%s" % (self.name, self.value)
        else:
            namevalue = self.name
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = []
        for name in ["version", "name", "value",
                     "port", "port_specified",
                     "domain", "domain_specified", "domain_initial_dot",
                     "path", "path_specified",
                     "secure", "expires", "discard", "comment", "comment_url",
                     ]:
            attr = getattr(self, name)
            args.append("%s=%s" % (name, repr(attr)))
        # shown under the constructor's argument name, not the private one
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
| 433 |
| 434 |
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    As well as implementing set_ok and return_ok, implementations of this
    interface must also supply the following attributes, indicating which
    protocols should be used, and how.  These can be read and set at any time,
    though whether that makes complete sense from the protocol point of view is
    doubtful.

    Public attributes:

    netscape: implement netscape protocol
    rfc2965: implement RFC 2965 protocol
    rfc2109_as_netscape:
       WARNING: This argument will change or go away if is not accepted into
                the Python standard library in this form!
     If true, treat RFC 2109 cookies as though they were Netscape cookies.  The
     default is for this attribute to be None, which means treat 2109 cookies
     as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is,
     by default), and as Netscape cookies otherwise.
    hide_cookie2: don't add Cookie2 header to requests (the presence of
     this header indicates to the server that we understand RFC 2965
     cookies)

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Must be overridden: this base implementation always raises
        NotImplementedError.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        cookie: mechanize.Cookie object
        request: object implementing the interface defined by
         CookieJar.extract_cookies.__doc__

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Must be overridden: this base implementation always raises
        NotImplementedError.

        cookie: mechanize.Cookie object
        request: object implementing the interface defined by
         CookieJar.add_cookie_header.__doc__

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This is here as an optimization, to remove the need for checking every
        cookie with a particular domain (which may involve reading many files).
        The default implementations of domain_return_ok and path_return_ok
        (return True) leave all the work to return_ok.

        If domain_return_ok returns true for the cookie domain, path_return_ok
        is called for the cookie path.  Otherwise, path_return_ok and return_ok
        are never called for that cookie domain.  If path_return_ok returns
        true, return_ok is called with the Cookie object itself for a full
        check.  Otherwise, return_ok is never called for that cookie path.

        Note that domain_return_ok is called for every *cookie* domain, not
        just for the *request* domain.  For example, the function might be
        called with both ".acme.com" and "www.acme.com" if the request domain
        is "www.acme.com".  The same goes for path_return_ok.

        For argument documentation, see the docstring for return_ok.

        """
        # no shortcut by default: defer the decision to return_ok
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        See the docstring for domain_return_ok.

        """
        # no shortcut by default: defer the decision to return_ok
        return True
| 519 |
| 520 |
| 521 class DefaultCookiePolicy(CookiePolicy): |
| 522 """Implements the standard rules for accepting and returning cookies. |
| 523 |
| 524 Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is |
| 525 switched off by default. |
| 526 |
| 527 The easiest way to provide your own policy is to override this class and |
| 528 call its methods in your overridden implementations before adding your own |
| 529 additional checks. |
| 530 |
| 531 import mechanize |
| 532 class MyCookiePolicy(mechanize.DefaultCookiePolicy): |
| 533 def set_ok(self, cookie, request): |
| 534 if not mechanize.DefaultCookiePolicy.set_ok( |
| 535 self, cookie, request): |
| 536 return False |
| 537 if i_dont_want_to_store_this_cookie(): |
| 538 return False |
| 539 return True |
| 540 |
| 541 In addition to the features required to implement the CookiePolicy |
| 542 interface, this class allows you to block and allow domains from setting |
| 543 and receiving cookies. There are also some strictness switches that allow |
| 544 you to tighten up the rather loose Netscape protocol rules a little bit (at |
| 545 the cost of blocking some benign cookies). |
| 546 |
| 547 A domain blacklist and whitelist is provided (both off by default). Only |
| 548 domains not in the blacklist and present in the whitelist (if the whitelist |
| 549 is active) participate in cookie setting and returning. Use the |
| 550 blocked_domains constructor argument, and blocked_domains and |
| 551 set_blocked_domains methods (and the corresponding argument and methods for |
| 552 allowed_domains). If you set a whitelist, you can turn it off again by |
| 553 setting it to None. |
| 554 |
| 555 Domains in block or allow lists that do not start with a dot must |
| 556 string-compare equal. For example, "acme.com" matches a blacklist entry of |
| 557 "acme.com", but "www.acme.com" does not. Domains that do start with a dot |
| 558 are matched by more specific domains too. For example, both "www.acme.com" |
| 559 and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does |
| 560 not). IP addresses are an exception, and must match exactly. For example, |
| 561 if blocked_domains contains "192.168.1.2" and ".168.1.2" 192.168.1.2 is |
| 562 blocked, but 193.168.1.2 is not. |
| 563 |
| 564 Additional Public Attributes: |
| 565 |
| 566 General strictness switches |
| 567 |
| 568 strict_domain: don't allow sites to set two-component domains with |
| 569 country-code top-level domains like .co.uk, .gov.uk, .co.nz. etc. |
| 570 This is far from perfect and isn't guaranteed to work! |
| 571 |
| 572 RFC 2965 protocol strictness switches |
| 573 |
| 574 strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable |
| 575 transactions (usually, an unverifiable transaction is one resulting from |
| 576 a redirect or an image hosted on another site); if this is false, cookies |
| 577 are NEVER blocked on the basis of verifiability |
| 578 |
| 579 Netscape protocol strictness switches |
| 580 |
| 581 strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions |
| 582 even to Netscape cookies |
| 583 strict_ns_domain: flags indicating how strict to be with domain-matching |
| 584 rules for Netscape cookies: |
| 585 DomainStrictNoDots: when setting cookies, host prefix must not contain a |
| 586 dot (e.g. www.foo.bar.com can't set a cookie for .bar.com, because |
| 587 www.foo contains a dot) |
| 588 DomainStrictNonDomain: cookies that did not explicitly specify a Domain |
| 589 cookie-attribute can only be returned to a domain that string-compares |
| 590 equal to the domain that set the cookie (e.g. rockets.acme.com won't |
| 591 be returned cookies from acme.com that had no Domain cookie-attribute) |
| 592 DomainRFC2965Match: when setting cookies, require a full RFC 2965 |
| 593 domain-match |
| 594 DomainLiberal and DomainStrict are the most useful combinations of the |
| 595 above flags, for convenience |
| 596 strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that |
| 597 have names starting with '$' |
| 598 strict_ns_set_path: don't allow setting cookies whose path doesn't |
| 599 path-match request URI |
| 600 |
| 601 """ |
| 602 |
| 603 DomainStrictNoDots = 1 |
| 604 DomainStrictNonDomain = 2 |
| 605 DomainRFC2965Match = 4 |
| 606 |
| 607 DomainLiberal = 0 |
| 608 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain |
| 609 |
def __init__(self,
             blocked_domains=None, allowed_domains=None,
             netscape=True, rfc2965=False,
             # WARNING: this argument will change or go away if is not
             # accepted into the Python standard library in this form!
             # default, ie. treat 2109 as netscape iff not rfc2965
             rfc2109_as_netscape=None,
             hide_cookie2=False,
             strict_domain=False,
             strict_rfc2965_unverifiable=True,
             strict_ns_unverifiable=False,
             strict_ns_domain=DomainLiberal,
             strict_ns_set_initial_dollar=False,
             strict_ns_set_path=False,
             ):
    """
    Constructor arguments should be used as keyword arguments only.

    blocked_domains: sequence of domain names that we never accept cookies
     from, nor return cookies to
    allowed_domains: if not None, this is a sequence of the only domains
     for which we accept and return cookies

    Both sequences are stored as tuples.

    For other arguments, see CookiePolicy.__doc__ and
    DefaultCookiePolicy.__doc__.

    """
    # protocol switches
    self.netscape = netscape
    self.rfc2965 = rfc2965
    self.rfc2109_as_netscape = rfc2109_as_netscape
    self.hide_cookie2 = hide_cookie2

    # strictness switches
    self.strict_domain = strict_domain
    self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
    self.strict_ns_unverifiable = strict_ns_unverifiable
    self.strict_ns_domain = strict_ns_domain
    self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
    self.strict_ns_set_path = strict_ns_set_path

    # block list: never None, an empty tuple means "block nothing"
    blocked = ()
    if blocked_domains is not None:
        blocked = tuple(blocked_domains)
    self._blocked_domains = blocked

    # allow list: None means "no allow list in force"
    allowed = allowed_domains
    if allowed is not None:
        allowed = tuple(allowed)
    self._allowed_domains = allowed
| 656 |
def blocked_domains(self):
    """Return the sequence of blocked domains (as a tuple)."""
    blocked = self._blocked_domains
    return blocked
def set_blocked_domains(self, blocked_domains):
    """Set the sequence of blocked domains (stored as a tuple)."""
    as_tuple = tuple(blocked_domains)
    self._blocked_domains = as_tuple
| 663 |
def is_blocked(self, domain):
    """Return True if domain matches any entry of the block list."""
    blocked = False
    for entry in self._blocked_domains:
        if user_domain_match(domain, entry):
            blocked = True
            break
    return blocked
| 669 |
def allowed_domains(self):
    """Return None, or the sequence of allowed domains (as a tuple)."""
    allowed = self._allowed_domains
    return allowed
def set_allowed_domains(self, allowed_domains):
    """Set the sequence of allowed domains, or None.

    A sequence is stored as a tuple; None is stored unchanged.

    """
    if allowed_domains is None:
        self._allowed_domains = None
    else:
        self._allowed_domains = tuple(allowed_domains)
| 678 |
def is_not_allowed(self, domain):
    """Return True if an allow list is set and domain matches none of it.

    Returns False when the allow list is None.

    """
    allowed = self._allowed_domains
    if allowed is None:
        return False
    for entry in allowed:
        if user_domain_match(domain, entry):
            return False
    return True
| 686 |
def set_ok(self, cookie, request):
    """
    If you override set_ok, be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    Runs the set_ok_<name> checks in a fixed order and stops at the first
    one that fails.

    """
    debug(" - checking cookie %s", cookie)

    assert cookie.name is not None

    check_names = ("version", "verifiability", "name", "path", "domain",
                   "port")
    for suffix in check_names:
        check = getattr(self, "set_ok_" + suffix)
        if not check(cookie, request):
            return False

    return True
| 705 |
def set_ok_version(self, cookie, request):
    """Return False if the cookie's version is missing or switched off."""
    version = cookie.version
    if version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        debug(" Set-Cookie2 without version attribute (%s)", cookie)
        return False
    if version > 0 and not self.rfc2965:
        debug(" RFC 2965 cookies are switched off")
        return False
    elif version == 0 and not self.netscape:
        debug(" Netscape cookies are switched off")
        return False
    return True
| 719 |
def set_ok_verifiability(self, cookie, request):
    """Return False for a third-party cookie set in an unverifiable
    transaction, when the strictness switch for its protocol is on."""
    if not (request_is_unverifiable(request) and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        debug(" third-party RFC 2965 cookie during "
              "unverifiable transaction")
        return False
    elif cookie.version == 0 and self.strict_ns_unverifiable:
        debug(" third-party Netscape cookie during "
              "unverifiable transaction")
        return False
    return True
| 731 |
def set_ok_name(self, cookie, request):
    """Return False for a version 0 cookie whose name starts with '$',
    when strict_ns_set_initial_dollar is on."""
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    if cookie.version != 0:
        return True
    if not self.strict_ns_set_initial_dollar:
        return True
    if not cookie.name.startswith("$"):
        return True
    debug(" illegal name (starts with '$'): '%s'", cookie.name)
    return False
| 740 |
def set_ok_path(self, cookie, request):
    """Return False if an explicit cookie path is not a prefix of the
    request path.

    Applied to cookies with version > 0, and to version 0 cookies when
    strict_ns_set_path is on.

    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    enforce = (cookie.version > 0 or
               (cookie.version == 0 and self.strict_ns_set_path))
    if enforce and not req_path.startswith(cookie.path):
        debug(" path attribute %s is not a prefix of request "
              "path %s", cookie.path, req_path)
        return False
    return True
| 751 |
def set_ok_countrycode_domain(self, cookie, request):
    """Return False if explicit cookie domain is not acceptable.

    Only applies when the cookie's domain was specified and strict_domain
    is on: a two-component domain whose first component is one of a fixed
    list of labels and whose last component has two characters is refused.

    Called by set_ok_domain, for convenience of overriding by
    subclasses.

    """
    if not (cookie.domain_specified and self.strict_domain):
        return True
    domain = cookie.domain
    # since domain was specified, we know that:
    assert domain.startswith(".")
    if domain.count(".") != 2:
        return True
    # domain like .foo.bar
    last_dot = domain.rfind(".")
    tld = domain[last_dot+1:]
    sld = domain[1:last_dot]
    generic_labels = [
        "co", "ac",
        "com", "edu", "org", "net", "gov", "mil", "int",
        "aero", "biz", "cat", "coop", "info", "jobs", "mobi",
        "museum", "name", "pro", "travel",
        ]
    if sld.lower() in generic_labels and len(tld) == 2:
        # domain like .co.uk
        return False
    return True
| 778 |
def set_ok_domain(self, cookie, request):
    """Return False if the cookie's domain makes it unacceptable.

    The user block and allow lists and set_ok_countrycode_domain are
    consulted first, for every cookie.  The remaining checks compare the
    domain with the request host and only run when the cookie's domain was
    explicitly specified.

    """
    # user-configured block / allow lists
    if self.is_blocked(cookie.domain):
        debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if not self.set_ok_countrycode_domain(cookie, request):
        debug(" country-code second level domain %s", cookie.domain)
        return False
    if cookie.domain_specified:
        req_host, erhn = eff_request_host_lc(request)
        domain = cookie.domain
        if domain.startswith("."):
            undotted_domain = domain[1:]
        else:
            undotted_domain = domain
        # apart from ".local", a domain must have a dot somewhere after
        # its first character
        embedded_dots = (undotted_domain.find(".") >= 0)
        if not embedded_dots and domain != ".local":
            debug(" non-local domain %s contains no embedded dot",
                  domain)
            return False
        if cookie.version == 0:
            # Netscape cookies: the effective request-host, with or
            # without an added initial dot, must end with the domain
            if (not erhn.endswith(domain) and
                (not erhn.startswith(".") and
                 not ("."+erhn).endswith(domain))):
                debug(" effective request-host %s (even with added "
                      "initial dot) does not end end with %s",
                      erhn, domain)
                return False
        # full RFC 2965 domain-match: always for version > 0, and for
        # Netscape cookies only if the DomainRFC2965Match flag is set
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainRFC2965Match)):
            if not domain_match(erhn, domain):
                debug(" effective request-host %s does not domain-match "
                      "%s", erhn, domain)
                return False
        # the part of the request-host in front of the domain must not
        # contain a dot: always for version > 0, and for Netscape cookies
        # only if the DomainStrictNoDots flag is set
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainStrictNoDots)):
            host_prefix = req_host[:-len(domain)]
            if (host_prefix.find(".") >= 0 and
                not IPV4_RE.search(req_host)):
                debug(" host prefix %s for domain %s contains a dot",
                      host_prefix, domain)
                return False
    return True
| 824 |
def set_ok_port(self, cookie, request):
    """Return False if an explicit cookie port list excludes the request port.

    The comma-separated list is scanned in order: a non-numeric entry met
    before a match rejects the cookie, and the first entry equal to the
    request port accepts it.  A request port of None is treated as "80".

    """
    if not cookie.port_specified:
        return True
    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    else:
        req_port = str(req_port)
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            return True
    debug(" request port (%s) not found in %s",
          req_port, cookie.port)
    return False
| 845 |
def return_ok(self, cookie, request):
    """
    Return True if the cookie may be sent back with the request.

    Runs the return_ok_<check> methods one after another and stops at the
    first one that rejects the cookie.

    If you override return_ok, be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to return).

    """
    # Path has already been checked by path_return_ok, and domain blocking
    # done by domain_return_ok.
    debug(" - checking cookie %s", cookie)

    checks = ("version", "verifiability", "secure", "expires", "port",
              "domain")
    for check_name in checks:
        # Each check method is looked up only when its turn comes.
        check = getattr(self, "return_ok_"+check_name)
        if not check(cookie, request):
            return False
    return True
| 864 |
def return_ok_version(self, cookie, request):
    """Return False if the cookie's protocol version is switched off.

    Cookies with version > 0 need self.rfc2965 to be true; version 0
    cookies need self.netscape to be true.

    """
    if cookie.version > 0:
        if not self.rfc2965:
            debug(" RFC 2965 cookies are switched off")
            return False
    elif cookie.version == 0:
        if not self.netscape:
            debug(" Netscape cookies are switched off")
            return False
    return True
| 873 |
def return_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies in unverifiable transactions.

    The rejection only happens when the strictness flag matching the
    cookie's version (strict_rfc2965_unverifiable for version > 0,
    strict_ns_unverifiable for version 0) is set.

    """
    if not (request_is_unverifiable(request) and is_third_party(request)):
        return True

    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        debug(" third-party RFC 2965 cookie during unverifiable "
              "transaction")
        return False
    elif cookie.version == 0 and self.strict_ns_unverifiable:
        debug(" third-party Netscape cookie during unverifiable "
              "transaction")
        return False
    return True
| 885 |
def return_ok_secure(self, cookie, request):
    """Return False if a secure cookie would go out on a non-https request."""
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    debug(" secure cookie with non-secure request")
    return False
| 891 |
def return_ok_expires(self, cookie, request):
    """Return False if the cookie is expired as of self._now."""
    expired = cookie.is_expired(self._now)
    if expired:
        debug(" cookie expired")
        return False
    return True
| 897 |
def return_ok_port(self, cookie, request):
    """Return False if the cookie has a port list not naming the request port.

    Cookies with no port restriction are always accepted.  "80" is used
    as the request port when request_port() returns None.

    """
    if not cookie.port:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"

    matched = False
    for allowed_port in cookie.port.split(","):
        if allowed_port == req_port:
            matched = True
            break
    if not matched:
        debug(" request port %s does not match cookie port %s",
              req_port, cookie.port)
        return False
    return True
| 911 |
def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain matches the request host.

    Version > 0 cookies must domain-match the effective request-host
    name; version 0 cookies only need the dot-prefixed effective
    request-host name to end with the cookie domain.

    """
    req_host, erhn = eff_request_host_lc(request)
    domain = cookie.domain
    version = cookie.version

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if version == 0:
        if ((self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            debug(" cookie with unspecified domain does not string-compare "
                  "equal to request domain")
            return False

    if version > 0 and not domain_match(erhn, domain):
        debug(" effective request-host name %s does not domain-match "
              "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if version == 0 and not ("."+erhn).endswith(domain):
        debug(" request-host %s does not match Netscape cookie domain "
              "%s", req_host, domain)
        return False
    return True
| 933 |
def domain_return_ok(self, domain, request):
    """Liberal first-pass check of whether a domain's cookies may be returned.

    This is here as an optimization to avoid having to load lots of MSIE
    cookie files unless necessary.  The domain must be a tail of the
    request host or effective request-host name, must not be in the user
    block-list, and must be permitted by the user allow-list.

    """
    req_host, erhn = eff_request_host_lc(request)

    # Make both host names start with a dot, so as to err on the side of
    # letting cookies through.
    dotted_hosts = []
    for host in (req_host, erhn):
        if not host.startswith("."):
            host = "."+host
        dotted_hosts.append(host)
    dotted_req_host, dotted_erhn = dotted_hosts

    if not dotted_req_host.endswith(domain):
        if not dotted_erhn.endswith(domain):
            # request domain does not match cookie domain
            return False

    if self.is_blocked(domain):
        debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        debug(" domain %s is not in user allow-list", domain)
        return False

    return True
| 959 |
def path_return_ok(self, path, request):
    """Return True if the request path starts with the cookie path."""
    debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path.startswith(path):
        return True
    debug(" %s does not path-match %s", req_path, path)
    return False
| 967 |
| 968 |
def vals_sorted_by_key(adict):
    """Return a list of the values of adict, ordered by ascending key.

    A real list is always returned (callers index it and take its len()),
    regardless of whether adict.keys() itself returns a list.

    """
    keys = list(adict.keys())
    keys.sort()
    return [adict.get(key) for key in keys]
| 973 |
class MappingIterator:
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Only non-mapping leaves are returned; anything with an "items"
    attribute is treated as a mapping and descended into.

    """
    def __init__(self, mapping):
        # LIFO stack of (values sorted by key, index of next value,
        # mapping the values came from)
        self._s = [(vals_sorted_by_key(mapping), 0, None)]

    def __iter__(self): return self

    def next(self):
        """Return the next leaf value, or raise StopIteration when done."""
        # this is hairy because of lack of generators
        stack = self._s
        while True:
            try:
                vals, i, prev_item = stack.pop()
            except IndexError:
                raise StopIteration()
            if i >= len(vals):
                # this level is exhausted: resume the enclosing level
                continue
            item = vals[i]
            stack.append((vals, i + 1, prev_item))
            try:
                item.items
            except AttributeError:
                # non-mapping
                return item
            # mapping: descend into it
            stack.append((vals_sorted_by_key(item), 0, item))

    # Same method under the name used by the newer iterator protocol.
    __next__ = next
| 1002 |
| 1003 |
# Sentinel passed as the second (default) argument of dict.get, so that a
# missing dict key can be told apart from a key whose value is None.
class Absent:
    pass
| 1007 |
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try mechanize.urlopen().

    The major methods are extract_cookies and add_cookie_header; these are all
    you are likely to need.

    CookieJar supports the iterator protocol:

    for cookie in cookiejar:
        # do something with cookie

    Methods:

    add_cookie_header(request)
    extract_cookies(response, request)
    get_policy()
    set_policy(policy)
    cookies_for_request(request)
    make_cookies(response, request)
    set_cookie_if_ok(cookie, request)
    set_cookie(cookie)
    clear_session_cookies()
    clear_expired_cookies()
    clear(domain=None, path=None, name=None)

    Public attributes

    policy: CookiePolicy object

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    def __init__(self, policy=None):
        """
        See CookieJar.__doc__ for argument documentation.

        policy: cookie policy object; a DefaultCookiePolicy is created when
         None is passed.

        """
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        # nested mapping: domain -> path -> name -> cookie
        self._cookies = {}

        # for __getitem__ iteration in pre-2.2 Pythons
        self._prev_getitem_index = 0
        self._getitem_iterator = None

    def get_policy(self):
        """Return the cookie policy object in use."""
        return self._policy

    def set_policy(self, policy):
        """Replace the cookie policy object in use."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the stored cookies for domain that may go with request."""
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    debug(" not returning cookie")
                    continue
                debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def cookies_for_request(self, request):
        """Return a list of cookies to be returned to server.

        The returned list of cookie instances is sorted in the order they
        should appear in the Cookie: header for return to the server.

        See add_cookie_header.__doc__ for the interface required of the
        request argument.

        New in version 0.1.10

        """
        self._policy._now = self._now = int(time.time())
        cookies = self._cookies_for_request(request)
        # add cookies in order of most specific (i.e. longest) path first
        def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
        cookies.sort(decreasing_size)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        # this method still exists (alongside cookies_for_request) because it
        # is part of an implied protected interface for subclasses of cookiejar
        # XXX document that implied interface, or provide another way of
        # implementing cookiejars than subclassing
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        The $Version attribute is also added when appropriate (currently only
        once per request).

        >>> jar = CookieJar()
        >>> ns_cookie = Cookie(0, "foo", '"bar"', None, False,
        ...                    "example.com", False, False,
        ...                    "/", False, False, None, True,
        ...                    None, None, {})
        >>> jar._cookie_attrs([ns_cookie])
        ['foo="bar"']
        >>> rfc2965_cookie = Cookie(1, "foo", "bar", None, False,
        ...                         ".example.com", True, False,
        ...                         "/", False, False, None, True,
        ...                         None, None, {})
        >>> jar._cookie_attrs([rfc2965_cookie])
        ['$Version=1', 'foo=bar', '$Domain="example.com"']

        """
        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            # different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            # RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            # intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                        domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (mechanize.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        The request object (usually a mechanize.Request instance) must support
        the methods get_full_url, get_host, is_unverifiable, get_type,
        has_header, get_header, header_items and add_unredirected_header, as
        documented by urllib2.
        """
        debug("add_cookie_header")
        cookies = self.cookies_for_request(request)

        attrs = self._cookie_attrs(cookies)
        if attrs:
            if not request.has_header("Cookie"):
                request.add_unredirected_header("Cookie", "; ".join(attrs))

        # if necessary, advertise that we know RFC 2965
        if self._policy.rfc2965 and not self._policy.hide_cookie2:
            for cookie in cookies:
                if cookie.version != 1 and not request.has_header("Cookie2"):
                    request.add_unredirected_header("Cookie2", '$Version="1"')
                    break

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        debug(" missing or invalid value for expires "
                              "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    if v is None:
                        debug(" missing value for max-age attribute")
                        bad_cookie = True
                        break
                    try:
                        v = int(v)
                    except ValueError:
                        debug(" missing or invalid (non-numeric) value for "
                              "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    # age-calculation rules.  Remember that zero Max-Age
                    # is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                        k not in ["port", "comment", "commenturl"]):
                        debug(" missing value for %s attribute", k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised cookie tuple.

        Returns None if the version cookie-attribute is not an integer.
        Missing path, domain and port attributes are defaulted from request.

        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host_lc(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the list of Cookie objects built from parsed header data."""
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie: cookies.append(cookie)
        return cookies

    def _process_rfc2109_cookies(self, cookies):
        """Flag version 1 cookies as RFC 2109, optionally downgrading them.

        When policy.rfc2109_as_netscape is None, the downgrade to version 0
        happens exactly when policy.rfc2965 is false.

        """
        if self._policy.rfc2109_as_netscape is None:
            rfc2109_as_netscape = not self._policy.rfc2965
        else:
            rfc2109_as_netscape = self._policy.rfc2109_as_netscape
        for cookie in cookies:
            if cookie.version == 1:
                cookie.rfc2109 = True
                if rfc2109_as_netscape:
                    # treat 2109 cookies as Netscape cookies rather than
                    # as RFC2965 cookies
                    cookie.version = 0

    def _make_cookies(self, response, request):
        """Return the list of cookies found in the response's headers."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except:
            # deliberate catch-all for malformed input; the helper decides
            # which exceptions must still propagate
            reraise_unmasked_exceptions()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except:
                reraise_unmasked_exceptions()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                ns_cookies = [
                    ns_cookie for ns_cookie in ns_cookies
                    if (ns_cookie.domain, ns_cookie.path,
                        ns_cookie.name) not in lookup]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object.

        Cookies whose expiry time has already passed are left out.

        See extract_cookies.__doc__ for the interface required of the
        response and request arguments.

        """
        self._policy._now = self._now = int(time.time())
        return [cookie for cookie in self._make_cookies(response, request)
                if cookie.expires is None or not cookie.expires <= self._now]

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so.

        cookie: mechanize.Cookie instance
        request: see extract_cookies.__doc__ for the required interface

        """
        self._policy._now = self._now = int(time.time())

        if self._policy.set_ok(cookie, request):
            self.set_cookie(cookie)

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set.

        cookie: mechanize.Cookie instance
        """
        c = self._cookies
        if cookie.domain not in c: c[cookie.domain] = {}
        c2 = c[cookie.domain]
        if cookie.path not in c2: c2[cookie.path] = {}
        c3 = c2[cookie.path]
        c3[cookie.name] = cookie

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request.

        Look for allowable Set-Cookie: and Set-Cookie2: headers in the response
        object passed as argument.  Any of these headers that are found are
        used to update the state of the object (subject to the policy.set_ok
        method's approval).

        The response object (usually be the result of a call to
        mechanize.urlopen, or similar) should support an info method, which
        returns a mimetools.Message object (in fact, the 'mimetools.Message
        object' may be any object that provides a getheaders method).

        The request object (usually a mechanize.Request instance) must support
        the methods get_full_url, get_type, get_host, and is_unverifiable, as
        documented by mechanize, and the port attribute (the port number).  The
        request is used to set default values for cookie-attributes as well as
        for checking that the cookie is OK to be set.

        """
        debug("extract_cookies: %s", response.info())
        self._policy._now = self._now = int(time.time())

        for cookie in self._make_cookies(response, request):
            if cookie.expires is not None and cookie.expires <= self._now:
                # Expiry date in past is request to delete cookie.  This can't
                # be in DefaultCookiePolicy, because can't delete cookies
                # there.
                try:
                    self.clear(cookie.domain, cookie.path, cookie.name)
                except KeyError:
                    pass
                debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                      cookie.domain, cookie.path, cookie.name)
            elif self._policy.set_ok(cookie, request):
                debug(" setting cookie: %s", cookie)
                self.set_cookie(cookie)

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists, and ValueError if name
        is given without domain and path, or path without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Discards all cookies held by object which had either no Max-Age or
        Expires cookie-attribute or an explicit Discard cookie-attribute, or
        which otherwise have ended up with a true discard attribute.  For
        interactive browsers, the end of a session usually corresponds to
        closing the browser window.

        Note that the save method won't save session cookies anyway, unless you
        ask otherwise by passing a true ignore_discard argument.

        """
        for cookie in self:
            if cookie.discard:
                self.clear(cookie.domain, cookie.path, cookie.name)

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the save
        method won't save expired cookies anyway (unless you ask otherwise by
        passing a true ignore_expires argument).

        """
        now = time.time()
        for cookie in self:
            if cookie.is_expired(now):
                self.clear(cookie.domain, cookie.path, cookie.name)

    def __getitem__(self, i):
        """Support old-style sequential indexing (0, 1, 2, ...) only.

        Index 0 (re)starts the iteration.  Any other index must be exactly
        one more than the previous one, otherwise IndexError is raised;
        IndexError is also raised once the cookies are exhausted.

        """
        if i == 0:
            self._getitem_iterator = self.__iter__()
        elif (getattr(self, "_getitem_iterator", None) is None or
              self._prev_getitem_index != i-1):
            # Also covers a non-zero index before any iteration was started,
            # when there is no iterator to continue from.
            raise IndexError(
                "CookieJar.__getitem__ only supports sequential iteration")
        self._prev_getitem_index = i
        try:
            return self._getitem_iterator.next()
        except StopIteration:
            raise IndexError()

    def __iter__(self):
        return MappingIterator(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        i = 0
        for cookie in self: i = i + 1
        return i

    def __repr__(self):
        r = []
        for cookie in self: r.append(repr(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        r = []
        for cookie in self: r.append(str(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
| 1628 |
| 1629 |
class LoadError(Exception):
    """Raised when a cookie file is not in the format a jar understands."""
| 1631 |
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    Additional methods

    save(filename=None, ignore_discard=False, ignore_expires=False)
    load(filename=None, ignore_discard=False, ignore_expires=False)
    revert(filename=None, ignore_discard=False, ignore_expires=False)

    Additional public attributes

    filename: filename for loading and saving cookies

    Additional public readable attributes

    delayload: request that cookies are lazily loaded from disk; this is only
     a hint since this only affects performance, not behaviour (unless the
     cookies on disk are changing); a CookieJar object may ignore it (in fact,
     only MSIECookieJar lazily loads cookies at the moment)

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        See FileCookieJar.__doc__ for argument documentation.

        Cookies are NOT loaded from the named file until either the load or
        revert method is called.

        Raises ValueError if filename is neither None nor string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None and not isstringlike(filename):
            raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        filename: name of file in which to save cookies
        ignore_discard: save even cookies set to be discarded
        ignore_expires: save even cookies that have expired

        The file is overwritten if it already exists, thus wiping all its
        cookies.  Saved cookies can be restored later using the load or revert
        methods.  If filename is not specified, self.filename is used; if
        self.filename is None, ValueError is raised.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        Old cookies are kept unless overwritten by newly loaded ones.

        Arguments are as for .save().

        If filename is not specified, self.filename is used; if self.filename
        is None, ValueError is raised.  The named file must be in the format
        understood by the class, or LoadError will be raised.  This format will
        be identical to that written by the save method, unless the load format
        is not sufficiently well understood (as is the case for MSIECookieJar).

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            # the actual parsing is delegated to _really_load, which this
            # class does not define itself
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Whatever exception the load raises (LoadError and IOError being the
        expected ones) is propagated unchanged; the object's state will not
        be altered if this happens.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        old_state = copy.deepcopy(self._cookies)
        self._cookies = {}
        try:
            self.load(filename, ignore_discard, ignore_expires)
        except:
            # Put the previous cookies back on any failure, not only on
            # LoadError/IOError, so a failed revert never empties the jar;
            # then let the original exception propagate.
            self._cookies = old_state
            raise
| OLD | NEW |