OLD | NEW |
(Empty) | |
| 1 """ robotparser.py |
| 2  |
| 3 Copyright (C) 2000 Bastian Kleineidam |
| 4  |
| 5 You can choose between two licenses when using this package: |
| 6 1) GNU GPLv2 |
| 7 2) PSF license for Python 2.2 |
| 8  |
| 9 The robots.txt Exclusion Protocol is implemented as specified in |
| 10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html |
| 11 """ |
| 12 from __future__ import absolute_import, division, unicode_literals |
| 13 from future.builtins import str |
| 14 |
| 15 # Was: import urllib.parse, urllib.request (urllib.error came in implicitly) |
| 16 from future.backports import urllib |
| 17 from future.backports.urllib import error as _error, parse as _parse, request as _request |
| 18 urllib.error = _error |
| 19 urllib.parse = _parse |
| 20 urllib.request = _request |
| 21 |
| 22 __all__ = ["RobotFileParser"] |
| 23 |
| 24 class RobotFileParser(object): |
| 25 """ This class provides a set of methods to read, parse and answer |
| 26 questions about a single robots.txt file. |
| 27 |
| 28 """ |
| 29 |
| 30 def __init__(self, url=''): |
| 31 self.entries = [] |
| 32 self.default_entry = None |
| 33 self.disallow_all = False |
| 34 self.allow_all = False |
| 35 self.set_url(url) |
| 36 self.last_checked = 0 |
| 37 |
| 38 def mtime(self): |
| 39 """Returns the time the robots.txt file was last fetched. |
| 40 |
| 41 This is useful for long-running web spiders that need to |
| 42 check for new robots.txt files periodically. |
| 43 |
| 44 """ |
| 45 return self.last_checked |
| 46 |
| 47 def modified(self): |
| 48 """Sets the time the robots.txt file was last fetched to the |
| 49 current time. |
| 50 |
| 51 """ |
| 52 import time |
| 53 self.last_checked = time.time() |
| 54 |
| 55 def set_url(self, url): |
| 56 """Sets the URL referring to a robots.txt file.""" |
| 57 self.url = url |
| 58 self.host, self.path = urllib.parse.urlparse(url)[1:3] |
| 59 |
| 60 def read(self): |
| 61 """Reads the robots.txt URL and feeds it to the parser.""" |
| 62 try: |
| 63 f = urllib.request.urlopen(self.url) |
| 64 except urllib.error.HTTPError as err: |
| 65 if err.code in (401, 403): |
| 66 self.disallow_all = True |
| 67 elif err.code >= 400: |
| 68 self.allow_all = True |
| 69 else: |
| 70 raw = f.read() |
| 71 self.parse(raw.decode("utf-8").splitlines()) |
| 72 |
| 73 def _add_entry(self, entry): |
| 74 if "*" in entry.useragents: |
| 75 # the default entry is considered last |
| 76 if self.default_entry is None: |
| 77 # the first default entry wins |
| 78 self.default_entry = entry |
| 79 else: |
| 80 self.entries.append(entry) |
| 81 |
| 82 def parse(self, lines): |
| 83 """Parse the input lines from a robots.txt file. |
| 84 |
| 85 A user-agent: line need not be preceded by one or more |
| 86 blank lines; entries may also follow each other directly. |
| 87 """ |
| 88 # states: |
| 89 # 0: start state |
| 90 # 1: saw user-agent line |
| 91 # 2: saw an allow or disallow line |
| 92 state = 0 |
| 93 entry = Entry() |
| 94 |
| 95 for line in lines: |
| 96 if not line: |
| 97 if state == 1: |
| 98 entry = Entry() |
| 99 state = 0 |
| 100 elif state == 2: |
| 101 self._add_entry(entry) |
| 102 entry = Entry() |
| 103 state = 0 |
| 104 # remove optional comment and strip line |
| 105 i = line.find('#') |
| 106 if i >= 0: |
| 107 line = line[:i] |
| 108 line = line.strip() |
| 109 if not line: |
| 110 continue |
| 111 line = line.split(':', 1) |
| 112 if len(line) == 2: |
| 113 line[0] = line[0].strip().lower() |
| 114 line[1] = urllib.parse.unquote(line[1].strip()) |
| 115 if line[0] == "user-agent": |
| 116 if state == 2: |
| 117 self._add_entry(entry) |
| 118 entry = Entry() |
| 119 entry.useragents.append(line[1]) |
| 120 state = 1 |
| 121 elif line[0] == "disallow": |
| 122 if state != 0: |
| 123 entry.rulelines.append(RuleLine(line[1], False)) |
| 124 state = 2 |
| 125 elif line[0] == "allow": |
| 126 if state != 0: |
| 127 entry.rulelines.append(RuleLine(line[1], True)) |
| 128 state = 2 |
| 129 if state == 2: |
| 130 self._add_entry(entry) |
| 131 |
| 132 |
| 133 def can_fetch(self, useragent, url): |
| 134 """using the parsed robots.txt decide if useragent can fetch url""" |
| 135 if self.disallow_all: |
| 136 return False |
| 137 if self.allow_all: |
| 138 return True |
| 139 # search for given user agent matches |
| 140 # the first match counts |
| 141 parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url)) |
| 142 url = urllib.parse.urlunparse(('', '', parsed_url.path, |
| 143 parsed_url.params, parsed_url.query, parsed_url.fragment)) |
| 144 url = urllib.parse.quote(url) |
| 145 if not url: |
| 146 url = "/" |
| 147 for entry in self.entries: |
| 148 if entry.applies_to(useragent): |
| 149 return entry.allowance(url) |
| 150 # try the default entry last |
| 151 if self.default_entry: |
| 152 return self.default_entry.allowance(url) |
| 153 # agent not found ==> access granted |
| 154 return True |
| 155 |
| 156 def __str__(self): |
| 157 return ''.join([str(entry) + "\n" for entry in self.entries]) |
| 158 |
| 159 |
| 160 class RuleLine(object): |
| 161 """A rule line is a single "Allow:" (allowance==True) or "Disallow:" |
| 162 (allowance==False) followed by a path.""" |
| 163 def __init__(self, path, allowance): |
| 164 if path == '' and not allowance: |
| 165 # an empty value means allow all |
| 166 allowance = True |
| 167 self.path = urllib.parse.quote(path) |
| 168 self.allowance = allowance |
| 169 |
| 170 def applies_to(self, filename): |
| 171 return self.path == "*" or filename.startswith(self.path) |
| 172 |
| 173 def __str__(self): |
| 174 return ("Allow" if self.allowance else "Disallow") + ": " + self.path |
| 175 |
| 176 |
| 177 class Entry(object): |
| 178 """An entry has one or more user-agents and zero or more rulelines""" |
| 179 def __init__(self): |
| 180 self.useragents = [] |
| 181 self.rulelines = [] |
| 182 |
| 183 def __str__(self): |
| 184 ret = [] |
| 185 for agent in self.useragents: |
| 186 ret.extend(["User-agent: ", agent, "\n"]) |
| 187 for line in self.rulelines: |
| 188 ret.extend([str(line), "\n"]) |
| 189 return ''.join(ret) |
| 190 |
| 191 def applies_to(self, useragent): |
| 192 """check if this entry applies to the specified agent""" |
| 193 # split the name token and make it lower case |
| 194 useragent = useragent.split("/")[0].lower() |
| 195 for agent in self.useragents: |
| 196 if agent == '*': |
| 197 # we have the catch-all agent |
| 198 return True |
| 199 agent = agent.lower() |
| 200 if agent in useragent: |
| 201 return True |
| 202 return False |
| 203 |
| 204 def allowance(self, filename): |
| 205 """Preconditions: |
| 206 - our agent applies to this entry |
| 207 - filename is URL-quoted (can_fetch quotes it before calling)""" |
| 208 for line in self.rulelines: |
| 209 if line.applies_to(filename): |
| 210 return line.allowance |
| 211 return True |
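
Usage sketch (not part of the file above): the parser can be exercised entirely offline by handing pre-split lines to parse(), which is useful for testing the state machine without a network fetch. This assumes the module lands at future.backports.urllib.robotparser, as the package paths in the imports suggest; the rules and bot names are made up for illustration.

    from future.backports.urllib.robotparser import RobotFileParser

    # Hypothetical rules; parse() takes an iterable of lines.
    ROBOTS_LINES = [
        "User-agent: FooBot",
        "Disallow: /private/",
        "",
        "User-agent: *",
        "Disallow: /tmp/",
    ]

    rp = RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")
    rp.parse(ROBOTS_LINES)

    # "FooBot/1.0" matches the FooBot entry: applies_to() splits off the
    # "/1.0" version token before comparing.
    print(rp.can_fetch("FooBot/1.0", "http://www.example.com/private/x"))   # False
    print(rp.can_fetch("FooBot/1.0", "http://www.example.com/index.html"))  # True
    # Any other agent falls through to the default ("*") entry.
    print(rp.can_fetch("BarBot", "http://www.example.com/tmp/x"))           # False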
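
The mtime()/modified() pair exists for the refresh pattern its docstring hints at: a long-running spider re-fetching robots.txt periodically. A minimal sketch under the same module-path assumption; MAX_AGE, the URL, and "ExampleBot" are arbitrary illustrative values, and note that read() performs a live fetch but does not stamp the time itself.

    import time
    from future.backports.urllib.robotparser import RobotFileParser

    MAX_AGE = 3600  # illustrative: treat a copy older than an hour as stale

    rp = RobotFileParser("http://www.example.com/robots.txt")
    rp.read()      # fetches self.url over the network
    rp.modified()  # record the fetch time; read() does not call this itself

    def allowed(url, useragent="ExampleBot"):
        # Re-fetch robots.txt once the cached copy is older than MAX_AGE.
        if time.time() - rp.mtime() > MAX_AGE:
            rp.read()
            rp.modified()
        return rp.can_fetch(useragent, url)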