OLD | NEW |
(Empty) | |
| 1 """ robotparser.py |
| 2  |
| 3 Copyright (C) 2000 Bastian Kleineidam |
| 4  |
| 5 You can choose between two licenses when using this package: |
| 6 1) GNU GPLv2 |
| 7 2) PSF license for Python 2.2 |
| 8  |
| 9 The robots.txt Exclusion Protocol is implemented as specified in |
| 10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html |
| 11 """ |
| 12 from __future__ import absolute_import, division, unicode_literals |
| 13 from future.builtins import str |
| 14 |
| 15 # Was: import urllib.parse, urllib.request (urllib.error came in implicitly) |
| 16 from future.backports import urllib |
| 17 from future.backports.urllib import error as _error, parse as _parse, request as _request |
| 18 urllib.error = _error |
| 19 urllib.parse = _parse |
| 20 urllib.request = _request |
| 21 |
| 22 __all__ = ["RobotFileParser"] |
| 23 |
| 24 class RobotFileParser(object): |
| 25 """ This class provides a set of methods to read, parse and answer |
| 26 questions about a single robots.txt file. |
| 27 |
| 28 """ |
| 29 |
| 30 def __init__(self, url=''): |
| 31 self.entries = [] |
| 32 self.default_entry = None |
| 33 self.disallow_all = False |
| 34 self.allow_all = False |
| 35 self.set_url(url) |
| 36 self.last_checked = 0 |
| 37 |
| 38 def mtime(self): |
| 39 """Returns the time the robots.txt file was last fetched. |
| 40 |
| 41 This is useful for long-running web spiders that need to |
| 42 check for new robots.txt files periodically. |
| 43 |
| 44 """ |
| 45 return self.last_checked |
| 46 |
| 47 def modified(self): |
| 48 """Sets the time the robots.txt file was last fetched to the |
| 49 current time. |
| 50 |
| 51 """ |
| 52 import time |
| 53 self.last_checked = time.time() |
| 54 |
| 55 def set_url(self, url): |
| 56 """Sets the URL referring to a robots.txt file.""" |
| 57 self.url = url |
| 58 self.host, self.path = urllib.parse.urlparse(url)[1:3] |
| 59 |
| 60 def read(self): |
| 61 """Reads the robots.txt URL and feeds it to the parser.""" |
| 62 try: |
| 63 f = urllib.request.urlopen(self.url) |
| 64 except urllib.error.HTTPError as err: |
| 65 if err.code in (401, 403): |
| 66 self.disallow_all = True |
| 67 elif err.code >= 400: |
| 68 self.allow_all = True |
| 69 else: |
| 70 raw = f.read() |
| 71 self.parse(raw.decode("utf-8").splitlines()) |
| 72 |
| 73 def _add_entry(self, entry): |
| 74 if "*" in entry.useragents: |
| 75 # the default entry is considered last |
| 76 if self.default_entry is None: |
| 77 # the first default entry wins |
| 78 self.default_entry = entry |
| 79 else: |
| 80 self.entries.append(entry) |
| 81 |
| 82 def parse(self, lines): |
| 83 """Parse the input lines from a robots.txt file. |
| 84 |
| 85 A user-agent: line need not be preceded by one or more |
| 86 blank lines; entries may also follow each other directly. |
| 87 """ |
| 88 # states: |
| 89 # 0: start state |
| 90 # 1: saw user-agent line |
| 91 # 2: saw an allow or disallow line |
| 92 state = 0 |
| 93 entry = Entry() |
| 94 |
| 95 for line in lines: |
| 96 if not line: |
| 97 if state == 1: |
| 98 entry = Entry() |
| 99 state = 0 |
| 100 elif state == 2: |
| 101 self._add_entry(entry) |
| 102 entry = Entry() |
| 103 state = 0 |
| 104 # remove optional comment and strip line |
| 105 i = line.find('#') |
| 106 if i >= 0: |
| 107 line = line[:i] |
| 108 line = line.strip() |
| 109 if not line: |
| 110 continue |
| 111 line = line.split(':', 1) |
| 112 if len(line) == 2: |
| 113 line[0] = line[0].strip().lower() |
| 114 line[1] = urllib.parse.unquote(line[1].strip()) |
| 115 if line[0] == "user-agent": |
| 116 if state == 2: |
| 117 self._add_entry(entry) |
| 118 entry = Entry() |
| 119 entry.useragents.append(line[1]) |
| 120 state = 1 |
| 121 elif line[0] == "disallow": |
| 122 if state != 0: |
| 123 entry.rulelines.append(RuleLine(line[1], False)) |
| 124 state = 2 |
| 125 elif line[0] == "allow": |
| 126 if state != 0: |
| 127 entry.rulelines.append(RuleLine(line[1], True)) |
| 128 state = 2 |
| 129 if state == 2: |
| 130 self._add_entry(entry) |
| 131 |
| 132 |
| 133 def can_fetch(self, useragent, url): |
| 134 """using the parsed robots.txt decide if useragent can fetch url""" |
| 135 if self.disallow_all: |
| 136 return False |
| 137 if self.allow_all: |
| 138 return True |
| 139 # search for given user agent matches |
| 140 # the first match counts |
| 141 parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url)) |
| 142 url = urllib.parse.urlunparse(('', '', parsed_url.path, |
| 143 parsed_url.params, parsed_url.query, parsed_url.fragment)) |
| 144 url = urllib.parse.quote(url) |
| 145 if not url: |
| 146 url = "/" |
| 147 for entry in self.entries: |
| 148 if entry.applies_to(useragent): |
| 149 return entry.allowance(url) |
| 150 # try the default entry last |
| 151 if self.default_entry: |
| 152 return self.default_entry.allowance(url) |
| 153 # agent not found ==> access granted |
| 154 return True |
| 155 |
| 156 def __str__(self): |
| 157 return ''.join([str(entry) + "\n" for entry in self.entries]) |
| 158 |
| 159 |
| 160 class RuleLine(object): |
| 161 """A rule line is a single "Allow:" (allowance==True) or "Disallow:" |
| 162 (allowance==False) followed by a path.""" |
| 163 def __init__(self, path, allowance): |
| 164 if path == '' and not allowance: |
| 165 # an empty value means allow all |
| 166 allowance = True |
| 167 self.path = urllib.parse.quote(path) |
| 168 self.allowance = allowance |
| 169 |
| 170 def applies_to(self, filename): |
| 171 return self.path == "*" or filename.startswith(self.path) |
| 172 |
| 173 def __str__(self): |
| 174 return ("Allow" if self.allowance else "Disallow") + ": " + self.path |
| 175 |
| 176 |
| 177 class Entry(object): |
| 178 """An entry has one or more user-agents and zero or more rulelines""" |
| 179 def __init__(self): |
| 180 self.useragents = [] |
| 181 self.rulelines = [] |
| 182 |
| 183 def __str__(self): |
| 184 ret = [] |
| 185 for agent in self.useragents: |
| 186 ret.extend(["User-agent: ", agent, "\n"]) |
| 187 for line in self.rulelines: |
| 188 ret.extend([str(line), "\n"]) |
| 189 return ''.join(ret) |
| 190 |
| 191 def applies_to(self, useragent): |
| 192 """check if this entry applies to the specified agent""" |
| 193 # split the name token and make it lower case |
| 194 useragent = useragent.split("/")[0].lower() |
| 195 for agent in self.useragents: |
| 196 if agent == '*': |
| 197 # we have the catch-all agent |
| 198 return True |
| 199 agent = agent.lower() |
| 200 if agent in useragent: |
| 201 return True |
| 202 return False |
| 203 |
| 204 def allowance(self, filename): |
| 205 """Preconditions: |
| 206 - our agent applies to this entry |
| 207 - filename is URL-quoted (can_fetch quotes it before calling)""" |
| 208 for line in self.rulelines: |
| 209 if line.applies_to(filename): |
| 210 return line.allowance |
| 211 return True |
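
Usage sketch (not part of the file above): the parser can be exercised entirely offline by handing pre-split lines to parse(), which is useful for testing the state machine without a network fetch. This assumes the module lands at future.backports.urllib.robotparser, as the package paths in the imports suggest; the rules and bot names are made up for illustration.

    from future.backports.urllib.robotparser import RobotFileParser

    # Hypothetical rules; parse() takes an iterable of lines.
    ROBOTS_LINES = [
        "User-agent: FooBot",
        "Disallow: /private/",
        "",
        "User-agent: *",
        "Disallow: /tmp/",
    ]

    rp = RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")
    rp.parse(ROBOTS_LINES)

    # "FooBot/1.0" matches the FooBot entry: applies_to() splits off the
    # "/1.0" version token before comparing.
    print(rp.can_fetch("FooBot/1.0", "http://www.example.com/private/x"))   # False
    print(rp.can_fetch("FooBot/1.0", "http://www.example.com/index.html"))  # True
    # Any other agent falls through to the default ("*") entry.
    print(rp.can_fetch("BarBot", "http://www.example.com/tmp/x"))           # False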
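
The mtime()/modified() pair exists for the refresh pattern its docstring hints at: a long-running spider re-fetching robots.txt periodically. A minimal sketch under the same module-path assumption; MAX_AGE, the URL, and "ExampleBot" are arbitrary illustrative values, and note that read() performs a live fetch but does not stamp the time itself.

    import time
    from future.backports.urllib.robotparser import RobotFileParser

    MAX_AGE = 3600  # illustrative: treat a copy older than an hour as stale

    rp = RobotFileParser("http://www.example.com/robots.txt")
    rp.read()      # fetches self.url over the network
    rp.modified()  # record the fetch time; read() does not call this itself

    def allowed(url, useragent="ExampleBot"):
        # Re-fetch robots.txt once the cached copy is older than MAX_AGE.
        if time.time() - rp.mtime() > MAX_AGE:
            rp.read()
            rp.modified()
        return rp.can_fetch(useragent, url)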