OLD | NEW |
1 # This Source Code Form is subject to the terms of the Mozilla Public | 1 # This Source Code Form is subject to the terms of the Mozilla Public |
2 # License, v. 2.0. If a copy of the MPL was not distributed with this | 2 # License, v. 2.0. If a copy of the MPL was not distributed with this |
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/. | 3 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4 | 4 |
5 """Module to parse directory listings on a remote FTP server.""" | 5 """Module to parse directory listings on a remote FTP server.""" |
6 | 6 |
7 from HTMLParser import HTMLParser | 7 from HTMLParser import HTMLParser |
8 import re | 8 import re |
| 9 import requests |
9 import urllib | 10 import urllib |
10 | 11 |
11 | 12 |
class DirectoryParser(HTMLParser):
    """
    Class to parse directory listings.

    The listing at *url* is fetched and parsed while the instance is being
    constructed; the names of the entries found are collected in
    ``self.entries`` (without leading or trailing slashes).

    :param url: url of the directory on the web server.
    :param session: a requests Session instance used to fetch the directory
                    content. If None, a new session will be created.
    :param authentication: a tuple (username, password) to authenticate against
                           the web server, or None for no authentication. Note
                           that it will only be used if the given *session* is
                           None.
    :param timeout: timeout in seconds used when fetching the directory
                    content.
    """

    def __init__(self, url, session=None, authentication=None, timeout=None):
        if not session:
            session = requests.Session()
            session.auth = authentication
        self.session = session
        self.timeout = timeout

        # Parser state has to exist before any content is fed to the parser.
        self.active_url = None
        self.entries = []

        HTMLParser.__init__(self)

        # Force the server to not send cached content
        headers = {'Cache-Control': 'max-age=0'}
        r = self.session.get(url, headers=headers, timeout=self.timeout)

        # Always release the response, even if the status check or the
        # parsing raises.
        try:
            r.raise_for_status()
            self.feed(r.text)
        finally:
            r.close()

    def filter(self, filter):
        """Filter entries by calling function or applying regex.

        :param filter: either a callable which receives an entry name and
                       returns a true value for entries to keep, or a regular
                       expression which is matched case-insensitively against
                       the beginning of each entry name.
        :returns: list of the matching entries, in their original order.
        """
        if callable(filter):
            return [entry for entry in self.entries if filter(entry)]

        pattern = re.compile(filter, re.IGNORECASE)
        return [entry for entry in self.entries if pattern.match(entry)]

    def handle_starttag(self, tag, attrs):
        """Remember the last path segment of the href of an opening a tag."""
        if not tag == 'a':
            return

        for name, value in attrs:
            if name != 'href':
                continue

            # An empty href (href="") or one without a value (value is None)
            # cannot point to an entry. Skip the anchor instead of failing
            # while inspecting the last character.
            if not value:
                self.active_url = None
                return

            # Links look like: /pub/firefox/nightly/2015/
            # We have to trim the fragment down to the last item. Also to
            # ensure we always get it, we remove a possible final slash first
            has_final_slash = value.endswith('/')
            self.active_url = value.rstrip('/').split('/')[-1]

            # Add back slash in case of sub folders
            if has_final_slash:
                self.active_url = '%s/' % self.active_url

            return

    def handle_endtag(self, tag):
        """Forget the active url once the a tag gets closed."""
        if tag == 'a':
            self.active_url = None

    def handle_data(self, data):
        """Record the active url if the link text names the same entry."""
        # Only process the data when we are in an active a tag and have an URL
        if not self.active_url:
            return

        # The link text has to be identical to the last href segment, either
        # verbatim or in its url-quoted form, to count as a directory entry.
        if self.active_url in (data, urllib.quote(data)):
            self.entries.append(self.active_url.strip('/'))
OLD | NEW |