| OLD | NEW |
| 1 # This Source Code Form is subject to the terms of the Mozilla Public | 1 # This Source Code Form is subject to the terms of the Mozilla Public |
| 2 # License, v. 2.0. If a copy of the MPL was not distributed with this | 2 # License, v. 2.0. If a copy of the MPL was not distributed with this |
| 3 # file, You can obtain one at http://mozilla.org/MPL/2.0/. | 3 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 4 | 4 |
| 5 """Module to parse directory listings on a remote FTP server.""" | 5 """Module to parse directory listings on a remote FTP server.""" |
| 6 | 6 |
| 7 from HTMLParser import HTMLParser | 7 from HTMLParser import HTMLParser |
| 8 import re | 8 import re |
| 9 import requests |
| 9 import urllib | 10 import urllib |
| 10 | 11 |
| 11 | 12 |
| 12 class DirectoryParser(HTMLParser): | 13 class DirectoryParser(HTMLParser): |
| 13 """Class to parse directory listings""" | 14 """ |
| 15 Class to parse directory listings. |
| 14 | 16 |
| 15 def __init__(self, url): | 17 :param url: url of the directory on the web server. |
| 18 :param session: a requests Session instance used to fetch the directory |
| 19 content. If None, a new session will be created. |
| 20 :param authentication: a tuple (username, password) to authenticate against |
| 21 the web server, or None for no authentication. Note |
| 22 that it will only be used if the given *session* is |
| 23 None. |
| 24 :param timeout: timeout in seconds used when fetching the directory |
| 25 content. |
| 26 """ |
| 27 |
| 28 def __init__(self, url, session=None, authentication=None, timeout=None): |
| 29 if not session: |
| 30 session = requests.Session() |
| 31 session.auth = authentication |
| 32 self.session = session |
| 33 self.timeout = timeout |
| 34 |
| 35 self.active_url = None |
| 36 self.entries = [] |
| 37 |
| 16 HTMLParser.__init__(self) | 38 HTMLParser.__init__(self) |
| 17 | 39 |
| 18 self.entries = [ ] | 40 # Force the server to not send cached content |
| 19 self.active_url = None | 41 headers = {'Cache-Control': 'max-age=0'} |
| 42 r = self.session.get(url, headers=headers, timeout=self.timeout) |
| 20 | 43 |
| 21 req = urllib.urlopen(url) | 44 try: |
| 22 self.feed(req.read()) | 45 r.raise_for_status() |
| 46 self.feed(r.text) |
| 47 finally: |
| 48 r.close() |
| 23 | 49 |
| 24 def filter(self, regex): | 50 def filter(self, filter): |
| 25 pattern = re.compile(regex, re.IGNORECASE) | 51 """Filter entries by calling function or applying regex.""" |
| 26 return [entry for entry in self.entries if pattern.match(entry)] | 52 |
| 53 if hasattr(filter, '__call__'): |
| 54 return [entry for entry in self.entries if filter(entry)] |
| 55 else: |
| 56 pattern = re.compile(filter, re.IGNORECASE) |
| 57 return [entry for entry in self.entries if pattern.match(entry)] |
| 27 | 58 |
| 28 def handle_starttag(self, tag, attrs): | 59 def handle_starttag(self, tag, attrs): |
| 29 if not tag == 'a': | 60 if not tag == 'a': |
| 30 return | 61 return |
| 31 | 62 |
| 32 for attr in attrs: | 63 for attr in attrs: |
| 33 if attr[0] == 'href': | 64 if attr[0] == 'href': |
| 34 self.active_url = attr[1].strip('/') | 65 # Links look like: /pub/firefox/nightly/2015/ |
| 66 # We have to trim the fragment down to the last item. Also to ensure we |
| 67 # always get it, we remove a possible final slash first |
| 68 has_final_slash = attr[1][-1] == '/' |
| 69 self.active_url = attr[1].rstrip('/').split('/')[-1] |
| 70 |
| 71 # Add back slash in case of sub folders |
| 72 if has_final_slash: |
| 73 self.active_url = '%s/' % self.active_url |
| 74 |
| 35 return | 75 return |
| 36 | 76 |
| 37 def handle_endtag(self, tag): | 77 def handle_endtag(self, tag): |
| 38 if tag == 'a': | 78 if tag == 'a': |
| 39 self.active_url = None | 79 self.active_url = None |
| 40 | 80 |
| 41 def handle_data(self, data): | 81 def handle_data(self, data): |
| 42 # Only process the data when we are in an active a tag and have an URL | 82 # Only process the data when we are in an active a tag and have an URL |
| 43 if not self.active_url: | 83 if not self.active_url: |
| 44 return | 84 return |
| 45 | 85 |
| 46 name = urllib.quote(data.strip('/')) | 86 if self.active_url in (data, urllib.quote(data)): |
| 47 if self.active_url == name: | 87 self.entries.append(self.active_url.strip('/')) |
| 48 self.entries.append(self.active_url) | |
| OLD | NEW |