Index: mozdownload/parser.py |
diff --git a/mozdownload/parser.py b/mozdownload/parser.py |
index 98390a59a582c7acd5b772a6a7130c8f248f9413..85902862091826067e2f0e3c092eb3ce8fdab083 100644 |
--- a/mozdownload/parser.py |
+++ b/mozdownload/parser.py |
@@ -6,24 +6,55 @@ |
from HTMLParser import HTMLParser |
import re |
+import requests |
import urllib |
class DirectoryParser(HTMLParser): |
- """Class to parse directory listings""" |
+ """ |
+ Class to parse directory listings. |
- def __init__(self, url): |
- HTMLParser.__init__(self) |
+ :param url: url of the directory on the web server. |
+ :param session: a requests Session instance used to fetch the directory |
+ content. If None, a new session will be created. |
+ :param authentication: a tuple (username, password) to authenticate against |
+ the web server, or None for no authentication. Note |
+ that it will only be used if the given *session* is |
+ None. |
+ :param timeout: timeout in seconds used when fetching the directory |
+ content. |
+ """ |
+ |
def __init__(self, url, session=None, authentication=None, timeout=None):
    """Fetch *url* and parse the directory listing it returns.

    :param url: url of the directory on the web server.
    :param session: a requests Session instance used to fetch the directory
                    content. If None, a new session will be created.
    :param authentication: a tuple (username, password) to authenticate
                           against the web server, or None. Only applied
                           when a new session is created here.
    :param timeout: timeout in seconds used when fetching the directory
                    content.
    """
    if session is None:
        # Only a session we create ourselves gets the credentials;
        # a caller-supplied session is used exactly as given.
        session = requests.Session()
        session.auth = authentication
    self.session = session
    self.timeout = timeout

    self.active_url = None
    self.entries = []

    HTMLParser.__init__(self)

    # Force the server to not send cached content
    headers = {'Cache-Control': 'max-age=0'}
    r = self.session.get(url, headers=headers, timeout=self.timeout)

    try:
        # Abort on HTTP error status before feeding the body to the parser
        r.raise_for_status()
        self.feed(r.text)
    finally:
        # Always release the connection, even if parsing raises
        r.close()
+ |
def filter(self, filter):
    """Return the entries accepted by *filter*.

    *filter* is either a callable (an entry is kept when the call
    returns a truthy value) or a regular expression string that is
    matched case-insensitively against each entry.
    """
    if not hasattr(filter, '__call__'):
        regex = re.compile(filter, re.IGNORECASE)
        return [item for item in self.entries if regex.match(item)]
    return [item for item in self.entries if filter(item)]
def handle_starttag(self, tag, attrs):
    """HTMLParser callback: remember the last path segment of an
    anchor's href as the candidate entry.

    :param tag: name of the tag that was opened.
    :param attrs: list of (name, value) attribute tuples.
    """
    if not tag == 'a':
        return

    for name, value in attrs:
        if name == 'href':
            # Guard against href="" or a bare valueless attribute;
            # the original value[-1] would raise on those.
            if not value:
                self.active_url = ''
                return

            # Links look like: /pub/firefox/nightly/2015/
            # We have to trim the fragment down to the last item. Also to
            # ensure we always get it, we remove a possible final slash first
            has_final_slash = value.endswith('/')
            self.active_url = value.rstrip('/').split('/')[-1]

            # Add back slash in case of sub folders
            if has_final_slash:
                self.active_url = '%s/' % self.active_url

            return
def handle_endtag(self, tag): |
@@ -43,6 +83,5 @@ class DirectoryParser(HTMLParser): |
if not self.active_url: |
return |
- name = urllib.quote(data.strip('/')) |
- if self.active_url == name: |
- self.entries.append(self.active_url) |
+ if self.active_url in (data, urllib.quote(data)): |
+ self.entries.append(self.active_url.strip('/')) |