| Index: mozdownload/parser.py
| diff --git a/mozdownload/parser.py b/mozdownload/parser.py
| index 98390a59a582c7acd5b772a6a7130c8f248f9413..85902862091826067e2f0e3c092eb3ce8fdab083 100644
| --- a/mozdownload/parser.py
| +++ b/mozdownload/parser.py
| @@ -6,24 +6,55 @@
|
|  from HTMLParser import HTMLParser
|  import re
| +import requests
|  import urllib
|
|
|  class DirectoryParser(HTMLParser):
| -    """Class to parse directory listings"""
| +    """
| +    Class to parse directory listings.
|
| -    def __init__(self, url):
| -        HTMLParser.__init__(self)
| +    :param url: URL of the directory on the web server.
| +    :param session: a requests Session instance used to fetch the directory
| +                    content. If None, a new session will be created.
| +    :param authentication: a tuple (username, password) to authenticate against
| +                           the web server, or None for no authentication. Note
| +                           that it will only be used if the given *session* is
| +                           None.
| +    :param timeout: timeout in seconds used when fetching the directory
| +                    content.
| +    """
| +
| +    def __init__(self, url, session=None, authentication=None, timeout=None):
| +        if not session:
| +            session = requests.Session()
| +            session.auth = authentication
| +        self.session = session
| +        self.timeout = timeout
|
| -        self.entries = [ ]
|          self.active_url = None
| +        self.entries = []
| +
| +        HTMLParser.__init__(self)
|
| -        req = urllib.urlopen(url)
| -        self.feed(req.read())
| +        # Force the server not to send cached content
| +        headers = {'Cache-Control': 'max-age=0'}
| +        r = self.session.get(url, headers=headers, timeout=self.timeout)
|
| -    def filter(self, regex):
| -        pattern = re.compile(regex, re.IGNORECASE)
| -        return [entry for entry in self.entries if pattern.match(entry)]
| +        try:
| +            r.raise_for_status()
| +            self.feed(r.text)
| +        finally:
| +            r.close()
| +
| +    def filter(self, filter):
| +        """Filter entries by calling a function or applying a regex."""
| +
| +        if hasattr(filter, '__call__'):
| +            return [entry for entry in self.entries if filter(entry)]
| +        else:
| +            pattern = re.compile(filter, re.IGNORECASE)
| +            return [entry for entry in self.entries if pattern.match(entry)]
|
|      def handle_starttag(self, tag, attrs):
|          if not tag == 'a':
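
As a usage sketch (not part of the patch), the reworked constructor and the
new filter() could be driven like this; the URLs, credentials, and patterns
below are illustrative placeholders:

    from mozdownload.parser import DirectoryParser

    # Anonymous fetch; a fresh requests.Session is created internally.
    parser = DirectoryParser('https://archive.mozilla.org/pub/firefox/nightly/',
                             timeout=30.0)

    # filter() still accepts a regex, matched case-insensitively ...
    years = parser.filter(r'^\d{4}$')

    # ... and now also accepts a callable predicate.
    recent = parser.filter(lambda entry: entry >= '2015')

    # authentication is only attached when no session is passed in.
    private = DirectoryParser('https://example.com/private/',
                              authentication=('user', 'secret'))
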
| @@ -31,7 +62,16 @@ class DirectoryParser(HTMLParser):
|
|          for attr in attrs:
|              if attr[0] == 'href':
| -                self.active_url = attr[1].strip('/')
| +                # Links look like: /pub/firefox/nightly/2015/
| +                # Trim the path down to its last segment. To make sure we
| +                # always get it, strip a possible trailing slash first
| +                has_final_slash = attr[1][-1] == '/'
| +                self.active_url = attr[1].rstrip('/').split('/')[-1]
| +
| +                # Add the slash back in case of sub folders
| +                if has_final_slash:
| +                    self.active_url = '%s/' % self.active_url
| +
|                  return
|
|      def handle_endtag(self, tag):
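
The trimming above keeps only the last path segment of an href while
preserving a trailing slash that marks a sub folder. A standalone sketch of
that logic, using a hypothetical trim_href() helper and illustrative inputs:

    def trim_href(href):
        # Same steps as handle_starttag() above: remember whether the link
        # ends in a slash, reduce to the last path segment, then restore
        # the slash for sub folders.
        has_final_slash = href[-1] == '/'
        trimmed = href.rstrip('/').split('/')[-1]
        return '%s/' % trimmed if has_final_slash else trimmed

    assert trim_href('/pub/firefox/nightly/2015/') == '2015/'
    assert trim_href('/pub/firefox/nightly/build.txt') == 'build.txt'
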
| @@ -43,6 +83,5 @@ class DirectoryParser(HTMLParser):
|          if not self.active_url:
|              return
|
| -        name = urllib.quote(data.strip('/'))
| -        if self.active_url == name:
| -            self.entries.append(self.active_url)
| +        if self.active_url in (data, urllib.quote(data)):
| +            self.entries.append(self.active_url.strip('/'))
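
The rewritten handle_data() accepts the link text either verbatim or in its
percent-encoded form, since a server may display the decoded name while the
href stays quoted. A small sketch of that comparison, with a hypothetical
matches() helper and made-up entry names:

    import urllib

    def matches(active_url, data):
        # Mirrors the new check: the href-derived name may equal the raw
        # link text or its percent-encoded equivalent.
        return active_url in (data, urllib.quote(data))

    assert matches('Firefox%20Setup.exe', 'Firefox Setup.exe')
    assert matches('2015/', '2015/')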