Chromium Code Reviews

Unified Diff: mozdownload/parser.py

Issue 1451373002: Updating mozdownload (excluding tests) (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/mozdownload@master
Patch Set: Updated README.md (created 5 years, 1 month ago)
Index: mozdownload/parser.py
diff --git a/mozdownload/parser.py b/mozdownload/parser.py
index 98390a59a582c7acd5b772a6a7130c8f248f9413..85902862091826067e2f0e3c092eb3ce8fdab083 100644
--- a/mozdownload/parser.py
+++ b/mozdownload/parser.py
@@ -6,24 +6,55 @@
 from HTMLParser import HTMLParser
 import re
+import requests
 import urllib

 class DirectoryParser(HTMLParser):
-    """Class to parse directory listings"""
+    """
+    Class to parse directory listings.
-    def __init__(self, url):
-        HTMLParser.__init__(self)
+    :param url: url of the directory on the web server.
+    :param session: a requests Session instance used to fetch the directory
+                    content. If None, a new session will be created.
+    :param authentication: a tuple (username, password) to authenticate against
+                           the web server, or None for no authentication. Note
+                           that it will only be used if the given *session* is
+                           None.
+    :param timeout: timeout in seconds used when fetching the directory
+                    content.
+    """
+
+    def __init__(self, url, session=None, authentication=None, timeout=None):
+        if not session:
+            session = requests.Session()
+            session.auth = authentication
+        self.session = session
+        self.timeout = timeout
-        self.entries = [ ]
         self.active_url = None
+        self.entries = []
+
+        HTMLParser.__init__(self)
-        req = urllib.urlopen(url)
-        self.feed(req.read())
+        # Force the server to not send cached content
+        headers = {'Cache-Control': 'max-age=0'}
+        r = self.session.get(url, headers=headers, timeout=self.timeout)
-    def filter(self, regex):
-        pattern = re.compile(regex, re.IGNORECASE)
-        return [entry for entry in self.entries if pattern.match(entry)]
+        try:
+            r.raise_for_status()
+            self.feed(r.text)
+        finally:
+            r.close()
+
+    def filter(self, filter):
+        """Filter entries by calling function or applying regex."""
+
+        if hasattr(filter, '__call__'):
+            return [entry for entry in self.entries if filter(entry)]
+        else:
+            pattern = re.compile(filter, re.IGNORECASE)
+            return [entry for entry in self.entries if pattern.match(entry)]

     def handle_starttag(self, tag, attrs):
         if not tag == 'a':
@@ -31,7 +62,16 @@ class DirectoryParser(HTMLParser):
         for attr in attrs:
             if attr[0] == 'href':
-                self.active_url = attr[1].strip('/')
+                # Links look like: /pub/firefox/nightly/2015/
+                # We have to trim the path down to its last component. To make
+                # sure we always get that component, we strip a possible final
+                # slash first.
+                has_final_slash = attr[1][-1] == '/'
+                self.active_url = attr[1].rstrip('/').split('/')[-1]
+
+                # Add back slash in case of sub folders
+                if has_final_slash:
+                    self.active_url = '%s/' % self.active_url
+
                 return

     def handle_endtag(self, tag):
@@ -43,6 +83,5 @@
         if not self.active_url:
             return

-        name = urllib.quote(data.strip('/'))
-        if self.active_url == name:
-            self.entries.append(self.active_url)
+        if self.active_url in (data, urllib.quote(data)):
+            self.entries.append(self.active_url.strip('/'))
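
A minimal usage sketch of the constructor as reworked in this patch, assuming
the module layout shown above; the directory URLs and credentials are
illustrative, not taken from the change:

    import requests

    from mozdownload.parser import DirectoryParser

    # Simplest form: the parser creates its own requests.Session and fetches
    # the directory listing right away in __init__.
    parser = DirectoryParser('https://archive.mozilla.org/pub/firefox/nightly/',
                             timeout=30.0)

    # With HTTP authentication. The (username, password) tuple is only
    # applied when the parser has to create the session itself.
    parser = DirectoryParser('https://example.com/builds/',
                             authentication=('user', 'secret'),
                             timeout=30.0)

    # Reusing a caller-provided session, e.g. to share connection pooling;
    # in that case the authentication argument is ignored.
    session = requests.Session()
    parser = DirectoryParser('https://example.com/builds/', session=session)

    print(parser.entries)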
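The reworked filter() accepts either a regex string or any callable; a short
sketch, with made-up entry names:

    parser.entries = ['2015', '2016', 'firefox-43.0.en-US.linux-x86_64.tar.bz2']

    # String argument: compiled as a case-insensitive regex and applied to
    # each entry with match().
    year_dirs = parser.filter(r'\d{4}$')
    # -> ['2015', '2016']

    # Callable argument: entries for which the function returns a truthy
    # value are kept.
    builds = parser.filter(lambda entry: entry.startswith('firefox'))
    # -> ['firefox-43.0.en-US.linux-x86_64.tar.bz2']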

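The href handling in handle_starttag is easier to follow outside the diff.
trim_href below is a hypothetical standalone helper that mirrors the inline
logic; the sample links match the format mentioned in the code comment:

    def trim_href(href):
        # Keep only the last path component, preserving a trailing slash so
        # that subdirectories stay distinguishable from files.
        has_final_slash = href[-1] == '/'
        name = href.rstrip('/').split('/')[-1]
        return '%s/' % name if has_final_slash else name

    trim_href('/pub/firefox/nightly/2015/')   # -> '2015/'
    trim_href('/pub/firefox/nightly/README')  # -> 'README'
    trim_href('2015/')                        # -> '2015/' (already relative)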