Index: mozdownload/parser.py |
diff --git a/mozdownload/parser.py b/mozdownload/parser.py |
index 98390a59a582c7acd5b772a6a7130c8f248f9413..85902862091826067e2f0e3c092eb3ce8fdab083 100644 |
--- a/mozdownload/parser.py |
+++ b/mozdownload/parser.py |
@@ -6,24 +6,55 @@ |
from HTMLParser import HTMLParser |
import re |
+import requests |
import urllib |
class DirectoryParser(HTMLParser): |
- """Class to parse directory listings""" |
+ """ |
+ Class to parse directory listings. |
- def __init__(self, url): |
- HTMLParser.__init__(self) |
+ :param url: url of the directory on the web server. |
+ :param session: a requests Session instance used to fetch the directory |
+ content. If None, a new session will be created. |
+ :param authentication: a tuple (username, password) to authenticate against |
+ the web server, or None for no authentication. Note |
+ that it will only be used if the given *session* is |
+ None. |
+ :param timeout: timeout in seconds used when fetching the directory |
+ content. |
+ """ |
+ |
def __init__(self, url, session=None, authentication=None, timeout=None):
    """Fetch *url* and parse the directory listing it returns.

    :param url: url of the directory on the web server.
    :param session: a requests Session instance used to fetch the directory
                    content. If None, a new session will be created.
    :param authentication: a tuple (username, password) to authenticate
                           against the web server, or None. Only applied
                           when a new session is created here.
    :param timeout: timeout in seconds used when fetching the directory
                    content.
    """
    if session is None:
        # Only a session we create ourselves gets the credentials;
        # a caller-supplied session is used exactly as given.
        session = requests.Session()
        session.auth = authentication
    self.session = session
    self.timeout = timeout

    self.active_url = None
    self.entries = []

    HTMLParser.__init__(self)

    # Force the server to not send cached content
    headers = {'Cache-Control': 'max-age=0'}
    r = self.session.get(url, headers=headers, timeout=self.timeout)

    try:
        # Abort on HTTP error status before feeding the body to the parser
        r.raise_for_status()
        self.feed(r.text)
    finally:
        # Always release the connection, even if parsing raises
        r.close()
+ |
def filter(self, filter):
    """Return the entries accepted by *filter*.

    *filter* is either a callable (an entry is kept when the call
    returns a truthy value) or a regular expression string that is
    matched case-insensitively against each entry.
    """
    if not hasattr(filter, '__call__'):
        regex = re.compile(filter, re.IGNORECASE)
        return [item for item in self.entries if regex.match(item)]
    return [item for item in self.entries if filter(item)]
def handle_starttag(self, tag, attrs):
    """HTMLParser callback: remember the last path segment of an
    anchor's href as the candidate entry.

    :param tag: name of the tag that was opened.
    :param attrs: list of (name, value) attribute tuples.
    """
    if not tag == 'a':
        return

    for name, value in attrs:
        if name == 'href':
            # Guard against href="" or a bare valueless attribute;
            # the original value[-1] would raise on those.
            if not value:
                self.active_url = ''
                return

            # Links look like: /pub/firefox/nightly/2015/
            # We have to trim the fragment down to the last item. Also to
            # ensure we always get it, we remove a possible final slash first
            has_final_slash = value.endswith('/')
            self.active_url = value.rstrip('/').split('/')[-1]

            # Add back slash in case of sub folders
            if has_final_slash:
                self.active_url = '%s/' % self.active_url

            return
def handle_endtag(self, tag): |
@@ -43,6 +83,5 @@ class DirectoryParser(HTMLParser): |
if not self.active_url: |
return |
- name = urllib.quote(data.strip('/')) |
- if self.active_url == name: |
- self.entries.append(self.active_url) |
+ if self.active_url in (data, urllib.quote(data)): |
+ self.entries.append(self.active_url.strip('/')) |