Chromium Code Reviews

Unified Diff: mozdownload/parser.py

Issue 1451373002: Updating mozdownload (excluding tests) (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/mozdownload@master
Patch Set: Created 5 years, 1 month ago
--- a/mozdownload/parser.py
+++ b/mozdownload/parser.py
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

 """Module to parse directory listings on a remote FTP server."""

 from HTMLParser import HTMLParser
 import re
+import requests
 import urllib


 class DirectoryParser(HTMLParser):
-    """Class to parse directory listings"""
+    """
+    Class to parse directory listings.

-    def __init__(self, url):
+    :param url: url of the directory on the web server.
+    :param session: a requests Session instance used to fetch the directory
+                    content. If None, a new session will be created.
+    :param authentication: a tuple (username, password) to authenticate against
+                           the web server, or None for no authentication. Note
+                           that it will only be used if the given *session* is
+                           None.
+    :param timeout: timeout in seconds used when fetching the directory
+                    content.
+    """
+
+    def __init__(self, url, session=None, authentication=None, timeout=None):
+        if not session:
+            session = requests.Session()
+            session.auth = authentication
+        self.session = session
+        self.timeout = timeout
+
+        self.active_url = None
+        self.entries = []
+
         HTMLParser.__init__(self)

-        self.entries = [ ]
-        self.active_url = None
+        # Force the server to not send cached content
+        headers = {'Cache-Control': 'max-age=0'}
+        r = self.session.get(url, headers=headers, timeout=self.timeout)

-        req = urllib.urlopen(url)
-        self.feed(req.read())
+        try:
+            r.raise_for_status()
+            self.feed(r.text)
+        finally:
+            r.close()

-    def filter(self, regex):
-        pattern = re.compile(regex, re.IGNORECASE)
-        return [entry for entry in self.entries if pattern.match(entry)]
+    def filter(self, filter):
+        """Filter entries by calling function or applying regex."""
+
+        if hasattr(filter, '__call__'):
+            return [entry for entry in self.entries if filter(entry)]
+        else:
+            pattern = re.compile(filter, re.IGNORECASE)
+            return [entry for entry in self.entries if pattern.match(entry)]

     def handle_starttag(self, tag, attrs):
         if not tag == 'a':
             return

         for attr in attrs:
             if attr[0] == 'href':
-                self.active_url = attr[1].strip('/')
+                # Links look like: /pub/firefox/nightly/2015/
+                # We have to trim the fragment down to the last item. Also to ensure
+                # we always get it, we remove a possible final slash first
+                has_final_slash = attr[1][-1] == '/'
+                self.active_url = attr[1].rstrip('/').split('/')[-1]
+
+                # Add back slash in case of sub folders
+                if has_final_slash:
+                    self.active_url = '%s/' % self.active_url
+
                 return

     def handle_endtag(self, tag):
         if tag == 'a':
             self.active_url = None

     def handle_data(self, data):
         # Only process the data when we are in an active a tag and have an URL
         if not self.active_url:
             return

-        name = urllib.quote(data.strip('/'))
-        if self.active_url == name:
-            self.entries.append(self.active_url)
+        if self.active_url in (data, urllib.quote(data)):
+            self.entries.append(self.active_url.strip('/'))
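Note (reviewer illustration, not part of the patch): the new href handling in handle_starttag keeps only the last path segment of a link and preserves a trailing slash so sub folders remain distinguishable from files. A standalone sketch of that expression, using the example link from the code comment:

# Standalone sketch of the trimming logic used in handle_starttag.
def trim_href(href):
    # Remember whether the link pointed at a folder (trailing slash).
    has_final_slash = href[-1] == '/'
    # Keep only the last path segment.
    name = href.rstrip('/').split('/')[-1]
    return '%s/' % name if has_final_slash else name

print(trim_href('/pub/firefox/nightly/2015/'))     # prints: 2015/
print(trim_href('/pub/firefox/nightly/file.txt'))  # prints: file.txt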
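For completeness, a minimal usage sketch of the updated class (an assumed call pattern, not part of this change; the URL and filter expressions below are placeholders):

import requests

from mozdownload.parser import DirectoryParser

# One session can be shared across several listings; authentication and
# timeout are optional.
parser = DirectoryParser('https://archive.mozilla.org/pub/firefox/releases/',
                         session=requests.Session(),
                         timeout=30)

# Filter by regular expression: entries that look like version numbers.
releases = parser.filter(r'^\d+\.\d+(\.\d+)?$')

# Filter by callable: any predicate over the entry name works as well.
non_esr = parser.filter(lambda entry: not entry.endswith('esr'))

print(releases[:5])
print(non_esr[:5])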
