Chromium Code Reviews

Unified Diff: mozdownload/parser.py

Issue 1451373002: Updating mozdownload (excluding tests) (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/mozdownload@master
Patch Set: Created 5 years, 1 month ago
--- a/mozdownload/parser.py
+++ b/mozdownload/parser.py
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

 """Module to parse directory listings on a remote FTP server."""

 from HTMLParser import HTMLParser
 import re
+import requests
 import urllib


 class DirectoryParser(HTMLParser):
-    """Class to parse directory listings"""
+    """
+    Class to parse directory listings.

-    def __init__(self, url):
+    :param url: url of the directory on the web server.
+    :param session: a requests Session instance used to fetch the directory
+                    content. If None, a new session will be created.
+    :param authentication: a tuple (username, password) to authenticate against
+                           the web server, or None for no authentication. Note
+                           that it will only be used if the given *session* is
+                           None.
+    :param timeout: timeout in seconds used when fetching the directory
+                    content.
+    """
+
+    def __init__(self, url, session=None, authentication=None, timeout=None):
+        if not session:
+            session = requests.Session()
+            session.auth = authentication
+        self.session = session
+        self.timeout = timeout
+
+        self.active_url = None
+        self.entries = []
+
         HTMLParser.__init__(self)

-        self.entries = [ ]
-        self.active_url = None
+        # Force the server to not send cached content
+        headers = {'Cache-Control': 'max-age=0'}
+        r = self.session.get(url, headers=headers, timeout=self.timeout)

-        req = urllib.urlopen(url)
-        self.feed(req.read())
+        try:
+            r.raise_for_status()
+            self.feed(r.text)
+        finally:
+            r.close()

-    def filter(self, regex):
-        pattern = re.compile(regex, re.IGNORECASE)
-        return [entry for entry in self.entries if pattern.match(entry)]
+    def filter(self, filter):
+        """Filter entries by calling function or applying regex."""
+
+        if hasattr(filter, '__call__'):
+            return [entry for entry in self.entries if filter(entry)]
+        else:
+            pattern = re.compile(filter, re.IGNORECASE)
+            return [entry for entry in self.entries if pattern.match(entry)]

     def handle_starttag(self, tag, attrs):
         if not tag == 'a':
             return

         for attr in attrs:
             if attr[0] == 'href':
-                self.active_url = attr[1].strip('/')
+                # Links look like: /pub/firefox/nightly/2015/
+                # We have to trim the fragment down to the last item. Also to ensure
+                # we always get it, we remove a possible final slash first
+                has_final_slash = attr[1][-1] == '/'
+                self.active_url = attr[1].rstrip('/').split('/')[-1]
+
+                # Add back slash in case of sub folders
+                if has_final_slash:
+                    self.active_url = '%s/' % self.active_url
+
                 return

     def handle_endtag(self, tag):
         if tag == 'a':
             self.active_url = None

     def handle_data(self, data):
         # Only process the data when we are in an active a tag and have an URL
         if not self.active_url:
             return

-        name = urllib.quote(data.strip('/'))
-        if self.active_url == name:
-            self.entries.append(self.active_url)
+        if self.active_url in (data, urllib.quote(data)):
+            self.entries.append(self.active_url.strip('/'))
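Note (reviewer illustration, not part of the patch): the new href handling in handle_starttag keeps only the last path segment of a link and preserves a trailing slash so sub folders remain distinguishable from files. A standalone sketch of that expression, using the example link from the code comment:

# Standalone sketch of the trimming logic used in handle_starttag.
def trim_href(href):
    # Remember whether the link pointed at a folder (trailing slash).
    has_final_slash = href[-1] == '/'
    # Keep only the last path segment.
    name = href.rstrip('/').split('/')[-1]
    return '%s/' % name if has_final_slash else name

print(trim_href('/pub/firefox/nightly/2015/'))     # prints: 2015/
print(trim_href('/pub/firefox/nightly/file.txt'))  # prints: file.txt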
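For completeness, a minimal usage sketch of the updated class (an assumed call pattern, not part of this change; the URL and filter expressions below are placeholders):

import requests

from mozdownload.parser import DirectoryParser

# One session can be shared across several listings; authentication and
# timeout are optional.
parser = DirectoryParser('https://archive.mozilla.org/pub/firefox/releases/',
                         session=requests.Session(),
                         timeout=30)

# Filter by regular expression: entries that look like version numbers.
releases = parser.filter(r'^\d+\.\d+(\.\d+)?$')

# Filter by callable: any predicate over the entry name works as well.
non_esr = parser.filter(lambda entry: not entry.endswith('esr'))

print(releases[:5])
print(non_esr[:5])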
