OLD | NEW |
(Empty) | |
| 1 # This Source Code Form is subject to the terms of the Mozilla Public |
| 2 # License, v. 2.0. If a copy of the MPL was not distributed with this |
| 3 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 4 |
| 5 """Module to parse directory listings on a remote FTP server.""" |
| 6 |
| 7 from HTMLParser import HTMLParser |
| 8 import re |
| 9 import urllib |
| 10 |
| 11 |
class DirectoryParser(HTMLParser):
    """Collect the entries of an HTML directory listing.

    The page at ``url`` is downloaded and parsed on construction.  Every
    ``<a>`` element whose text, once URL-quoted and stripped of surrounding
    slashes, equals its ``href`` (also stripped of surrounding slashes) is
    recorded in ``self.entries``.  Links whose text differs from their
    target are skipped.
    """

    def __init__(self, url):
        """Download ``url`` and feed its content to the parser.

        :param url: Location of the directory listing to read.
        """
        HTMLParser.__init__(self)

        # Names found in the listing, in document order.
        self.entries = []
        # href of the <a> element currently being parsed, or None when the
        # parser is not inside a link.
        self.active_url = None

        req = urllib.urlopen(url)
        try:
            self.feed(req.read())
        finally:
            # Release the connection even when reading or parsing fails.
            req.close()

    def filter(self, regex):
        """Return the entries matching ``regex``.

        The expression is compiled case-insensitively and applied with
        ``match``, so it has to match from the start of an entry.

        :param regex: Regular expression source string.
        :returns: List of matching entries, in their original order.
        """
        pattern = re.compile(regex, re.IGNORECASE)
        return [entry for entry in self.entries if pattern.match(entry)]

    def handle_starttag(self, tag, attrs):
        """Remember the target of an opening ``<a>`` tag.

        :param tag: Name of the tag being opened.
        :param attrs: List of ``(name, value)`` attribute pairs.
        """
        if tag != 'a':
            return

        for name, value in attrs:
            if name == 'href':
                # A valueless attribute (``<a href>``) is reported with a
                # value of None; there is no target to remember then.
                if value is not None:
                    self.active_url = value.strip('/')
                return

    def handle_endtag(self, tag):
        """Forget the active link target when an ``<a>`` element closes.

        :param tag: Name of the tag being closed.
        """
        if tag == 'a':
            self.active_url = None

    def handle_data(self, data):
        """Record the active link when its text names its own target.

        :param data: Text content reported by the parser.
        """
        # Only process the data when we are in an active a tag and have an URL
        if not self.active_url:
            return

        name = urllib.quote(data.strip('/'))
        if self.active_url == name:
            self.entries.append(self.active_url)
OLD | NEW |