# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import HTMLParser
import json
import logging
import urllib2
import urlparse


class _HRefParser(HTMLParser.HTMLParser):
  """Collects the href attribute value of every anchor tag seen."""

  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    self.hrefs = []

  def handle_starttag(self, tag, attrs):
    if tag == "a":
      for name, value in attrs:
        if name == "href":
          self.hrefs.append(value)


def GenerateSafeUrls():
  """Prints a list of safe urls.

  Generates a safe list of urls from a seed list. Each href in the HTML
  fetched from each seed url is added to the safe list. The safe list
  contains unsanitized urls.
  """
  # A list of websites whose hrefs are unlikely to link to sites that contain
  # malware.
  seed_urls = [
      "https://www.cnn.com",
      "https://www.youtube.com",
      "https://www.facebook.com",
      "https://www.twitter.com",
      "https://www.yahoo.com",
      "https://www.amazon.com",
      "https://www.wikipedia.com",
      "https://www.bing.com",
      "https://www.dailymotion.com",
      "https://www.stackoverflow.com",
      "https://www.google.com/#q=dumpling",
      "http://www.baidu.com/s?wd=rice",
      "http://www.baidu.com/s?wd=cow",
      "https://www.google.com/#q=fox",
      "http://www.yahoo.co.jp/",
      "http://www.yandex.ru/",
      "https://www.imdb.com/",
      "http://www.huffingtonpost.com/",
      "https://www.deviantart.com/",
      "http://www.wsj.com/",
  ]

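  # A set deduplicates hrefs that repeat within a page or appear on more than
  # one seed page.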
  safe_urls = set()

  for url in seed_urls:
    try:
      # Fetch and parse the HTML.
      response = urllib2.urlopen(url)
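      # If the Content-Type response header declares a charset, decode the
      # body with it; otherwise the raw bytes are fed to the parser unchanged.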
      encoding = response.headers.getparam('charset')
      html = response.read()
      if encoding:
        html = html.decode(encoding)

      parser = _HRefParser()
      parser.feed(html)
    except:
      logging.exception("Error fetching or parsing url: %s", url)
      raise

    # Resolve each href to an absolute url and add it to the safe list.
    for relative_url in parser.hrefs:
      if not relative_url:
        continue

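      # urljoin resolves relative paths, protocol-relative urls, and
      # fragment-only hrefs against the page that linked to them.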
      absolute_url = urlparse.urljoin(url, relative_url)
      safe_urls.add(absolute_url)

  # Sort the urls, to make them easier to view in bulk.
  safe_urls_list = sorted(safe_urls)

  print json.dumps(safe_urls_list, indent=2, separators=(",", ":"))

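# Typical invocation (script name is illustrative): run the module directly
# and redirect stdout, e.g. "python generate_safe_urls.py > safe_urls.json".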
if __name__ == "__main__":
  GenerateSafeUrls()