# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# This script requires the python package BeautifulSoup (v4). This package is
# not included in the telemetry repository. You can install it with the
# command: "pip install beautifulsoup4"

import json
import logging
import urllib2
import urlparse

from bs4 import BeautifulSoup
dtu 2015/02/12 23:46:12
style guide nit: don't import classes directly

erikchen 2015/02/13 03:58:57
Done.
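For reference, a minimal sketch of the module-level import the nit asks for (the explicit parser argument is an assumption; any parser bs4 supports would work):

    import bs4

    soup = bs4.BeautifulSoup(html, "html.parser")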

def _IsURLASCII(url):
  """Whether |url| is ASCII encoded."""
  try:
    # In python 2, str inputs raise UnicodeDecodeError here, while unicode
    # inputs (which is what bs4 returns) raise UnicodeEncodeError via the
    # implicit ascii encode, so catch both.
    url.decode("ascii")
  except (UnicodeDecodeError, UnicodeEncodeError):
    return False
  return True

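# Illustration (hypothetical inputs; python 2 str/unicode semantics assumed):
#   _IsURLASCII(u"http://example.com/page") -> True
#   _IsURLASCII(u"http://example.com/caf\u00e9") -> False
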
def GenerateSafeUrls():
  """Prints a list of safe urls.

  Generates a safe list of urls from a seed list. Each href in the HTML
  fetched from each seed url is placed into the safe list. The safe list
  contains unsanitized urls.
  """
  # A list of websites whose hrefs are unlikely to link to sites that contain
  # malware.
  seed_urls = [
    "https://www.cnn.com",
    "https://www.youtube.com",
    "https://www.facebook.com",
    "https://www.twitter.com",
    "https://www.yahoo.com",
    "https://www.amazon.com",
    "https://www.wikipedia.com",
    "https://www.bing.com",
    "https://www.dailymotion.com",
    "https://www.stackoverflow.com",
    "https://www.google.com/#q=dumpling",
    "http://www.baidu.com/s?wd=rice",
    "http://www.baidu.com/s?wd=cow",
    "https://www.google.com/#q=fox",
    "http://www.yahoo.co.jp/",
    "http://www.yandex.ru/",
    "https://www.imdb.com/",
    "http://www.huffingtonpost.com/",
    "https://www.deviantart.com/",
    "http://www.wsj.com/",
  ]

  safe_urls = set()

  for url in seed_urls:
    try:
      # Fetch and parse the HTML. Name the stdlib parser explicitly so
      # behavior doesn't depend on which optional parsers are installed.
      response = urllib2.urlopen(url)
      html = response.read()
      soup = BeautifulSoup(html, "html.parser")
    except Exception:
      logging.exception("Error fetching or parsing url: %s", url)
      raise

    # Looks for all hrefs.
    for link in soup.find_all('a'):
      possibly_relative_url = link.get("href")
      if not possibly_relative_url:
        continue

      # For simplicity, ignore urls that aren't ascii encoded.
      if not _IsURLASCII(possibly_relative_url):
        continue

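      # urlparse.urljoin resolves relative hrefs against the page's url, e.g.
      # urljoin("https://www.cnn.com", "/world") -> "https://www.cnn.com/world"
      # (illustrative values).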
      absolute_url = urlparse.urljoin(url, possibly_relative_url)
      safe_urls.add(absolute_url)

  # Sort the urls, to make them easier to view in bulk.
  safe_urls_list = list(safe_urls)
  safe_urls_list.sort()

  print json.dumps(safe_urls_list, indent=2, separators=(",", ":"))


if __name__ == "__main__":
  GenerateSafeUrls()
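For reviewers who want to try it out, a minimal driver (the module name generate_safe_urls is an assumption; the script prints a JSON array of urls to stdout):

    # Hypothetical module name for this file.
    import generate_safe_urls
    generate_safe_urls.GenerateSafeUrls()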