Chromium Code Reviews
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 # Use of this source code is governed by a BSD-style license that can be | |
| 3 # found in the LICENSE file. | |
| 4 | |
| 5 # This script requires the python package BeautifulSoup (v4). This package is | |
| 6 # not included in the telemetry repository. You can install it with the | |
| 7 # command: "pip install beautifulsoup4" | |
| 8 | |
| 9 import logging | |
| 10 import urllib2 | |
| 11 import urlparse | |
| 12 | |
| 13 from bs4 import BeautifulSoup | |
Review comment (nednguyen, 2015/02/12 01:43:46): Does the bot machine have bs4?
Reply (erikchen, 2015/02/12 01:46:57): No.
| 14 | |
| 15 def _IsURLASCII(url): | |
| 16 """Whether |url| is ascii encoded.""" | |
| 17 try: | |
| 18 url.decode("ascii") | |
| 19 except UnicodeEncodeError: | |
| 20 return False | |
| 21 return True | |
| 22 | |
def GenerateSafeUrls():
  """Prints a sorted list of "safe" urls harvested from a seed list.

  Each href found in the HTML fetched from a seed url is resolved to an
  absolute url and added to the output set. The output is unsanitized;
  "safe" only means the seed sites are unlikely to link to malware.

  Raises:
    Re-raises (after logging) any error encountered while fetching or
    parsing a seed url.
  """
  # A list of websites whose hrefs are unlikely to link to sites that
  # contain malware.
  seed_urls = [
    "https://www.cnn.com",
    "https://www.youtube.com",
    "https://www.facebook.com",
    "https://www.twitter.com",
    "https://www.yahoo.com",
    "https://www.amazon.com",
    "https://www.wikipedia.com",
    "https://www.bing.com",
    "https://www.dailymotion.com",
    "https://www.stackoverflow.com",
    "https://www.google.com/#q=dumpling",
    "http://www.baidu.com/s?wd=rice",
    "http://www.baidu.com/s?wd=cow",
    "https://www.google.com/#q=fox",
    "http://www.yahoo.co.jp/",
    "http://www.yandex.ru/",
    "https://www.imdb.com/",
    "http://www.huffingtonpost.com/",
    "https://www.deviantart.com/",
    "http://www.wsj.com/",
  ]

  safe_urls = set()

  for seed_url in seed_urls:
    try:
      # Fetch and parse the HTML. The response is closed explicitly —
      # the original leaked the connection.
      response = urllib2.urlopen(seed_url)
      try:
        html = response.read()
      finally:
        response.close()
      soup = BeautifulSoup(html)
    except Exception:
      # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
      # propagate normally; everything else is logged, then re-raised.
      logging.exception("Error fetching or parsing url: %s", seed_url)
      raise

    # Collect every href, resolving relative urls against the seed url.
    for link in soup.find_all('a'):
      possibly_relative_url = link.get("href")
      if not possibly_relative_url:
        continue

      # For simplicity, ignore urls that aren't ascii encoded.
      if not _IsURLASCII(possibly_relative_url):
        continue

      absolute_url = urlparse.urljoin(seed_url, possibly_relative_url)
      safe_urls.add(absolute_url)

  # Print the urls sorted (easier to view in bulk), in a format that is
  # conducive to being copied into a python file. print(...) with a
  # single argument emits identical output under Python 2 and 3.
  for url in sorted(safe_urls):
    print("\"%s\"," % url)
| 87 | |
# Script entry point: emit the harvested url list to stdout.
if __name__ == "__main__":
  GenerateSafeUrls()
| OLD | NEW |