Index: tools/perf/profile_creators/profile_safe_url_generator.py
diff --git a/tools/perf/profile_creators/profile_safe_url_generator.py b/tools/perf/profile_creators/profile_safe_url_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..f375a7ad014a7980adfd4efc7f7f91d2d1586ec2
--- /dev/null
+++ b/tools/perf/profile_creators/profile_safe_url_generator.py
@@ -0,0 +1,89 @@
+# Copyright 2015 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This script requires the python package BeautifulSoup (v4). This package is
+# not included in the telemetry repository. You can install it with the
+# command: "pip install beautifulsoup4"
+
+import logging
+import urllib2
+import urlparse
+
+from bs4 import BeautifulSoup

nednguyen  2015/02/12 01:43:46
Does bot machine has bs4?

erikchen  2015/02/12 01:46:57
No.
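
Per the exchange above, the bots do not have bs4 installed. If this script ever needs to degrade gracefully there, one option is to guard the import with an actionable message. A minimal sketch, editorial and not part of this CL:

  try:
    from bs4 import BeautifulSoup
  except ImportError:
    raise ImportError("This script requires BeautifulSoup 4; "
                      "install it with: pip install beautifulsoup4")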
+
+def _IsURLASCII(url):
+  """Whether |url| is ascii encoded."""
+  try:
+    url.decode("ascii")
+  except (UnicodeDecodeError, UnicodeEncodeError):
+    # A unicode url raises UnicodeEncodeError from the implicit ascii encode;
+    # a non-ascii byte string raises UnicodeDecodeError.
+    return False
+  return True
+
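
As a quick editorial illustration (not part of the CL): BeautifulSoup hands back hrefs as unicode strings, and in python 2 unicode.decode("ascii") performs an implicit ascii encode first, so a non-ascii href is rejected:

  _IsURLASCII(u"https://example.com/page")     # True
  _IsURLASCII(u"https://example.com/caf\xe9")  # False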
+def GenerateSafeUrls():
+  """Generates a list of safe urls from a seed list.
+
+  Each href in the HTML fetched from each seed url is added to the safe list.
+  The safe list contains unsanitized urls.
+  """
+  # A list of websites whose hrefs are unlikely to link to sites that contain
+  # malware.
+  seed_urls = [
+      "https://www.cnn.com",
+      "https://www.youtube.com",
+      "https://www.facebook.com",
+      "https://www.twitter.com",
+      "https://www.yahoo.com",
+      "https://www.amazon.com",
+      "https://www.wikipedia.com",
+      "https://www.bing.com",
+      "https://www.dailymotion.com",
+      "https://www.stackoverflow.com",
+      "https://www.google.com/#q=dumpling",
+      "http://www.baidu.com/s?wd=rice",
+      "http://www.baidu.com/s?wd=cow",
+      "https://www.google.com/#q=fox",
+      "http://www.yahoo.co.jp/",
+      "http://www.yandex.ru/",
+      "https://www.imdb.com/",
+      "http://www.huffingtonpost.com/",
+      "https://www.deviantart.com/",
+      "http://www.wsj.com/",
+  ]
+
+  safe_urls = set()
+
+  for url in seed_urls:
+    try:
+      # Fetch and parse the HTML.
+      response = urllib2.urlopen(url)
+      html = response.read()
+      soup = BeautifulSoup(html, "html.parser")
+    except Exception:
+      logging.exception("Error fetching or parsing url: %s", url)
+      raise
+
+    # Look for all hrefs.
+    for link in soup.find_all("a"):
+      possibly_relative_url = link.get("href")
+      if not possibly_relative_url:
+        continue
+
+      # For simplicity, ignore urls that aren't ascii encoded.
+      if not _IsURLASCII(possibly_relative_url):
+        continue
+
+      absolute_url = urlparse.urljoin(url, possibly_relative_url)
+      safe_urls.add(absolute_url)
+
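
For context (editorial, not part of the CL): urlparse.urljoin resolves a relative href against the page url and passes an absolute href through unchanged, which is why both kinds end up as absolute urls in the set:

  import urlparse
  urlparse.urljoin("https://www.cnn.com", "/world")
  # -> "https://www.cnn.com/world"
  urlparse.urljoin("https://www.cnn.com", "https://www.wsj.com/")
  # -> "https://www.wsj.com/"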
+  # Sort the urls to make them easier to view in bulk.
+  safe_urls_list = sorted(safe_urls)
+
+  # Print out the safe urls in a format that is conducive to being copied into
+  # a python file.
+  for url in safe_urls_list:
+    print "\"%s\"," % url
+
+if __name__ == "__main__":
+  GenerateSafeUrls()
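
The output is one quoted url per line, ready to paste into a python list literal. A typical invocation (assuming the script is run from the Chromium source root; the output filename is illustrative):

  python tools/perf/profile_creators/profile_safe_url_generator.py > safe_urls.txt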