| Index: tools/perf/profile_creators/profile_safe_url_generator.py
|
| diff --git a/tools/perf/profile_creators/profile_safe_url_generator.py b/tools/perf/profile_creators/profile_safe_url_generator.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..8cae1b9d499ce94591a4d3cabc9247425e37ed67
|
| --- /dev/null
|
| +++ b/tools/perf/profile_creators/profile_safe_url_generator.py
|
| @@ -0,0 +1,90 @@
|
| +# Copyright 2015 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style license that can be
|
| +# found in the LICENSE file.
|
| +
|
| +import HTMLParser
|
| +import json
|
| +import logging
|
| +import urllib2
|
| +import urlparse
|
| +
|
| +
|
| +class _HRefParser(HTMLParser.HTMLParser):
|
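| +  """Collects the href attribute of every <a> tag into self.hrefs."""
|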
| + def __init__(self):
|
| + HTMLParser.HTMLParser.__init__(self)
|
| + self.hrefs = []
|
| +
|
| + def handle_starttag(self, tag, attrs):
|
| + if tag == "a":
|
| + for name, value in attrs:
|
| + if name == "href":
|
| + self.hrefs.append(value)
|
| +
|
| +
|
| +def GenerateSafeUrls():
|
| + """Prints a list of safe urls.
|
| +
|
| +  Generates a safe list of urls from a seed list. Each href in the HTML
|
| +  fetched from each seed url is added to the safe list. The safe list
|
| +  contains unsanitized urls.
|
| + """
|
| + # A list of websites whose hrefs are unlikely to link to sites that contain
|
| + # malware.
|
| + seed_urls = [
|
| + "https://www.cnn.com",
|
| + "https://www.youtube.com",
|
| + "https://www.facebook.com",
|
| + "https://www.twitter.com",
|
| + "https://www.yahoo.com",
|
| + "https://www.amazon.com",
|
| + "https://www.wikipedia.com",
|
| + "https://www.bing.com",
|
| + "https://www.dailymotion.com",
|
| + "https://www.stackoverflow.com",
|
| + "https://www.google.com/#q=dumpling",
|
| + "http://www.baidu.com/s?wd=rice",
|
| + "http://www.baidu.com/s?wd=cow",
|
| + "https://www.google.com/#q=fox",
|
| + "http://www.yahoo.co.jp/",
|
| + "http://www.yandex.ru/",
|
| + "https://www.imdb.com/",
|
| + "http://www.huffingtonpost.com/",
|
| + "https://www.deviantart.com/",
|
| + "http://www.wsj.com/",
|
| + ]
|
| +
|
| + safe_urls = set()
|
| +
|
| + for url in seed_urls:
|
| + try:
|
| + # Fetch and parse the HTML.
|
| + response = urllib2.urlopen(url)
|
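| +      # Decode using the charset from the Content-Type header, if present.
|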
| + encoding = response.headers.getparam('charset')
|
| + html = response.read()
|
| + if encoding:
|
| + html = html.decode(encoding)
|
| +
|
| + parser = _HRefParser()
|
| + parser.feed(html)
|
| +    except Exception:
|
| + logging.exception("Error fetching or parsing url: %s", url)
|
| + raise
|
| +
|
| +    # Resolve each href to an absolute url and add it to the safe list.
|
| + for relative_url in parser.hrefs:
|
| + if not relative_url:
|
| + continue
|
| +
|
| + absolute_url = urlparse.urljoin(url, relative_url)
|
| + safe_urls.add(absolute_url)
|
| +
|
| +  # Sort the urls to make them easier to view in bulk.
|
| + safe_urls_list = list(safe_urls)
|
| + safe_urls_list.sort()
|
| +
|
| + print json.dumps(safe_urls_list, indent=2, separators=(",", ":"))
|
| +
|
| +if __name__ == "__main__":
|
| + GenerateSafeUrls()
|
|
|
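| For readers porting this script off Python 2, below is a minimal sketch of the
| same approach under Python 3. It is illustrative and not part of the patch:
| html.parser stands in for HTMLParser, urllib.request and urllib.parse stand in
| for urllib2 and urlparse, and get_content_charset() stands in for
| getparam('charset'). Passing seed_urls as a parameter is a variation on the
| patch, which hardcodes the list.
|
| import html.parser
| import json
| import urllib.parse
| import urllib.request
|
|
| class _HRefParser(html.parser.HTMLParser):
|   """Collects the href attribute of every <a> tag into self.hrefs."""
|
|   def __init__(self):
|     html.parser.HTMLParser.__init__(self)
|     self.hrefs = []
|
|   def handle_starttag(self, tag, attrs):
|     if tag == "a":
|       for name, value in attrs:
|         if name == "href":
|           self.hrefs.append(value)
|
|
| def GenerateSafeUrls(seed_urls):
|   """Prints the sorted absolute urls of all hrefs found in the seed pages."""
|   safe_urls = set()
|   for url in seed_urls:
|     response = urllib.request.urlopen(url)
|     # get_content_charset() reads the charset from the Content-Type header.
|     encoding = response.headers.get_content_charset() or "utf-8"
|     html_text = response.read().decode(encoding, errors="replace")
|
|     parser = _HRefParser()
|     parser.feed(html_text)
|
|     for relative_url in parser.hrefs:
|       if not relative_url:
|         continue
|       safe_urls.add(urllib.parse.urljoin(url, relative_url))
|
|   print(json.dumps(sorted(safe_urls), indent=2, separators=(",", ":")))
|
| As with the patched script, running it and redirecting stdout captures the
| generated list, e.g.
| python tools/perf/profile_creators/profile_safe_url_generator.py > urls.json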