Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(41)

Unified Diff: tools/perf/profile_creators/profile_safe_url_generator.py

Issue 918893002: telemetry: Create a safe url generator. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Style nit. Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | tools/perf/profile_creators/profile_safe_url_list.json » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: tools/perf/profile_creators/profile_safe_url_generator.py
diff --git a/tools/perf/profile_creators/profile_safe_url_generator.py b/tools/perf/profile_creators/profile_safe_url_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cae1b9d499ce94591a4d3cabc9247425e37ed67
--- /dev/null
+++ b/tools/perf/profile_creators/profile_safe_url_generator.py
@@ -0,0 +1,88 @@
+# Copyright 2015 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import HTMLParser
+import json
+import logging
+import urllib2
+import urlparse
+
+
+class _HRefParser(HTMLParser.HTMLParser):
+  """HTML parser that records the href of every <a> start tag it is fed.
+
+  After feed() has been called, |hrefs| holds the href attribute values in the
+  order the parser encountered them. The values are stored exactly as
+  reported by the parser; they are neither resolved nor filtered here.
+  """
+
+  def __init__(self):
+    # Initialise the base parser state before adding our own.
+    HTMLParser.HTMLParser.__init__(self)
+    self.hrefs = []
+
+  def handle_starttag(self, tag, attrs):
+    """Collects the href attribute values of <a> tags; ignores other tags."""
+    if tag != "a":
+      return
+    self.hrefs.extend(
+        value for attr_name, value in attrs if attr_name == "href")
+
+
+def GenerateSafeUrls():
+  """Prints a list of safe urls.
+
+  Generates a safe list of urls from a seed list. The HTML of each seed url is
+  fetched, and every non-empty href found in an <a> tag is resolved against
+  the seed url and placed into the safe list. The safe list contains
+  unsanitized urls.
+
+  The resulting urls are de-duplicated, sorted and printed to stdout as a
+  JSON array. Nothing is returned.
+
+  Raises:
+    Exception: Whatever error was raised while fetching or parsing a seed
+        url. The error is logged and re-raised, so a single failing seed
+        aborts the whole run.
+  """
+  # A list of websites whose hrefs are unlikely to link to sites that contain
+  # malware.
+  # NOTE(review): the "#q=..." part of the google.com entries is a url
+  # fragment, which is not sent in an HTTP request, so these two seeds
+  # presumably fetch the same page. Confirm whether a search url was intended.
+  seed_urls = [
+    "https://www.cnn.com",
+    "https://www.youtube.com",
+    "https://www.facebook.com",
+    "https://www.twitter.com",
+    "https://www.yahoo.com",
+    "https://www.amazon.com",
+    "https://www.wikipedia.com",
+    "https://www.bing.com",
+    "https://www.dailymotion.com",
+    "https://www.stackoverflow.com",
+    "https://www.google.com/#q=dumpling",
+    "http://www.baidu.com/s?wd=rice",
+    "http://www.baidu.com/s?wd=cow",
+    "https://www.google.com/#q=fox",
+    "http://www.yahoo.co.jp/",
+    "http://www.yandex.ru/",
+    "https://www.imdb.com/",
+    "http://www.huffingtonpost.com/",
+    "https://www.deviantart.com/",
+    "http://www.wsj.com/",
+  ]
+
+  safe_urls = set()
+
+  for url in seed_urls:
+    try:
+      # Fetch the HTML. The response is closed explicitly once the body has
+      # been read instead of being left for garbage collection.
+      response = urllib2.urlopen(url)
+      try:
+        encoding = response.headers.getparam('charset')
+        html = response.read()
+      finally:
+        response.close()
+
+      # Decode only when the server declared a charset; otherwise the body is
+      # handed to the parser exactly as it was read.
+      if encoding:
+        html = html.decode(encoding)
+
+      parser = _HRefParser()
+      parser.feed(html)
+    except Exception:
+      logging.exception("Error fetching or parsing url: %s", url)
+      raise
+
+    # Resolve every non-empty href against the seed url it was found on.
+    for relative_url in parser.hrefs:
+      if relative_url:
+        safe_urls.add(urlparse.urljoin(url, relative_url))
+
+  # Sort the urls, to make them easier to view in bulk.
+  print(json.dumps(sorted(safe_urls), indent=2, separators=(",", ":")))
+
+# Only generate and print the url list when run directly as a script.
+if __name__ == "__main__":
+ GenerateSafeUrls()
« no previous file with comments | « no previous file | tools/perf/profile_creators/profile_safe_url_list.json » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698