Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1265)

Unified Diff: tools/perf/profile_creators/profile_safe_url_generator.py

Issue 918893002: telemetry: Create a safe url generator. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Comments from nednguyen. Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: tools/perf/profile_creators/profile_safe_url_generator.py
diff --git a/tools/perf/profile_creators/profile_safe_url_generator.py b/tools/perf/profile_creators/profile_safe_url_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d08b52632a5802caf50f65c670d4fc54a96f17f
--- /dev/null
+++ b/tools/perf/profile_creators/profile_safe_url_generator.py
@@ -0,0 +1,88 @@
+# Copyright 2015 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This script requires the python package BeautifulSoup (v4). This package is
+# not included in the telemetry repository. You can install it with the
+# command: "pip install beautifulsoup4"
+
+import json
+import logging
+import urllib2
+import urlparse
+
+from bs4 import BeautifulSoup
dtu 2015/02/12 23:46:12 style guide nit: don't import classes directly
erikchen 2015/02/13 03:58:57 Done.
+
+def _IsURLASCII(url):
+ """Whether |url| is ascii encoded."""
+ try:
+ url.decode("ascii")
+ except UnicodeEncodeError:
+ return False
+ return True
+
def GenerateSafeUrls():
  """Prints a sorted JSON list of "safe" urls to stdout.

  Fetches the HTML for each url in a hard-coded seed list of well-known
  websites and collects the absolute url of every anchor href found. The
  seed sites are unlikely to link to malware, but the emitted urls are
  otherwise unsanitized.

  Raises:
    Re-raises (after logging) any exception encountered while fetching or
    parsing a seed url.
  """
  # A list of websites whose hrefs are unlikely to link to sites that contain
  # malware.
  seed_urls = [
    "https://www.cnn.com",
    "https://www.youtube.com",
    "https://www.facebook.com",
    "https://www.twitter.com",
    "https://www.yahoo.com",
    "https://www.amazon.com",
    "https://www.wikipedia.com",
    "https://www.bing.com",
    "https://www.dailymotion.com",
    "https://www.stackoverflow.com",
    "https://www.google.com/#q=dumpling",
    "http://www.baidu.com/s?wd=rice",
    "http://www.baidu.com/s?wd=cow",
    "https://www.google.com/#q=fox",
    "http://www.yahoo.co.jp/",
    "http://www.yandex.ru/",
    "https://www.imdb.com/",
    "http://www.huffingtonpost.com/",
    "https://www.deviantart.com/",
    "http://www.wsj.com/",
  ]

  safe_urls = set()

  for url in seed_urls:
    try:
      # Fetch and parse the HTML.
      response = urllib2.urlopen(url)
      html = response.read()
      soup = BeautifulSoup(html)
    except Exception:
      # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate
      # without being logged as fetch/parse errors; real errors are still
      # logged and re-raised.
      logging.exception("Error fetching or parsing url: %s", url)
      raise

    # Looks for all hrefs, resolving relative urls against the page they
    # came from.
    for link in soup.find_all("a"):
      possibly_relative_url = link.get("href")
      if not possibly_relative_url:
        continue

      # For simplicity, ignore urls that aren't ascii encoded.
      if not _IsURLASCII(possibly_relative_url):
        continue

      absolute_url = urlparse.urljoin(url, possibly_relative_url)
      safe_urls.add(absolute_url)

  # Sort the urls, to make them easier to view in bulk.
  safe_urls_list = sorted(safe_urls)

  # Parenthesized single argument: identical output under python 2's print
  # statement and valid syntax under python 3.
  print(json.dumps(safe_urls_list, indent=2, separators=(",", ":")))
+
# Allow this module to be run directly as a script; it prints the generated
# url list to stdout.
if __name__ == "__main__":
  GenerateSafeUrls()

Powered by Google App Engine
This is Rietveld 408576698