| Index: tools/perf/profile_creators/profile_safe_url_generator.py
|
| diff --git a/tools/perf/profile_creators/profile_safe_url_generator.py b/tools/perf/profile_creators/profile_safe_url_generator.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..8cae1b9d499ce94591a4d3cabc9247425e37ed67
|
| --- /dev/null
|
| +++ b/tools/perf/profile_creators/profile_safe_url_generator.py
|
| @@ -0,0 +1,90 @@
|
| +# Copyright 2015 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style license that can be
|
| +# found in the LICENSE file.
|
| +
|
| +import HTMLParser
|
| +import json
|
| +import logging
|
| +import urllib2
|
| +import urlparse
|
| +
|
| +
|
| +class _HRefParser(HTMLParser.HTMLParser):
|
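| +  """Collects the href attribute of every <a> tag into self.hrefs."""
|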
| + def __init__(self):
|
| + HTMLParser.HTMLParser.__init__(self)
|
| + self.hrefs = []
|
| +
|
| + def handle_starttag(self, tag, attrs):
|
| + if tag == "a":
|
| + for name, value in attrs:
|
| + if name == "href":
|
| + self.hrefs.append(value)
|
| +
|
| +
|
| +def GenerateSafeUrls():
|
| + """Prints a list of safe urls.
|
| +
|
| +  Generates a safe list of urls from a seed list. Each href in the HTML
|
| +  fetched from each seed url is added to the safe list. The safe list
|
| +  contains unsanitized urls.
|
| + """
|
| + # A list of websites whose hrefs are unlikely to link to sites that contain
|
| + # malware.
|
| + seed_urls = [
|
| + "https://www.cnn.com",
|
| + "https://www.youtube.com",
|
| + "https://www.facebook.com",
|
| + "https://www.twitter.com",
|
| + "https://www.yahoo.com",
|
| + "https://www.amazon.com",
|
| + "https://www.wikipedia.com",
|
| + "https://www.bing.com",
|
| + "https://www.dailymotion.com",
|
| + "https://www.stackoverflow.com",
|
| + "https://www.google.com/#q=dumpling",
|
| + "http://www.baidu.com/s?wd=rice",
|
| + "http://www.baidu.com/s?wd=cow",
|
| + "https://www.google.com/#q=fox",
|
| + "http://www.yahoo.co.jp/",
|
| + "http://www.yandex.ru/",
|
| + "https://www.imdb.com/",
|
| + "http://www.huffingtonpost.com/",
|
| + "https://www.deviantart.com/",
|
| + "http://www.wsj.com/",
|
| + ]
|
| +
|
| + safe_urls = set()
|
| +
|
| + for url in seed_urls:
|
| + try:
|
| + # Fetch and parse the HTML.
|
| + response = urllib2.urlopen(url)
|
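| +      # Decode using the charset from the Content-Type header, if present.
|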
| + encoding = response.headers.getparam('charset')
|
| + html = response.read()
|
| + if encoding:
|
| + html = html.decode(encoding)
|
| +
|
| + parser = _HRefParser()
|
| + parser.feed(html)
|
| +    except Exception:
|
| + logging.exception("Error fetching or parsing url: %s", url)
|
| + raise
|
| +
|
| +    # Resolve each href to an absolute url and add it to the safe list.
|
| + for relative_url in parser.hrefs:
|
| + if not relative_url:
|
| + continue
|
| +
|
| + absolute_url = urlparse.urljoin(url, relative_url)
|
| + safe_urls.add(absolute_url)
|
| +
|
| +  # Sort the urls to make them easier to view in bulk.
|
| + safe_urls_list = list(safe_urls)
|
| + safe_urls_list.sort()
|
| +
|
| + print json.dumps(safe_urls_list, indent=2, separators=(",", ":"))
|
| +
|
| +if __name__ == "__main__":
|
| + GenerateSafeUrls()
|
|
|
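| For readers porting this script off Python 2, below is a minimal sketch of the
| same approach under Python 3. It is illustrative and not part of the patch:
| html.parser stands in for HTMLParser, urllib.request and urllib.parse stand in
| for urllib2 and urlparse, and get_content_charset() stands in for
| getparam('charset'). Passing seed_urls as a parameter is a variation on the
| patch, which hardcodes the list.
|
| import html.parser
| import json
| import urllib.parse
| import urllib.request
|
|
| class _HRefParser(html.parser.HTMLParser):
|   """Collects the href attribute of every <a> tag into self.hrefs."""
|
|   def __init__(self):
|     html.parser.HTMLParser.__init__(self)
|     self.hrefs = []
|
|   def handle_starttag(self, tag, attrs):
|     if tag == "a":
|       for name, value in attrs:
|         if name == "href":
|           self.hrefs.append(value)
|
|
| def GenerateSafeUrls(seed_urls):
|   """Prints the sorted absolute urls of all hrefs found in the seed pages."""
|   safe_urls = set()
|   for url in seed_urls:
|     response = urllib.request.urlopen(url)
|     # get_content_charset() reads the charset from the Content-Type header.
|     encoding = response.headers.get_content_charset() or "utf-8"
|     html_text = response.read().decode(encoding, errors="replace")
|
|     parser = _HRefParser()
|     parser.feed(html_text)
|
|     for relative_url in parser.hrefs:
|       if not relative_url:
|         continue
|       safe_urls.add(urllib.parse.urljoin(url, relative_url))
|
|   print(json.dumps(sorted(safe_urls), indent=2, separators=(",", ":")))
|
| As with the patched script, running it and redirecting stdout captures the
| generated list, e.g.
| python tools/perf/profile_creators/profile_safe_url_generator.py > urls.json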