Index: tools/perf/profile_creators/profile_safe_url_generator.py
diff --git a/tools/perf/profile_creators/profile_safe_url_generator.py b/tools/perf/profile_creators/profile_safe_url_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..f375a7ad014a7980adfd4efc7f7f91d2d1586ec2
--- /dev/null
+++ b/tools/perf/profile_creators/profile_safe_url_generator.py
@@ -0,0 +1,89 @@
+# Copyright 2015 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This script requires the python package BeautifulSoup (v4). This package is
+# not included in the telemetry repository. You can install it with the
+# command: "pip install beautifulsoup4"
+
+import logging
+import urllib2
+import urlparse
+
+from bs4 import BeautifulSoup

nednguyen  2015/02/12 01:43:46
Does bot machine has bs4?

erikchen  2015/02/12 01:46:57
No.
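
Per the exchange above, the bots do not have bs4 installed. If this script ever needs to degrade gracefully there, one option is to guard the import with an actionable message. A minimal sketch, editorial and not part of this CL:

  try:
    from bs4 import BeautifulSoup
  except ImportError:
    raise ImportError("This script requires BeautifulSoup 4; "
                      "install it with: pip install beautifulsoup4")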
+
+def _IsURLASCII(url):
+  """Whether |url| is ascii encoded."""
+  try:
+    url.decode("ascii")
+  except (UnicodeDecodeError, UnicodeEncodeError):
+    # A unicode url raises UnicodeEncodeError from the implicit ascii encode;
+    # a non-ascii byte string raises UnicodeDecodeError.
+    return False
+  return True
+
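
As a quick editorial illustration (not part of the CL): BeautifulSoup hands back hrefs as unicode strings, and in python 2 unicode.decode("ascii") performs an implicit ascii encode first, so a non-ascii href is rejected:

  _IsURLASCII(u"https://example.com/page")     # True
  _IsURLASCII(u"https://example.com/caf\xe9")  # False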
+def GenerateSafeUrls():
+  """Generates a list of safe urls from a seed list.
+
+  Each href in the HTML fetched from each seed url is added to the safe list.
+  The safe list contains unsanitized urls.
+  """
+  # A list of websites whose hrefs are unlikely to link to sites that contain
+  # malware.
+  seed_urls = [
+      "https://www.cnn.com",
+      "https://www.youtube.com",
+      "https://www.facebook.com",
+      "https://www.twitter.com",
+      "https://www.yahoo.com",
+      "https://www.amazon.com",
+      "https://www.wikipedia.com",
+      "https://www.bing.com",
+      "https://www.dailymotion.com",
+      "https://www.stackoverflow.com",
+      "https://www.google.com/#q=dumpling",
+      "http://www.baidu.com/s?wd=rice",
+      "http://www.baidu.com/s?wd=cow",
+      "https://www.google.com/#q=fox",
+      "http://www.yahoo.co.jp/",
+      "http://www.yandex.ru/",
+      "https://www.imdb.com/",
+      "http://www.huffingtonpost.com/",
+      "https://www.deviantart.com/",
+      "http://www.wsj.com/",
+  ]
+
+  safe_urls = set()
+
+  for url in seed_urls:
+    try:
+      # Fetch and parse the HTML.
+      response = urllib2.urlopen(url)
+      html = response.read()
+      soup = BeautifulSoup(html, "html.parser")
+    except Exception:
+      logging.exception("Error fetching or parsing url: %s", url)
+      raise
+
+    # Look for all hrefs.
+    for link in soup.find_all("a"):
+      possibly_relative_url = link.get("href")
+      if not possibly_relative_url:
+        continue
+
+      # For simplicity, ignore urls that aren't ascii encoded.
+      if not _IsURLASCII(possibly_relative_url):
+        continue
+
+      absolute_url = urlparse.urljoin(url, possibly_relative_url)
+      safe_urls.add(absolute_url)
+
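
For context (editorial, not part of the CL): urlparse.urljoin resolves a relative href against the page url and passes an absolute href through unchanged, which is why both kinds end up as absolute urls in the set:

  import urlparse
  urlparse.urljoin("https://www.cnn.com", "/world")
  # -> "https://www.cnn.com/world"
  urlparse.urljoin("https://www.cnn.com", "https://www.wsj.com/")
  # -> "https://www.wsj.com/"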
+  # Sort the urls to make them easier to view in bulk.
+  safe_urls_list = sorted(safe_urls)
+
+  # Print out the safe urls in a format that is conducive to being copied into
+  # a python file.
+  for url in safe_urls_list:
+    print "\"%s\"," % url
+
+if __name__ == "__main__":
+  GenerateSafeUrls()
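
The output is one quoted url per line, ready to paste into a python list literal. A typical invocation (assuming the script is run from the Chromium source root; the output filename is illustrative):

  python tools/perf/profile_creators/profile_safe_url_generator.py > safe_urls.txt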