Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: tools/perf/profile_creators/profile_safe_url_generator.py

Issue 918893002: telemetry: Create a safe url generator. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 # This script requires the python package BeautifulSoup (v4). This package is
6 # not included in the telemetry repository. You can install it with the
7 # command: "pip install beautifulsoup4"
8
9 import logging
10 import urllib2
11 import urlparse
12
13 from bs4 import BeautifulSoup
nednguyen 2015/02/12 01:43:46 Does the bot machine have bs4?
erikchen 2015/02/12 01:46:57 No.
14
15 def _IsURLASCII(url):
16 """Whether |url| is ascii encoded."""
17 try:
18 url.decode("ascii")
19 except UnicodeEncodeError:
20 return False
21 return True
22
def GenerateSafeUrls():
  """Prints a sorted, de-duplicated list of urls harvested from a seed list.

  Each seed url is fetched, and every ASCII href found in an <a> tag of the
  returned HTML is resolved against the seed url and added to the safe list.
  The urls in the safe list are not sanitized. Each url is printed to stdout
  as a quoted, comma-terminated string so that the output can be copied into
  a python file.

  Raises:
    Exception: Any error raised while fetching or parsing a seed url. The
        error is logged and then re-raised unchanged.
  """
  # A list of websites whose hrefs are unlikely to link to sites that contain
  # malware.
  seed_urls = [
    "https://www.cnn.com",
    "https://www.youtube.com",
    "https://www.facebook.com",
    "https://www.twitter.com",
    "https://www.yahoo.com",
    "https://www.amazon.com",
    "https://www.wikipedia.com",
    "https://www.bing.com",
    "https://www.dailymotion.com",
    "https://www.stackoverflow.com",
    "https://www.google.com/#q=dumpling",
    "http://www.baidu.com/s?wd=rice",
    "http://www.baidu.com/s?wd=cow",
    "https://www.google.com/#q=fox",
    "http://www.yahoo.co.jp/",
    "http://www.yandex.ru/",
    "https://www.imdb.com/",
    "http://www.huffingtonpost.com/",
    "https://www.deviantart.com/",
    "http://www.wsj.com/",
  ]

  safe_urls = set()

  for url in seed_urls:
    try:
      # Fetch and parse the HTML.
      response = urllib2.urlopen(url)
      try:
        html = response.read()
      finally:
        # Release the connection even if reading the body fails.
        response.close()
      soup = BeautifulSoup(html)
    except Exception:
      # Record which seed url failed before propagating the error.
      logging.exception("Error fetching or parsing url: %s", url)
      raise

    # Looks for all hrefs.
    for link in soup.find_all('a'):
      possibly_relative_url = link.get("href")
      if not possibly_relative_url:
        continue

      # For simplicity, ignore urls that aren't ascii encoded.
      if not _IsURLASCII(possibly_relative_url):
        continue

      # Relative hrefs are resolved against the seed url they were found on.
      absolute_url = urlparse.urljoin(url, possibly_relative_url)
      safe_urls.add(absolute_url)

  # Print out the safe urls, sorted to make them easier to view in bulk, in a
  # format that is conducive to being copied into a python file.
  for url in sorted(safe_urls):
    print("\"%s\"," % url)
# Only generate and print the url list when this file is run as a script.
if __name__ == "__main__":
  GenerateSafeUrls()
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698