Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(66)

Side by Side Diff: tools/perf/profile_creators/profile_safe_url_generator.py

Issue 918893002: telemetry: Create a safe url generator. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Comments from nednguyen. Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 # This script requires the python package BeautifulSoup (v4). This package is
6 # not included in the telemetry repository. You can install it with the
7 # command: "pip install beautifulsoup4"
8
9 import json
10 import logging
11 import urllib2
12 import urlparse
13
14 from bs4 import BeautifulSoup
dtu 2015/02/12 23:46:12 style guide nit: don't import classes directly
erikchen 2015/02/13 03:58:57 Done.
15
16 def _IsURLASCII(url):
17 """Whether |url| is ascii encoded."""
18 try:
19 url.decode("ascii")
20 except UnicodeEncodeError:
21 return False
22 return True
23
def GenerateSafeUrls():
  """Prints a JSON list of safe urls.

  Generates a safe list of urls from a seed list. Each href in the HTML
  fetched from a seed url is resolved against that seed url and placed into
  the safe list. The safe list contains unsanitized urls.

  The result is written to stdout as a sorted, de-duplicated JSON array.

  Raises:
    Any exception raised while fetching or parsing a seed url. The failure is
    logged first and then re-raised, so a single bad seed aborts the run.
  """
  # A list of websites whose hrefs are unlikely to link to sites that contain
  # malware.
  seed_urls = [
    "https://www.cnn.com",
    "https://www.youtube.com",
    "https://www.facebook.com",
    "https://www.twitter.com",
    "https://www.yahoo.com",
    "https://www.amazon.com",
    "https://www.wikipedia.com",
    "https://www.bing.com",
    "https://www.dailymotion.com",
    "https://www.stackoverflow.com",
    "https://www.google.com/#q=dumpling",
    "http://www.baidu.com/s?wd=rice",
    "http://www.baidu.com/s?wd=cow",
    "https://www.google.com/#q=fox",
    "http://www.yahoo.co.jp/",
    "http://www.yandex.ru/",
    "https://www.imdb.com/",
    "http://www.huffingtonpost.com/",
    "https://www.deviantart.com/",
    "http://www.wsj.com/",
  ]

  safe_urls = set()

  for url in seed_urls:
    try:
      # Fetch and parse the HTML.
      response = urllib2.urlopen(url)
      try:
        html = response.read()
      finally:
        # Always release the connection, even if the read fails.
        response.close()
      soup = BeautifulSoup(html)
    except Exception:
      # Log which seed failed, then propagate. Using Exception rather than a
      # bare except keeps KeyboardInterrupt/SystemExit from being reported as
      # a fetch error.
      logging.exception("Error fetching or parsing url: %s", url)
      raise

    # Looks for all hrefs.
    for link in soup.find_all('a'):
      possibly_relative_url = link.get("href")
      if not possibly_relative_url:
        continue

      # For simplicity, ignore urls that aren't ascii encoded.
      if not _IsURLASCII(possibly_relative_url):
        continue

      # Relative hrefs are resolved against the seed url they were found on.
      absolute_url = urlparse.urljoin(url, possibly_relative_url)
      safe_urls.add(absolute_url)

  # Sort the urls, to make them easier to view in bulk.
  safe_urls_list = sorted(safe_urls)

  # A single parenthesized argument prints identically with the Python 2
  # print statement and the Python 3 print function.
  print(json.dumps(safe_urls_list, indent=2, separators=(",", ":")))
86
# Script entry point: emit the safe url list only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
  GenerateSafeUrls()
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698