# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import HTMLParser
import json
import logging
import urllib2
import urlparse


class _HRefParser(HTMLParser.HTMLParser):
  """Collects the href attribute value of every anchor tag seen."""

  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    self.hrefs = []

  def handle_starttag(self, tag, attrs):
    if tag == "a":
      for name, value in attrs:
        if name == "href":
          self.hrefs.append(value)


def GenerateSafeUrls():
  """Prints a list of safe urls.

  Generates a safe list of urls from a seed list. Each href in the HTML
  fetched from each seed url is added to the safe list. The safe list
  contains unsanitized urls.
  """
  # A list of websites whose hrefs are unlikely to link to sites that contain
  # malware.
  seed_urls = [
      "https://www.cnn.com",
      "https://www.youtube.com",
      "https://www.facebook.com",
      "https://www.twitter.com",
      "https://www.yahoo.com",
      "https://www.amazon.com",
      "https://www.wikipedia.com",
      "https://www.bing.com",
      "https://www.dailymotion.com",
      "https://www.stackoverflow.com",
      "https://www.google.com/#q=dumpling",
      "http://www.baidu.com/s?wd=rice",
      "http://www.baidu.com/s?wd=cow",
      "https://www.google.com/#q=fox",
      "http://www.yahoo.co.jp/",
      "http://www.yandex.ru/",
      "https://www.imdb.com/",
      "http://www.huffingtonpost.com/",
      "https://www.deviantart.com/",
      "http://www.wsj.com/",
  ]

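  # A set deduplicates hrefs that repeat within a page or appear on more than
  # one seed page.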
  safe_urls = set()

  for url in seed_urls:
    try:
      # Fetch and parse the HTML.
      response = urllib2.urlopen(url)
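      # If the Content-Type response header declares a charset, decode the
      # body with it; otherwise the raw bytes are fed to the parser unchanged.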
      encoding = response.headers.getparam('charset')
      html = response.read()
      if encoding:
        html = html.decode(encoding)

      parser = _HRefParser()
      parser.feed(html)
    except:
      logging.exception("Error fetching or parsing url: %s", url)
      raise

    # Resolve each href to an absolute url and add it to the safe list.
    for relative_url in parser.hrefs:
      if not relative_url:
        continue

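      # urljoin resolves relative paths, protocol-relative urls, and
      # fragment-only hrefs against the page that linked to them.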
      absolute_url = urlparse.urljoin(url, relative_url)
      safe_urls.add(absolute_url)

  # Sort the urls, to make them easier to view in bulk.
  safe_urls_list = sorted(safe_urls)

  print json.dumps(safe_urls_list, indent=2, separators=(",", ":"))

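# Typical invocation (script name is illustrative): run the module directly
# and redirect stdout, e.g. "python generate_safe_urls.py > safe_urls.json".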
if __name__ == "__main__":
  GenerateSafeUrls()