Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(550)

Side by Side Diff: components/url_formatter/top_domains/make_alexa_top_list.py

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: pull IDNSpoofChecker to separate h/cc files Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2017 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Generates alexa_domains.list from
7 src/tools/perf/page_sets/alexa1-10000-urls.json. By default, all the domains
8 extracted from the input will be recorded in alexa_domains.list in the script
9 directory except for duplicates and domains in ccTLDs known to disallow
10 non-ASCII Latin letters (cn,jp,kr,tw).
11 Optional command line arguments can be used to limit the output to top N
12 domains and to specify an output file.
13 """
14
15 import re
16 import sys
17 import os
18
# Directory containing this script; the input and output paths below are
# resolved relative to it.
script_dir = os.path.dirname(os.path.realpath(__file__))
alexa10k_path = os.path.join(script_dir, "..", "..", "..", "tools", "perf",
                             "page_sets", "alexa1-10000-urls.json")

# Optional argv[1]: upper bound on the number of matching input lines that
# are processed (default 10000). A non-integer value raises ValueError.
max_num_domains = 10000 if len(sys.argv) < 2 else int(sys.argv[1])
# Optional argv[2]: output file name, joined onto the script directory
# (default "alexa_domains.list").
alexa_out = os.path.join(script_dir, "alexa_domains.list") \
    if len(sys.argv) < 3 else os.path.join(script_dir, sys.argv[2])

# Captures the host part of an http(s) URL that follows a double quote,
# without an optional leading "www." label. The dot after "www" is escaped:
# an unescaped '.' also matched hosts such as "www2.example.com" and captured
# ".example.com" instead of the full host.
domain_extractor = re.compile(r'^.*"https?://(?:www\.)?([^/]*)/.*$')
# Matches hosts that end in one of the excluded ccTLDs. The leading dot is
# escaped so that only a real label boundary (".cn", ".kr", ".jp", ".tw")
# matches, not an arbitrary character followed by those letters.
excluded_tld = re.compile(r'\.(cn|kr|jp|tw)$')
# Domains already written to the output; used to drop duplicates.
domains = set()
# Number of matching input lines consumed so far.
n_domains = 0
30
# Stream the Alexa URL list and write out each newly seen domain that is not
# under an excluded ccTLD. Both files stay open until every section of the
# output (extracted, popular, and test domains) has been written.
with open(alexa_out, 'w') as outfile, open(alexa10k_path, 'r') as infile:
    for entry in infile:
        # Lines starting with '#' are skipped without being counted.
        if entry.startswith('#'):
            continue
        found = domain_extractor.match(entry)
        # Every matching line counts toward the limit, even when its domain
        # is later dropped as a duplicate or as an excluded ccTLD.
        if not found or n_domains >= max_num_domains:
            continue
        n_domains += 1
        host = found.group(1)
        parts = host.split('.')
        # Hosts with more than three labels are reduced to their last three.
        domain = host if len(parts) <= 3 else '.'.join(parts[-3:])
        if excluded_tld.search(host) or domain in domains:
            continue
        domains.add(domain)
        outfile.write(domain + "\n")

    # Add some popular domains if they're missing.
    # Review thread attached to this step (text is truncated in the
    # captured review page):
    #   Peter Kasting 2017/05/10 22:38:47
    #     Nit: Maybe link to a bug on getting a process in p
    #   jungshik at Google 2017/05/14 09:36:23
    #     https://bugs.chromium.org/p/chromium/issues/detail
    # These entries bypass the ccTLD filter and are not recorded in `domains`.
    popular_domains = ["gmail.com", "hotmail.com", "360.cn", "ntd.tv",
                       "onclkds.com", "uber.com", "lyft.com"]
    for popular in popular_domains:
        if popular in domains:
            continue
        outfile.write(popular + "\n")

    # Add a few made-up domains for testing.
    outfile.write("# for testing\ndigklmo68.com\ndigklmo68.co.uk\n")
    outfile.write("islkpx123.com\n")
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698