OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/env python | |
2 # Copyright 2017 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
Peter Kasting
2017/05/09 01:37:02
Nit: No need for # #? (2 places)
jungshik at Google
2017/05/10 18:05:12
Done.
| |
4 # found in the LICENSE file. | |
5 | |
6 """Generates alexa_domains.list from | |
Peter Kasting
2017/05/09 01:37:02
Nit: Generates?
jungshik at Google
2017/05/10 18:05:12
Done.
| |
7 src/tools/perf/page_sets/alexa1-10000-urls.json. By default, all the domains | |
8 extracted from the input will be recorded in alexa_domains.list in the script | |
9 directory except for duplicates and domains in ccTLDs known to disallow | |
10 non-ASCII Latin letters (cn,jp,kr,tw). | |
11 Optional command line arguments can be used to limit the output to top N | |
12 domains and to specify an output file. | |
13 """ | |
14 | |
15 import re | |
16 import sys | |
17 import os | |
18 | |
# Directory containing this script; the Alexa URL list lives elsewhere in the
# Chromium tree, resolved relative to it.
script_dir = os.path.dirname(os.path.realpath(__file__))
alexa10k_path = os.path.join(
    script_dir, os.pardir, os.pardir, os.pardir, "tools", "perf",
    "page_sets", "alexa1-10000-urls.json")
Ryan Sleevi
2017/05/09 00:00:29
It seems like this should be an input parameter to
jungshik at Google
2017/05/09 19:57:39
I checked with Chrome counsel (which is why I was
jungshik at Google
2017/05/09 20:19:42
This is not run during build. It's run manually to
Peter Kasting
2017/05/09 20:50:14
Separately, are we going to be updating that list?
jungshik at Google
2017/05/10 18:05:12
That's a weak link because up-to-date Alexa list i
| |
# Optional argv[1]: cap on the number of domains taken from the input
# (defaults to all 10000).
if len(sys.argv) >= 2:
  max_num_domains = int(sys.argv[1])
else:
  max_num_domains = 10000
# Optional argv[2]: output file name, resolved relative to the script
# directory (defaults to alexa_domains.list).
if len(sys.argv) >= 3:
  alexa_out = os.path.join(script_dir, sys.argv[2])
else:
  alexa_out = os.path.join(script_dir, "alexa_domains.list")
25 | |
# Matches the URL in a JSON line such as '"url": "http://www.example.com/"',
# capturing the host with any leading "www." stripped. The dot in "www\."
# must be escaped: with a bare ".", "http://wwworld.com/" would wrongly
# yield "rld.com".
domain_extractor = re.compile(r'^.*"https?://(?:www\.)?([^/]*)/.*$')
# ccTLDs known to disallow non-ASCII Latin letters. The dot must be escaped
# so only a real ".cn"/".kr"/".jp"/".tw" label matches; an unescaped "."
# would also exclude domains that merely *end* in those letters
# (e.g. "telecomcn").
excluded_tld = re.compile(r'\.(cn|kr|jp|tw)$')
domains = set()       # unique domains already written to the output
n_domains = 0         # count of matched input lines consumed so far
30 | |
# Scan the Alexa URL list and emit up to max_num_domains unique,
# non-excluded domains, one per line.
with open(alexa_out, 'w') as outfile, open(alexa10k_path, 'r') as infile:
  for line in infile:
    # Comment lines in the input are ignored.
    if line.startswith('#'):
      continue
    match = domain_extractor.match(line)
    if not match or n_domains >= max_num_domains:
      continue
    n_domains += 1
    domain = match.group(1)
    # Keep at most the last three labels so deep subdomains collapse to
    # something registrable-ish (a.b.example.co.uk -> example.co.uk).
    labels = domain.split('.')
    if len(labels) > 3:
      domain = '.'.join(labels[-3:])
    if domain not in domains and not excluded_tld.search(match.group(1)):
      domains.add(domain)
      outfile.write(domain + "\n")
45 | |
46 # Add some popular domains if they're missing. | |
Ryan Sleevi
2017/05/09 00:00:29
I'm not sure why this part - it seems to be more s
jungshik at Google
2017/05/09 19:57:39
Yes, it's subjective. Because the list is old, som
| |
# NOTE(review): in the checked-in file these writes must sit inside the
# `with` block above (outfile still open) — indentation was lost in this
# diff listing; confirm against the original tree.
# Append hand-picked popular domains missing from the (dated) Alexa list.
extra_domains = ("gmail.com", "hotmail.com", "360.cn", "ntd.tv", "onclkds.com",
                 "uber.com", "lyft.com")
for domain in extra_domains:
  if domain not in domains:
    outfile.write(domain + "\n")

# A few made-up domains used for testing.
outfile.write("# for testing\ndigklmo68.com\ndigklmo68.co.uk\n")
outfile.write("islkpx123.com\n")
OLD | NEW |