Chromium Code Reviews| Index: components/url_formatter/top_domains/make_alexa_top_list.py | 
| diff --git a/components/url_formatter/top_domains/make_alexa_top_list.py b/components/url_formatter/top_domains/make_alexa_top_list.py | 
| new file mode 100755 | 
| index 0000000000000000000000000000000000000000..18526f769b3935bb7c16d4179acb260d616878d3 | 
| --- /dev/null | 
| +++ b/components/url_formatter/top_domains/make_alexa_top_list.py | 
| @@ -0,0 +1,54 @@ | 
| +#!/usr/bin/env python | 
| +# Copyright 2017 The Chromium Authors. All rights reserved. | 
| +# Use of this source code is governed by a BSD-style license that can be | 
| 
 
Peter Kasting
2017/05/09 01:37:02
Nit: No need for # #? (2 places)
 
jungshik at Google
2017/05/10 18:05:12
Done.
 
 | 
| +# found in the LICENSE file. | 
| + | 
| +"""Generates alexa_domains.list from | 
| 
 
Peter Kasting
2017/05/09 01:37:02
Nit: Generates?
 
jungshik at Google
2017/05/10 18:05:12
Done.
 
 | 
| + src/tools/perf/page_sets/alexa1-10000-urls.json. By default, all the domains | 
| + extracted from the input will be recorded in alexa_domains.list in the script | 
| + directory except for duplicates and domains in ccTLDs known to disallow | 
| + non-ASCII Latin letters (cn,jp,kr,tw). | 
| + Optional command line arguments can be used to limit the output to top N | 
| + domains and to specify an output file. | 
| +""" | 
| + | 
| +import re | 
| +import sys | 
| +import os | 
| + | 
# Absolute directory containing this script; the input and output paths
# below are resolved relative to it.
script_dir = os.path.dirname(os.path.realpath(__file__))
# Path to the Alexa top-10000 URL list checked in under src/tools/perf.
alexa10k_path = os.path.join(script_dir, "..", "..", "..", "tools", "perf",
                             "page_sets", "alexa1-10000-urls.json")
| 
 
Ryan Sleevi
2017/05/09 00:00:29
It seems like this should be an input parameter to
 
jungshik at Google
2017/05/09 19:57:39
I checked with Chrome counsel (which is why I was
 
jungshik at Google
2017/05/09 20:19:42
This is not run during build. It's run manually to
 
Peter Kasting
2017/05/09 20:50:14
Separately, are we going to be updating that list?
 
jungshik at Google
2017/05/10 18:05:12
That's a weak link because up-to-date Alexa list i
 
 | 
# Optional argv[1]: number of top entries of the input to consider
# (default 10000, i.e. the whole list).
max_num_domains = 10000 if len(sys.argv) < 2 else int(sys.argv[1])
# Optional argv[2]: output file name. Note it is always resolved relative to
# the script directory (default: alexa_domains.list next to this script).
alexa_out = os.path.join(script_dir, "alexa_domains.list") \
    if len(sys.argv) < 3 else os.path.join(script_dir, sys.argv[2])
| + | 
# Extracts the host part of a double-quoted URL ("http(s)://host/..."),
# dropping a leading "www." label if present. The dot after "www" must be
# escaped: an unescaped '.' would also strip labels such as "www2" or "wwwx".
domain_extractor = re.compile(r'^.*"https?://(?:www\.)?([^/]*)/.*$')
# ccTLDs known to disallow non-ASCII Latin letters (see module docstring).
# The leading dot must be escaped: an unescaped '.' would also exclude any
# domain merely *ending* in "cn"/"kr"/"jp"/"tw" (e.g. "popcn").
excluded_tld = re.compile(r'\.(cn|kr|jp|tw)$')
domains = set()   # unique domains emitted so far
n_domains = 0     # matched input lines consumed so far (capped at max)
| + | 
# Stream the JSON-ish input line by line, pull out one domain per URL line,
# and write the filtered, de-duplicated list to alexa_out.
with open(alexa_out, 'w') as outfile, open(alexa10k_path, 'r') as infile:
  for line in infile:
    # Skip comment lines in the input.
    if line.startswith('#'):
      continue
    match = domain_extractor.match(line)
    if match and n_domains < max_num_domains:
      # NOTE(review): n_domains counts every matched input line, including
      # duplicates and excluded-TLD domains, so fewer than max_num_domains
      # entries may actually be written — confirm this is the intent of
      # "limit the output to top N domains" in the docstring.
      n_domains = n_domains + 1
      domain = match.group(1)
      labels = domain.split('.')
      # Keep at most the last three labels (e.g. a.b.co.uk -> b.co.uk).
      if len(labels) > 3:
        domain = '.'.join(labels[-3:])
      # The TLD filter is applied to the full extracted host
      # (match.group(1)); the truncated form is what gets recorded.
      if not excluded_tld.search(match.group(1)) and domain not in domains:
        domains.add(domain)
        outfile.write(domain + "\n")

  # Add some popular domains if they're missing.
  for domain in ["gmail.com", "hotmail.com", "360.cn", "ntd.tv", "onclkds.com",
                 "uber.com", "lyft.com"]:
    if domain not in domains:
      outfile.write(domain + "\n")

  # Add a few made-up domains for testing.
  outfile.write("# for testing\ndigklmo68.com\ndigklmo68.co.uk\n")
  outfile.write("islkpx123.com\n")