OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 # Copyright 2017 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 """Generates alexa_domains.list from |
| 7 src/tools/perf/page_sets/alexa1-10000-urls.json. By default, all the domains |
| 8 extracted from the input will be recorded in alexa_domains.list in the script |
| 9 directory except for duplicates and domains in ccTLDs known to disallow |
| 10 non-ASCII Latin letters (cn,jp,kr,tw). |
| 11 Optional command line arguments can be used to limit the output to top N |
| 12 domains and to specify an output file. |
| 13 """ |
| 14 |
| 15 import re |
| 16 import sys |
| 17 import os |
| 18 |
script_dir = os.path.dirname(os.path.realpath(__file__))
alexa10k_path = os.path.join(script_dir, "..", "..", "..", "tools", "perf",
                             "page_sets", "alexa1-10000-urls.json")

# Captures the host part of a quoted http(s) URL, dropping a leading "www."
# label. The dot after "www" is escaped so that only a literal "www." prefix
# is stripped; unescaped, it would also eat the first character of hosts such
# as "www2.example.com" or "wwwhat.com".
domain_extractor = re.compile(r'^.*"https?://(?:www\.)?([^/]*)/.*$')

# Matches hosts whose last label is one of the excluded ccTLDs. The dot is
# escaped so that only a real label boundary counts.
excluded_tld = re.compile(r'\.(cn|kr|jp|tw)$')

# Popular domains appended to the output when the input does not yield them.
# TODO(jshin): Find a way to update the list. (crbug.com/722022)
popular_domains = ["gmail.com", "hotmail.com", "360.cn", "ntd.tv",
                   "onclkds.com", "uber.com", "lyft.com", "ok.ru"]


def extract_domains(lines, max_num_domains):
    """Returns the unique domains found in |lines|, in first-seen order.

    Args:
        lines: iterable of text lines (e.g. an open file). Lines starting
            with '#' are skipped, as are lines without a quoted http(s) URL.
        max_num_domains: maximum number of URL-bearing lines to consider.
            Every line with a URL counts towards the limit, including ones
            that end up dropped as duplicates or as excluded ccTLD hosts.

    Returns:
        A list of domains. Hosts with more than three labels are truncated
        to their last three labels; hosts under cn/kr/jp/tw are omitted.
    """
    seen = set()
    ordered = []
    n_domains = 0
    for line in lines:
        if n_domains >= max_num_domains:
            # Nothing past the limit can contribute, so stop scanning.
            break
        if line.startswith('#'):
            continue
        match = domain_extractor.match(line)
        if not match:
            continue
        n_domains += 1
        host = match.group(1)
        # The ccTLD check is made against the full host, before truncation.
        if excluded_tld.search(host):
            continue
        # Keep at most the last three labels; shorter hosts are unchanged.
        domain = '.'.join(host.split('.')[-3:])
        if domain in seen:
            continue
        seen.add(domain)
        ordered.append(domain)
    return ordered


def format_domain_list(domains):
    """Returns the full text of the output file for the given |domains|.

    The text has one domain per line, followed by any entry of
    popular_domains not already present, followed by the made-up test
    domains.
    """
    seen = set(domains)
    chunks = [domain + "\n" for domain in domains]

    # Add some popular domains if they're missing.
    chunks.extend(domain + "\n" for domain in popular_domains
                  if domain not in seen)

    # Add a few made-up domains for testing.
    chunks.append("# for testing\ndigklmo68.com\ndigklmo68.co.uk\n")
    chunks.append("islkpx123.com\n")
    return "".join(chunks)


def main(argv):
    """Generates the domain list.

    Args:
        argv: command line; argv[1] (optional) limits the number of input
            URLs considered (default 10000), argv[2] (optional) is the output
            file name, resolved against the script directory.
    """
    max_num_domains = 10000 if len(argv) < 2 else int(argv[1])
    out_name = "alexa_domains.list" if len(argv) < 3 else argv[2]
    alexa_out = os.path.join(script_dir, out_name)

    # Read the input before opening the output for writing, so that a
    # missing or unreadable input does not truncate an existing list.
    with open(alexa10k_path, 'r') as infile:
        domains = extract_domains(infile, max_num_domains)

    with open(alexa_out, 'w') as outfile:
        outfile.write(format_domain_list(domains))


if __name__ == "__main__":
    main(sys.argv)
OLD | NEW |