Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(11)

Unified Diff: components/url_formatter/top_domains/make_alexa_top_list.py

Issue 2889303003: Revert of Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/url_formatter/top_domains/make_alexa_top_list.py
diff --git a/components/url_formatter/top_domains/make_alexa_top_list.py b/components/url_formatter/top_domains/make_alexa_top_list.py
deleted file mode 100755
index 20820e84c3ed12e2b8a0e0840a239bda4c765749..0000000000000000000000000000000000000000
--- a/components/url_formatter/top_domains/make_alexa_top_list.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2017 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-"""Generates alexa_domains.list from
- src/tools/perf/page_sets/alexa1-10000-urls.json. By default, all the domains
- extracted from the input will be recorded in alexa_domains.list in the script
- directory except for duplicates and domains in ccTLDs known to disallow
- non-ASCII Latin letters (cn,jp,kr,tw).
- Optional command line arguments can be used to limit the output to top N
- domains and to specify an output file.
-"""
-
-import re
-import sys
-import os
-
-script_dir = os.path.dirname(os.path.realpath(__file__))
-alexa10k_path = os.path.join(script_dir, "..", "..", "..", "tools", "perf",
- "page_sets", "alexa1-10000-urls.json")
-max_num_domains = 10000 if len(sys.argv) < 2 else int(sys.argv[1])
-alexa_out = os.path.join(script_dir, "alexa_domains.list") \
- if len(sys.argv) < 3 else os.path.join(script_dir, sys.argv[2])
-
-domain_extractor = re.compile(r'^.*"https?://(?:www.)?([^/]*)/.*$')
-excluded_tld = re.compile(r'.(cn|kr|jp|tw)$')
-domains = set()
-n_domains = 0
-
-with open(alexa_out, 'w') as outfile, open(alexa10k_path, 'r') as infile:
- for line in infile:
- if line.startswith('#'):
- continue
- match = domain_extractor.match(line)
- if match and n_domains < max_num_domains:
- n_domains = n_domains + 1
- domain = match.group(1)
- labels = domain.split('.')
- if len(labels) > 3:
- domain = '.'.join(labels[-3:])
- if not excluded_tld.search(match.group(1)) and domain not in domains:
- domains.add(domain)
- outfile.write(domain + "\n")
-
- # Add some popular domains if they're missing.
- # TODO(jshin): Find a way to update the list. (crbug.com/722022)
- for domain in ["gmail.com", "hotmail.com", "360.cn", "ntd.tv", "onclkds.com",
- "uber.com", "lyft.com", "ok.ru"]:
- if domain not in domains:
- outfile.write(domain + "\n")
-
- # Add a few made-up domains for testing.
- outfile.write("# for testing\ndigklmo68.com\ndigklmo68.co.uk\n")
- outfile.write("islkpx123.com\n")

Powered by Google App Engine
This is Rietveld 408576698