Chromium Code Reviews| Index: components/url_formatter/url_formatter.cc |
| diff --git a/components/url_formatter/url_formatter.cc b/components/url_formatter/url_formatter.cc |
| index 936482843a390f8bc4b17f31f7742fbb25b2f7a4..2e1e829de3b9af1b2941a36f068d0d28952c10c0 100644 |
| --- a/components/url_formatter/url_formatter.cc |
| +++ b/components/url_formatter/url_formatter.cc |
| @@ -12,11 +12,13 @@ |
| #include "base/macros.h" |
| #include "base/numerics/safe_conversions.h" |
| #include "base/strings/string_piece.h" |
| +#include "base/strings/string_tokenizer.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/utf_offset_string_conversions.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/threading/thread_local_storage.h" |
| #include "components/url_formatter/idn_spoof_checker.h" |
| +#include "net/base/registry_controlled_domains/registry_controlled_domain.h" |
| #include "third_party/icu/source/common/unicode/uidna.h" |
| #include "third_party/icu/source/common/unicode/utypes.h" |
| #include "url/gurl.h" |
| @@ -50,14 +52,64 @@ class AppendComponentTransform { |
| class HostComponentTransform : public AppendComponentTransform { |
| public: |
| - HostComponentTransform() {} |
| + HostComponentTransform(bool trim_trivial_subdomains) |
| + : trim_trivial_subdomains_(trim_trivial_subdomains) {} |
| private: |
| base::string16 Execute( |
| const std::string& component_text, |
| base::OffsetAdjuster::Adjustments* adjustments) const override { |
| - return IDNToUnicodeWithAdjustments(component_text, adjustments); |
| + if (!trim_trivial_subdomains_) |
| + return IDNToUnicodeWithAdjustments(component_text, adjustments); |
| + |
| + // Exclude the registry and domain from trivial subdomain stripping. |
| + // To get the adjustment offset calculations correct, we need to transform |
| + // the registry and domain portion of the host as well. |
| + std::string domain_and_registry = |
| + net::registry_controlled_domains::GetDomainAndRegistry( |
| + component_text, |
| + net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES); |
| + |
| + base::OffsetAdjuster::Adjustments trivial_subdomains_adjustments; |
| + base::StringTokenizer t(component_text.begin(), |
|
Peter Kasting
2017/07/06 06:01:21
Nit: t -> tokenizer (avoid abbreviation in general
tommycli
2017/07/06 16:25:46
Done.
|
| + component_text.end() - domain_and_registry.length(), |
| + "."); |
| + t.set_options(base::StringTokenizer::RETURN_DELIMS); |
| + |
| + std::string new_subdomain_string; |
|
Peter Kasting
2017/07/06 06:01:21
Nit: |transformed_subdomain|?
tommycli
2017/07/06 16:25:46
Done.
|
| + size_t offset = 0; |
| + while (t.GetNext()) { |
| + // Append delimiters and non-trivial subdomains to the new subdomain part. |
| + if (t.token_is_delim() || (t.token() != "m" && t.token() != "www")) { |
| + new_subdomain_string.append(t.token()); |
|
Peter Kasting
2017/07/06 06:01:21
Nit: append() is fine, I find += a little more idi
tommycli
2017/07/06 16:25:46
Done. Ah i did not realize that string had operato
|
| + offset += t.token().length(); |
| + continue; |
| + } |
| + |
| + // When we find a trivial subdomain, we simply do not append to |
| + // |new_subdomain_string|. We also consume the next token, which must be |
| + // a delimiter. |
| + size_t trivial_subdomain_length = t.token().length(); |
| + bool next_delimiter_found = t.GetNext(); |
| + DCHECK(next_delimiter_found); |
| + DCHECK(t.token_is_delim()); |
| + |
| + // Add an adjustment accounting for the consumed subdomain and delimiter. |
| + trivial_subdomains_adjustments.push_back(base::OffsetAdjuster::Adjustment( |
| + offset, trivial_subdomain_length + 1, 0)); |
| + offset += t.token().length() + 1; |
|
Peter Kasting
2017/07/06 06:01:21
Wait, is this right? I think t.token() is pointin
tommycli
2017/07/06 16:25:46
Done. Good suggestion on re-using the tokenizer it
Peter Kasting
2017/07/06 16:31:47
Yeah, the new version feels a lot cleaner to me no
|
| + } |
| + |
| + std::string new_component_text = new_subdomain_string + domain_and_registry; |
| + base::string16 unicode_result = |
| + IDNToUnicodeWithAdjustments(new_component_text, adjustments); |
| + |
| + base::OffsetAdjuster::MergeSequentialAdjustments( |
| + trivial_subdomains_adjustments, adjustments); |
| + return unicode_result; |
| } |
| + |
| + bool trim_trivial_subdomains_; |
| }; |
| class NonHostComponentTransform : public AppendComponentTransform { |
| @@ -362,6 +414,7 @@ const FormatUrlType kFormatUrlOmitAll = |
| kFormatUrlOmitTrailingSlashOnBareHostname; |
| const FormatUrlType kFormatUrlExperimentalElideAfterHost = 1 << 3; |
| const FormatUrlType kFormatUrlExperimentalOmitHTTPS = 1 << 4; |
| +const FormatUrlType kFormatUrlExperimentalOmitTrivialSubdomains = 1 << 5; |
| base::string16 FormatUrl(const GURL& url, |
| FormatUrlTypes format_types, |
| @@ -481,7 +534,10 @@ base::string16 FormatUrlWithAdjustments( |
| *prefix_end = static_cast<size_t>(url_string.length()); |
| // Host. |
| - AppendFormattedComponent(spec, parsed.host, HostComponentTransform(), |
| + bool trim_trivial_subdomains = |
| + (format_types & kFormatUrlExperimentalOmitTrivialSubdomains) != 0; |
| + AppendFormattedComponent(spec, parsed.host, |
| + HostComponentTransform(trim_trivial_subdomains), |
| &url_string, &new_parsed->host, adjustments); |
| // Port. |
| @@ -594,7 +650,8 @@ bool CanStripTrailingSlash(const GURL& url) { |
| void AppendFormattedHost(const GURL& url, base::string16* output) { |
| AppendFormattedComponent( |
| url.possibly_invalid_spec(), url.parsed_for_possibly_invalid_spec().host, |
| - HostComponentTransform(), output, NULL, NULL); |
| + HostComponentTransform(false /* trim_trivial_subdomains */), output, |
|
Peter Kasting
2017/07/06 06:01:21
(I still dislike these kinda comments but whatever
tommycli
2017/07/06 16:25:46
Done.
|
| + nullptr, nullptr); |
| } |
| base::string16 IDNToUnicode(base::StringPiece host) { |