components/url_formatter/url_formatter.cc - Issue 2966233002: Omnibox UI Experiments: Strip trivial subdomains

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2966233002: Omnibox UI Experiments: Strip trivial subdomains (Closed)

Patch Set: make cast explicit for windows Created 3 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« components/omnibox/browser/autocomplete_match_unittest.cc ('K') | « components/url_formatter/url_formatter.h ('k') | components/url_formatter/url_formatter_unittest.cc » ('j') | components/url_formatter/url_formatter_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/url_formatter.h"	5 #include "components/url_formatter/url_formatter.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <utility>	8 #include <utility>

9 #include <vector>	9 #include <vector>

10	10

11 #include "base/lazy_instance.h"	11 #include "base/lazy_instance.h"

12 #include "base/macros.h"	12 #include "base/macros.h"

13 #include "base/numerics/safe_conversions.h"	13 #include "base/numerics/safe_conversions.h"

14 #include "base/strings/string_piece.h"	14 #include "base/strings/string_piece.h"

	15 #include "base/strings/string_tokenizer.h"

15 #include "base/strings/string_util.h"	16 #include "base/strings/string_util.h"

16 #include "base/strings/utf_offset_string_conversions.h"	17 #include "base/strings/utf_offset_string_conversions.h"

17 #include "base/strings/utf_string_conversions.h"	18 #include "base/strings/utf_string_conversions.h"

18 #include "base/threading/thread_local_storage.h"	19 #include "base/threading/thread_local_storage.h"

19 #include "components/url_formatter/idn_spoof_checker.h"	20 #include "components/url_formatter/idn_spoof_checker.h"

	21 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"

20 #include "third_party/icu/source/common/unicode/uidna.h"	22 #include "third_party/icu/source/common/unicode/uidna.h"

21 #include "third_party/icu/source/common/unicode/utypes.h"	23 #include "third_party/icu/source/common/unicode/utypes.h"

22 #include "url/gurl.h"	24 #include "url/gurl.h"

23 #include "url/third_party/mozilla/url_parse.h"	25 #include "url/third_party/mozilla/url_parse.h"

24	26

25 namespace url_formatter {	27 namespace url_formatter {

26	28

27 namespace {	29 namespace {

28	30

29 base::string16 IDNToUnicodeWithAdjustments(	31 base::string16 IDNToUnicodeWithAdjustments(

30 base::StringPiece host,	32 base::StringPiece host,

31 base::OffsetAdjuster::Adjustments* adjustments);	33 base::OffsetAdjuster::Adjustments* adjustments);

32 bool IDNToUnicodeOneComponent(const base::char16* comp,	34 bool IDNToUnicodeOneComponent(const base::char16* comp,

33 size_t comp_len,	35 size_t comp_len,

34 bool is_tld_ascii,	36 bool is_tld_ascii,

35 base::string16* out);	37 base::string16* out);

36	38

37 class AppendComponentTransform {	39 class AppendComponentTransform {

38 public:	40 public:

39 AppendComponentTransform() {}	41 AppendComponentTransform() {}

40 virtual ~AppendComponentTransform() {}	42 virtual ~AppendComponentTransform() {}

41	43

42 virtual base::string16 Execute(	44 virtual base::string16 Execute(

43 const std::string& component_text,	45 const std::string& component_text,

44 base::OffsetAdjuster::Adjustments* adjustments) const = 0;	46 base::OffsetAdjuster::Adjustments* adjustments) const = 0;

45	47

46 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an	48 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an

47 // accessible copy constructor in order to call AppendFormattedComponent()	49 // accessible copy constructor in order to call AppendFormattedComponent()

48 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).	50 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).
	Peter Kasting 2017/07/06 06:01:21 FWIW, I think we could probably replace this comme FWIW, I think we could probably replace this comment with DISALLOW_COPY_AND_ASSIGN since I think we no longer compile with gcc anywhere? But probably best done in a separate patch :) tommycli 2017/07/06 16:25:46 Acknowledged. Show quoted text On 2017/07/06 06:01:21, Peter Kasting wrote: > FWIW, I think we could probably replace this comment with > DISALLOW_COPY_AND_ASSIGN since I think we no longer compile with gcc anywhere? > But probably best done in a separate patch :) Acknowledged.
49 };	51 };

50	52

51 class HostComponentTransform : public AppendComponentTransform {	53 class HostComponentTransform : public AppendComponentTransform {

52 public:	54 public:

53 HostComponentTransform() {}	55 HostComponentTransform(bool trim_trivial_subdomains)

	56 : trim_trivial_subdomains_(trim_trivial_subdomains) {}

54	57

55 private:	58 private:

56 base::string16 Execute(	59 base::string16 Execute(

57 const std::string& component_text,	60 const std::string& component_text,

58 base::OffsetAdjuster::Adjustments* adjustments) const override {	61 base::OffsetAdjuster::Adjustments* adjustments) const override {

59 return IDNToUnicodeWithAdjustments(component_text, adjustments);	62 if (!trim_trivial_subdomains_)

	63 return IDNToUnicodeWithAdjustments(component_text, adjustments);

	64

	65 // Exclude the registry and domain from trivial subdomain stripping.

	66 // To get the adjustment offset calculations correct, we need to transform

	67 // the registry and domain portion of the host as well.

	68 std::string domain_and_registry =

	69 net::registry_controlled_domains::GetDomainAndRegistry(

	70 component_text,

	71 net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);

	72

	73 base::OffsetAdjuster::Adjustments trivial_subdomains_adjustments;

	74 base::StringTokenizer t(component_text.begin(),
	Peter Kasting 2017/07/06 06:01:21 Nit: t -> tokenizer (avoid abbreviation in general Nit: t -> tokenizer (avoid abbreviation in general; see http://google.github.io/styleguide/cppguide.html#General_Naming_Rules ) tommycli 2017/07/06 16:25:46 Done. Show quoted text On 2017/07/06 06:01:21, Peter Kasting wrote: > Nit: t -> tokenizer (avoid abbreviation in general; see > http://google.github.io/styleguide/cppguide.html#General_Naming_Rules ) Done.
	75 component_text.end() - domain_and_registry.length(),

	76 ".");

	77 t.set_options(base::StringTokenizer::RETURN_DELIMS);

	78

	79 std::string new_subdomain_string;
	Peter Kasting 2017/07/06 06:01:21 Nit: \|transformed_subdomain\|? Nit: \|transformed_subdomain\|? tommycli 2017/07/06 16:25:46 Done. Show quoted text On 2017/07/06 06:01:21, Peter Kasting wrote: > Nit: \|transformed_subdomain\|? Done.
	80 size_t offset = 0;

	81 while (t.GetNext()) {

	82 // Append delimiters and non-trivial subdomains to the new subdomain part.

	83 if (t.token_is_delim() \|\| (t.token() != "m" && t.token() != "www")) {

	84 new_subdomain_string.append(t.token());
	Peter Kasting 2017/07/06 06:01:21 Nit: append() is fine, I find += a little more idi Nit: append() is fine, I find += a little more idiomatic myself tommycli 2017/07/06 16:25:46 Done. Ah, I did not realize that string had operato Show quoted text On 2017/07/06 06:01:21, Peter Kasting wrote: > Nit: append() is fine, I find += a little more idiomatic myself Done. Ah, I did not realize that string had operator+=. I thought it was the equivalent of s = s + t;
	85 offset += t.token().length();

	86 continue;

	87 }

	88

	89 // When we find a trivial subdomain, we simply do not append to

	90 // \|new_subdomain_string\|. We also consume the next token, which must be

	91 // a delimiter.

	92 size_t trivial_subdomain_length = t.token().length();

	93 bool next_delimiter_found = t.GetNext();

	94 DCHECK(next_delimiter_found);

	95 DCHECK(t.token_is_delim());

	96

	97 // Add an adjustment accounting for the consumed subdomain and delimiter.

	98 trivial_subdomains_adjustments.push_back(base::OffsetAdjuster::Adjustment(

	99 offset, trivial_subdomain_length + 1, 0));

	100 offset += t.token().length() + 1;
	Peter Kasting 2017/07/06 06:01:21 Wait, is this right? I think t.token() is pointin Wait, is this right? I think t.token() is pointing at the delimiter here, so this always adds 2. I think you wanted trivial_subdomain_length + 1 here, didn't you? But in that case I don't know how your adjustment unittest passes. It feels like this is symptomatic of a design problem: \|offset\| itself seems like it's just trying to track where in the original string you are. But I think you can read that off the tokenizer (via e.g. t.token_begin() - component_text.begin(), or maybe using token_end(), depending on when you're reading it and what you want). So rather than iterating an offset and also iterating through the string, you can just do one. Or you could eliminate the tokenizer, keep the read offset, and use a simple loop that calls find() starting from that offset. Either way would probably work. tommycli 2017/07/06 16:25:46 Done. Good suggestion on re-using the tokenizer it Show quoted text On 2017/07/06 06:01:21, Peter Kasting wrote: > Wait, is this right? I think t.token() is pointing at the delimiter here, so > this always adds 2. I think you wanted trivial_subdomain_length + 1 here, > didn't you? But in that case I don't know how your adjustment unittest passes. > > It feels like this is symptomatic of a design problem: \|offset\| itself seems > like it's just trying to track where in the original string you are. But I > think you can read that off the tokenizer (via e.g. t.token_begin() - > component_text.begin(), or maybe using token_end(), depending on when you're > reading it and what you want). So rather than iterating an offset and also > iterating through the string, you can just do one. > > Or you could eliminate the tokenizer, keep the read offset, and use a simple > loop that calls find() starting from that offset. Either way would probably > work. Done. Good suggestion on re-using the tokenizer iterator rather than keeping track of "offset" separately. 
Much cleaner now. The unit tests worked because the adjustment verification unit tests for trivial subdomains only trimmed a single subdomain (which was correct). Only subsequent trimmings would be incorrect. I beefed up the test. It now fails on the old code and passes on the new. Peter Kasting 2017/07/06 16:31:47 Yeah, the new version feels a lot cleaner to me no Show quoted text On 2017/07/06 16:25:46, tommycli wrote: > On 2017/07/06 06:01:21, Peter Kasting wrote: > > Wait, is this right? I think t.token() is pointing at the delimiter here, so > > this always adds 2. I think you wanted trivial_subdomain_length + 1 here, > > didn't you? But in that case I don't know how your adjustment unittest > passes. > > > > It feels like this is symptomatic of a design problem: \|offset\| itself seems > > like it's just trying to track where in the original string you are. But I > > think you can read that off the tokenizer (via e.g. t.token_begin() - > > component_text.begin(), or maybe using token_end(), depending on when you're > > reading it and what you want). So rather than iterating an offset and also > > iterating through the string, you can just do one. > > > > Or you could eliminate the tokenizer, keep the read offset, and use a simple > > loop that calls find() starting from that offset. Either way would probably > > work. > > Done. Good suggestion on re-using the tokenizer iterator rather than keeping > track of "offset" separately. Much cleaner now. > > The unit tests worked because the adjustment verification unit tests for trivial > subdomains only trimmed a single subdomain (which was correct). Only subsequent > trimmings would be incorrect. > > I beefed up the test. It now fails on the old code and passes on the new. Yeah, the new version feels a lot cleaner to me now. High five!
	101 }

	102

	103 std::string new_component_text = new_subdomain_string + domain_and_registry;

	104 base::string16 unicode_result =

	105 IDNToUnicodeWithAdjustments(new_component_text, adjustments);

	106

	107 base::OffsetAdjuster::MergeSequentialAdjustments(

	108 trivial_subdomains_adjustments, adjustments);

	109 return unicode_result;

60 }	110 }

	111

	112 bool trim_trivial_subdomains_;

61 };	113 };

62	114

63 class NonHostComponentTransform : public AppendComponentTransform {	115 class NonHostComponentTransform : public AppendComponentTransform {

64 public:	116 public:

65 explicit NonHostComponentTransform(net::UnescapeRule::Type unescape_rules)	117 explicit NonHostComponentTransform(net::UnescapeRule::Type unescape_rules)

66 : unescape_rules_(unescape_rules) {}	118 : unescape_rules_(unescape_rules) {}

67	119

68 private:	120 private:

69 base::string16 Execute(	121 base::string16 Execute(

70 const std::string& component_text,	122 const std::string& component_text,

(...skipping 284 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
355	407

356 const FormatUrlType kFormatUrlOmitNothing = 0;	408 const FormatUrlType kFormatUrlOmitNothing = 0;

357 const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0;	409 const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0;

358 const FormatUrlType kFormatUrlOmitHTTP = 1 << 1;	410 const FormatUrlType kFormatUrlOmitHTTP = 1 << 1;

359 const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2;	411 const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2;

360 const FormatUrlType kFormatUrlOmitAll =	412 const FormatUrlType kFormatUrlOmitAll =

361 kFormatUrlOmitUsernamePassword \| kFormatUrlOmitHTTP \|	413 kFormatUrlOmitUsernamePassword \| kFormatUrlOmitHTTP \|

362 kFormatUrlOmitTrailingSlashOnBareHostname;	414 kFormatUrlOmitTrailingSlashOnBareHostname;

363 const FormatUrlType kFormatUrlExperimentalElideAfterHost = 1 << 3;	415 const FormatUrlType kFormatUrlExperimentalElideAfterHost = 1 << 3;

364 const FormatUrlType kFormatUrlExperimentalOmitHTTPS = 1 << 4;	416 const FormatUrlType kFormatUrlExperimentalOmitHTTPS = 1 << 4;

	417 const FormatUrlType kFormatUrlExperimentalOmitTrivialSubdomains = 1 << 5;

365	418

366 base::string16 FormatUrl(const GURL& url,	419 base::string16 FormatUrl(const GURL& url,

367 FormatUrlTypes format_types,	420 FormatUrlTypes format_types,

368 net::UnescapeRule::Type unescape_rules,	421 net::UnescapeRule::Type unescape_rules,

369 url::Parsed* new_parsed,	422 url::Parsed* new_parsed,

370 size_t* prefix_end,	423 size_t* prefix_end,

371 size_t* offset_for_adjustment) {	424 size_t* offset_for_adjustment) {

372 std::vector<size_t> offsets;	425 std::vector<size_t> offsets;

373 if (offset_for_adjustment)	426 if (offset_for_adjustment)

374 offsets.push_back(*offset_for_adjustment);	427 offsets.push_back(*offset_for_adjustment);

(...skipping 99 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
474 AppendFormattedComponent(spec, parsed.password,	527 AppendFormattedComponent(spec, parsed.password,

475 NonHostComponentTransform(unescape_rules),	528 NonHostComponentTransform(unescape_rules),

476 &url_string, &new_parsed->password, adjustments);	529 &url_string, &new_parsed->password, adjustments);

477 if (parsed.username.is_valid() \|\| parsed.password.is_valid())	530 if (parsed.username.is_valid() \|\| parsed.password.is_valid())

478 url_string.push_back('@');	531 url_string.push_back('@');

479 }	532 }

480 if (prefix_end)	533 if (prefix_end)

481 *prefix_end = static_cast<size_t>(url_string.length());	534 *prefix_end = static_cast<size_t>(url_string.length());

482	535

483 // Host.	536 // Host.

484 AppendFormattedComponent(spec, parsed.host, HostComponentTransform(),	537 bool trim_trivial_subdomains =

	538 (format_types & kFormatUrlExperimentalOmitTrivialSubdomains) != 0;

	539 AppendFormattedComponent(spec, parsed.host,

	540 HostComponentTransform(trim_trivial_subdomains),

485 &url_string, &new_parsed->host, adjustments);	541 &url_string, &new_parsed->host, adjustments);

486	542

487 // Port.	543 // Port.

488 if (parsed.port.is_nonempty()) {	544 if (parsed.port.is_nonempty()) {

489 url_string.push_back(':');	545 url_string.push_back(':');

490 new_parsed->port.begin = url_string.length();	546 new_parsed->port.begin = url_string.length();

491 url_string.insert(url_string.end(), spec.begin() + parsed.port.begin,	547 url_string.insert(url_string.end(), spec.begin() + parsed.port.begin,

492 spec.begin() + parsed.port.end());	548 spec.begin() + parsed.port.end());

493 new_parsed->port.len = url_string.length() - new_parsed->port.begin;	549 new_parsed->port.len = url_string.length() - new_parsed->port.begin;

494 } else {	550 } else {

(...skipping 92 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
587 bool CanStripTrailingSlash(const GURL& url) {	643 bool CanStripTrailingSlash(const GURL& url) {

588 // Omit the path only for standard, non-file URLs with nothing but "/" after	644 // Omit the path only for standard, non-file URLs with nothing but "/" after

589 // the hostname.	645 // the hostname.

590 return url.IsStandard() && !url.SchemeIsFile() && !url.SchemeIsFileSystem() &&	646 return url.IsStandard() && !url.SchemeIsFile() && !url.SchemeIsFileSystem() &&

591 !url.has_query() && !url.has_ref() && url.path_piece() == "/";	647 !url.has_query() && !url.has_ref() && url.path_piece() == "/";

592 }	648 }

593	649

594 void AppendFormattedHost(const GURL& url, base::string16* output) {	650 void AppendFormattedHost(const GURL& url, base::string16* output) {

595 AppendFormattedComponent(	651 AppendFormattedComponent(

596 url.possibly_invalid_spec(), url.parsed_for_possibly_invalid_spec().host,	652 url.possibly_invalid_spec(), url.parsed_for_possibly_invalid_spec().host,

597 HostComponentTransform(), output, NULL, NULL);	653 HostComponentTransform(false /* trim_trivial_subdomains */), output,
	Peter Kasting 2017/07/06 06:01:21 (I still dislike these kinda comments but whatever (I still dislike these kinda comments but whatever) tommycli 2017/07/06 16:25:46 Done. Show quoted text On 2017/07/06 06:01:21, Peter Kasting wrote: > (I still dislike these kinda comments but whatever) Done.
	654 nullptr, nullptr);

598 }	655 }

599	656

600 base::string16 IDNToUnicode(base::StringPiece host) {	657 base::string16 IDNToUnicode(base::StringPiece host) {

601 return IDNToUnicodeWithAdjustments(host, nullptr);	658 return IDNToUnicodeWithAdjustments(host, nullptr);

602 }	659 }

603	660

604 base::string16 StripWWW(const base::string16& text) {	661 base::string16 StripWWW(const base::string16& text) {

605 const base::string16 www(base::ASCIIToUTF16("www."));	662 const base::string16 www(base::ASCIIToUTF16("www."));

606 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)	663 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)

607 ? text.substr(www.length()) : text;	664 ? text.substr(www.length()) : text;

608 }	665 }

609	666

610 base::string16 StripWWWFromHost(const GURL& url) {	667 base::string16 StripWWWFromHost(const GURL& url) {

611 DCHECK(url.is_valid());	668 DCHECK(url.is_valid());

612 return StripWWW(base::ASCIIToUTF16(url.host_piece()));	669 return StripWWW(base::ASCIIToUTF16(url.host_piece()));

613 }	670 }

614	671

615 } // namespace url_formatter	672 } // namespace url_formatter

OLD	NEW