Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/url_formatter/url_formatter.h" | 5 #include "components/url_formatter/url_formatter.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <utility> | 8 #include <utility> |
| 9 #include <vector> | 9 #include <vector> |
| 10 | 10 |
| 11 #include "base/lazy_instance.h" | 11 #include "base/lazy_instance.h" |
| 12 #include "base/macros.h" | 12 #include "base/macros.h" |
| 13 #include "base/numerics/safe_conversions.h" | 13 #include "base/numerics/safe_conversions.h" |
| 14 #include "base/strings/string_piece.h" | 14 #include "base/strings/string_piece.h" |
| 15 #include "base/strings/string_tokenizer.h" | |
| 15 #include "base/strings/string_util.h" | 16 #include "base/strings/string_util.h" |
| 16 #include "base/strings/utf_offset_string_conversions.h" | 17 #include "base/strings/utf_offset_string_conversions.h" |
| 17 #include "base/strings/utf_string_conversions.h" | 18 #include "base/strings/utf_string_conversions.h" |
| 18 #include "base/threading/thread_local_storage.h" | 19 #include "base/threading/thread_local_storage.h" |
| 19 #include "components/url_formatter/idn_spoof_checker.h" | 20 #include "components/url_formatter/idn_spoof_checker.h" |
| 21 #include "net/base/registry_controlled_domains/registry_controlled_domain.h" | |
| 20 #include "third_party/icu/source/common/unicode/uidna.h" | 22 #include "third_party/icu/source/common/unicode/uidna.h" |
| 21 #include "third_party/icu/source/common/unicode/utypes.h" | 23 #include "third_party/icu/source/common/unicode/utypes.h" |
| 22 #include "url/gurl.h" | 24 #include "url/gurl.h" |
| 23 #include "url/third_party/mozilla/url_parse.h" | 25 #include "url/third_party/mozilla/url_parse.h" |
| 24 | 26 |
| 25 namespace url_formatter { | 27 namespace url_formatter { |
| 26 | 28 |
| 27 namespace { | 29 namespace { |
| 28 | 30 |
| 29 base::string16 IDNToUnicodeWithAdjustments( | 31 base::string16 IDNToUnicodeWithAdjustments( |
| 30 base::StringPiece host, | 32 base::StringPiece host, |
| 31 base::OffsetAdjuster::Adjustments* adjustments); | 33 base::OffsetAdjuster::Adjustments* adjustments); |
| 32 bool IDNToUnicodeOneComponent(const base::char16* comp, | 34 bool IDNToUnicodeOneComponent(const base::char16* comp, |
| 33 size_t comp_len, | 35 size_t comp_len, |
| 34 bool is_tld_ascii, | 36 bool is_tld_ascii, |
| 35 base::string16* out); | 37 base::string16* out); |
| 36 | 38 |
| 37 class AppendComponentTransform { | 39 class AppendComponentTransform { |
| 38 public: | 40 public: |
| 39 AppendComponentTransform() {} | 41 AppendComponentTransform() {} |
| 40 virtual ~AppendComponentTransform() {} | 42 virtual ~AppendComponentTransform() {} |
| 41 | 43 |
| 42 virtual base::string16 Execute( | 44 virtual base::string16 Execute( |
| 43 const std::string& component_text, | 45 const std::string& component_text, |
| 44 base::OffsetAdjuster::Adjustments* adjustments) const = 0; | 46 base::OffsetAdjuster::Adjustments* adjustments) const = 0; |
| 45 | 47 |
| 46 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an | 48 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an |
| 47 // accessible copy constructor in order to call AppendFormattedComponent() | 49 // accessible copy constructor in order to call AppendFormattedComponent() |
| 48 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ). | 50 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ). |
|
Peter Kasting
2017/07/06 06:01:21
FWIW, I think we could probably replace this comment…
tommycli
2017/07/06 16:25:46
Acknowledged.
| |
| 49 }; | 51 }; |
| 50 | 52 |
| 51 class HostComponentTransform : public AppendComponentTransform { | 53 class HostComponentTransform : public AppendComponentTransform { |
| 52 public: | 54 public: |
| 53 HostComponentTransform() {} | 55 HostComponentTransform(bool trim_trivial_subdomains) |
| 56 : trim_trivial_subdomains_(trim_trivial_subdomains) {} | |
| 54 | 57 |
| 55 private: | 58 private: |
| 56 base::string16 Execute( | 59 base::string16 Execute( |
| 57 const std::string& component_text, | 60 const std::string& component_text, |
| 58 base::OffsetAdjuster::Adjustments* adjustments) const override { | 61 base::OffsetAdjuster::Adjustments* adjustments) const override { |
| 59 return IDNToUnicodeWithAdjustments(component_text, adjustments); | 62 if (!trim_trivial_subdomains_) |
| 63 return IDNToUnicodeWithAdjustments(component_text, adjustments); | |
| 64 | |
| 65 // Exclude the registry and domain from trivial subdomain stripping. | |
| 66 // To get the adjustment offset calculations correct, we need to transform | |
| 67 // the registry and domain portion of the host as well. | |
| 68 std::string domain_and_registry = | |
| 69 net::registry_controlled_domains::GetDomainAndRegistry( | |
| 70 component_text, | |
| 71 net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES); | |
| 72 | |
| 73 base::OffsetAdjuster::Adjustments trivial_subdomains_adjustments; | |
| 74 base::StringTokenizer t(component_text.begin(), | |
|
Peter Kasting
2017/07/06 06:01:21
Nit: t -> tokenizer (avoid abbreviation in general
tommycli
2017/07/06 16:25:46
Done.
| |
| 75 component_text.end() - domain_and_registry.length(), | |
| 76 "."); | |
| 77 t.set_options(base::StringTokenizer::RETURN_DELIMS); | |
| 78 | |
| 79 std::string new_subdomain_string; | |
|
Peter Kasting
2017/07/06 06:01:21
Nit: |transformed_subdomain|?
tommycli
2017/07/06 16:25:46
Done.
| |
| 80 size_t offset = 0; | |
| 81 while (t.GetNext()) { | |
| 82 // Append delimiters and non-trivial subdomains to the new subdomain part. | |
| 83 if (t.token_is_delim() || (t.token() != "m" && t.token() != "www")) { | |
| 84 new_subdomain_string.append(t.token()); | |
|
Peter Kasting
2017/07/06 06:01:21
Nit: append() is fine, I find += a little more idi
tommycli
2017/07/06 16:25:46
Done. Ah, I did not realize that string had operator…
| |
| 85 offset += t.token().length(); | |
| 86 continue; | |
| 87 } | |
| 88 | |
| 89 // When we find a trivial subdomain, we simply do not append to | |
| 90 // |new_subdomain_string|. We also consume the next token, which must be | |
| 91 // a delimiter. | |
| 92 size_t trivial_subdomain_length = t.token().length(); | |
| 93 bool next_delimiter_found = t.GetNext(); | |
| 94 DCHECK(next_delimiter_found); | |
| 95 DCHECK(t.token_is_delim()); | |
| 96 | |
| 97 // Add an adjustment accounting for the consumed subdomain and delimiter. | |
| 98 trivial_subdomains_adjustments.push_back(base::OffsetAdjuster::Adjustment( | |
| 99 offset, trivial_subdomain_length + 1, 0)); | |
| 100 offset += t.token().length() + 1; | |
|
Peter Kasting
2017/07/06 06:01:21
Wait, is this right? I think t.token() is pointing…
tommycli
2017/07/06 16:25:46
Done. Good suggestion on re-using the tokenizer it
Peter Kasting
2017/07/06 16:31:47
Yeah, the new version feels a lot cleaner to me now…
| |
| 101 } | |
| 102 | |
| 103 std::string new_component_text = new_subdomain_string + domain_and_registry; | |
| 104 base::string16 unicode_result = | |
| 105 IDNToUnicodeWithAdjustments(new_component_text, adjustments); | |
| 106 | |
| 107 base::OffsetAdjuster::MergeSequentialAdjustments( | |
| 108 trivial_subdomains_adjustments, adjustments); | |
| 109 return unicode_result; | |
| 60 } | 110 } |
| 111 | |
| 112 bool trim_trivial_subdomains_; | |
| 61 }; | 113 }; |
| 62 | 114 |
| 63 class NonHostComponentTransform : public AppendComponentTransform { | 115 class NonHostComponentTransform : public AppendComponentTransform { |
| 64 public: | 116 public: |
| 65 explicit NonHostComponentTransform(net::UnescapeRule::Type unescape_rules) | 117 explicit NonHostComponentTransform(net::UnescapeRule::Type unescape_rules) |
| 66 : unescape_rules_(unescape_rules) {} | 118 : unescape_rules_(unescape_rules) {} |
| 67 | 119 |
| 68 private: | 120 private: |
| 69 base::string16 Execute( | 121 base::string16 Execute( |
| 70 const std::string& component_text, | 122 const std::string& component_text, |
| (...skipping 284 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 355 | 407 |
| 356 const FormatUrlType kFormatUrlOmitNothing = 0; | 408 const FormatUrlType kFormatUrlOmitNothing = 0; |
| 357 const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0; | 409 const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0; |
| 358 const FormatUrlType kFormatUrlOmitHTTP = 1 << 1; | 410 const FormatUrlType kFormatUrlOmitHTTP = 1 << 1; |
| 359 const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2; | 411 const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2; |
| 360 const FormatUrlType kFormatUrlOmitAll = | 412 const FormatUrlType kFormatUrlOmitAll = |
| 361 kFormatUrlOmitUsernamePassword | kFormatUrlOmitHTTP | | 413 kFormatUrlOmitUsernamePassword | kFormatUrlOmitHTTP | |
| 362 kFormatUrlOmitTrailingSlashOnBareHostname; | 414 kFormatUrlOmitTrailingSlashOnBareHostname; |
| 363 const FormatUrlType kFormatUrlExperimentalElideAfterHost = 1 << 3; | 415 const FormatUrlType kFormatUrlExperimentalElideAfterHost = 1 << 3; |
| 364 const FormatUrlType kFormatUrlExperimentalOmitHTTPS = 1 << 4; | 416 const FormatUrlType kFormatUrlExperimentalOmitHTTPS = 1 << 4; |
| 417 const FormatUrlType kFormatUrlExperimentalOmitTrivialSubdomains = 1 << 5; | |
| 365 | 418 |
| 366 base::string16 FormatUrl(const GURL& url, | 419 base::string16 FormatUrl(const GURL& url, |
| 367 FormatUrlTypes format_types, | 420 FormatUrlTypes format_types, |
| 368 net::UnescapeRule::Type unescape_rules, | 421 net::UnescapeRule::Type unescape_rules, |
| 369 url::Parsed* new_parsed, | 422 url::Parsed* new_parsed, |
| 370 size_t* prefix_end, | 423 size_t* prefix_end, |
| 371 size_t* offset_for_adjustment) { | 424 size_t* offset_for_adjustment) { |
| 372 std::vector<size_t> offsets; | 425 std::vector<size_t> offsets; |
| 373 if (offset_for_adjustment) | 426 if (offset_for_adjustment) |
| 374 offsets.push_back(*offset_for_adjustment); | 427 offsets.push_back(*offset_for_adjustment); |
| (...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 474 AppendFormattedComponent(spec, parsed.password, | 527 AppendFormattedComponent(spec, parsed.password, |
| 475 NonHostComponentTransform(unescape_rules), | 528 NonHostComponentTransform(unescape_rules), |
| 476 &url_string, &new_parsed->password, adjustments); | 529 &url_string, &new_parsed->password, adjustments); |
| 477 if (parsed.username.is_valid() || parsed.password.is_valid()) | 530 if (parsed.username.is_valid() || parsed.password.is_valid()) |
| 478 url_string.push_back('@'); | 531 url_string.push_back('@'); |
| 479 } | 532 } |
| 480 if (prefix_end) | 533 if (prefix_end) |
| 481 *prefix_end = static_cast<size_t>(url_string.length()); | 534 *prefix_end = static_cast<size_t>(url_string.length()); |
| 482 | 535 |
| 483 // Host. | 536 // Host. |
| 484 AppendFormattedComponent(spec, parsed.host, HostComponentTransform(), | 537 bool trim_trivial_subdomains = |
| 538 (format_types & kFormatUrlExperimentalOmitTrivialSubdomains) != 0; | |
| 539 AppendFormattedComponent(spec, parsed.host, | |
| 540 HostComponentTransform(trim_trivial_subdomains), | |
| 485 &url_string, &new_parsed->host, adjustments); | 541 &url_string, &new_parsed->host, adjustments); |
| 486 | 542 |
| 487 // Port. | 543 // Port. |
| 488 if (parsed.port.is_nonempty()) { | 544 if (parsed.port.is_nonempty()) { |
| 489 url_string.push_back(':'); | 545 url_string.push_back(':'); |
| 490 new_parsed->port.begin = url_string.length(); | 546 new_parsed->port.begin = url_string.length(); |
| 491 url_string.insert(url_string.end(), spec.begin() + parsed.port.begin, | 547 url_string.insert(url_string.end(), spec.begin() + parsed.port.begin, |
| 492 spec.begin() + parsed.port.end()); | 548 spec.begin() + parsed.port.end()); |
| 493 new_parsed->port.len = url_string.length() - new_parsed->port.begin; | 549 new_parsed->port.len = url_string.length() - new_parsed->port.begin; |
| 494 } else { | 550 } else { |
| (...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 587 bool CanStripTrailingSlash(const GURL& url) { | 643 bool CanStripTrailingSlash(const GURL& url) { |
| 588 // Omit the path only for standard, non-file URLs with nothing but "/" after | 644 // Omit the path only for standard, non-file URLs with nothing but "/" after |
| 589 // the hostname. | 645 // the hostname. |
| 590 return url.IsStandard() && !url.SchemeIsFile() && !url.SchemeIsFileSystem() && | 646 return url.IsStandard() && !url.SchemeIsFile() && !url.SchemeIsFileSystem() && |
| 591 !url.has_query() && !url.has_ref() && url.path_piece() == "/"; | 647 !url.has_query() && !url.has_ref() && url.path_piece() == "/"; |
| 592 } | 648 } |
| 593 | 649 |
| 594 void AppendFormattedHost(const GURL& url, base::string16* output) { | 650 void AppendFormattedHost(const GURL& url, base::string16* output) { |
| 595 AppendFormattedComponent( | 651 AppendFormattedComponent( |
| 596 url.possibly_invalid_spec(), url.parsed_for_possibly_invalid_spec().host, | 652 url.possibly_invalid_spec(), url.parsed_for_possibly_invalid_spec().host, |
| 597 HostComponentTransform(), output, NULL, NULL); | 653 HostComponentTransform(false /* trim_trivial_subdomains */), output, |
|
Peter Kasting
2017/07/06 06:01:21
(I still dislike these kinda comments but whatever
tommycli
2017/07/06 16:25:46
Done.
| |
| 654 nullptr, nullptr); | |
| 598 } | 655 } |
| 599 | 656 |
| 600 base::string16 IDNToUnicode(base::StringPiece host) { | 657 base::string16 IDNToUnicode(base::StringPiece host) { |
| 601 return IDNToUnicodeWithAdjustments(host, nullptr); | 658 return IDNToUnicodeWithAdjustments(host, nullptr); |
| 602 } | 659 } |
| 603 | 660 |
| 604 base::string16 StripWWW(const base::string16& text) { | 661 base::string16 StripWWW(const base::string16& text) { |
| 605 const base::string16 www(base::ASCIIToUTF16("www.")); | 662 const base::string16 www(base::ASCIIToUTF16("www.")); |
| 606 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) | 663 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) |
| 607 ? text.substr(www.length()) : text; | 664 ? text.substr(www.length()) : text; |
| 608 } | 665 } |
| 609 | 666 |
| 610 base::string16 StripWWWFromHost(const GURL& url) { | 667 base::string16 StripWWWFromHost(const GURL& url) { |
| 611 DCHECK(url.is_valid()); | 668 DCHECK(url.is_valid()); |
| 612 return StripWWW(base::ASCIIToUTF16(url.host_piece())); | 669 return StripWWW(base::ASCIIToUTF16(url.host_piece())); |
| 613 } | 670 } |
| 614 | 671 |
| 615 } // namespace url_formatter | 672 } // namespace url_formatter |
| OLD | NEW |