Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(67)

Side by Side Diff: components/search_engines/template_url.cc

Issue 1968303002: Support inexact path matching when extracting terms from Template URL. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/search_engines/template_url.h" 5 #include "components/search_engines/template_url.h"
6 6
7 #include <string> 7 #include <string>
8 #include <vector> 8 #include <vector>
9 9
10 #include "base/command_line.h" 10 #include "base/command_line.h"
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
52 "google:unescapedSearchTerms"; 52 "google:unescapedSearchTerms";
53 const char kGoogleUnescapedSearchTermsParameterFull[] = 53 const char kGoogleUnescapedSearchTermsParameterFull[] =
54 "{google:unescapedSearchTerms}"; 54 "{google:unescapedSearchTerms}";
55 55
56 // Display value for kSearchTermsParameter. 56 // Display value for kSearchTermsParameter.
57 const char kDisplaySearchTerms[] = "%s"; 57 const char kDisplaySearchTerms[] = "%s";
58 58
59 // Display value for kGoogleUnescapedSearchTermsParameter. 59 // Display value for kGoogleUnescapedSearchTermsParameter.
60 const char kDisplayUnescapedSearchTerms[] = "%S"; 60 const char kDisplayUnescapedSearchTerms[] = "%S";
61 61
62 // The text "{google:ignorePathEnding}" at the end of a template path means
63 // the rest of a URL's path is not compared when extracting search terms.
64 const char kGoogleIgnorePathEndingFullEscaped[] =
65 "%7Bgoogle:ignorePathEnding%7D";
66
62 // Used if the count parameter is not optional. Indicates we want 10 search 67 // Used if the count parameter is not optional. Indicates we want 10 search
63 // results. 68 // results.
64 const char kDefaultCount[] = "10"; 69 const char kDefaultCount[] = "10";
65 70
66 // Used if the output encoding parameter is required. 71 // Used if the output encoding parameter is required.
67 const char kOutputEncodingType[] = "UTF-8"; 72 const char kOutputEncodingType[] = "UTF-8";
68 73
69 // Attempts to encode |terms| and |original_query| in |encoding| and escape 74 // Attempts to encode |terms| and |original_query| in |encoding| and escape
70 // them. |terms| may be escaped as path or query depending on |is_in_query|; 75 // them. |terms| may be escaped as path or query depending on |is_in_query|;
71 // |original_query| is always escaped as query. Returns whether the encoding 76 // |original_query| is always escaped as query. Returns whether the encoding
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
134 kGoogleUnescapedSearchTermsParameterFull, 139 kGoogleUnescapedSearchTermsParameterFull,
135 &result.value_prefix, &result.value_suffix)) { 140 &result.value_prefix, &result.value_suffix)) {
136 result.key = params.substr(key.begin, key.len); 141 result.key = params.substr(key.begin, key.len);
137 break; 142 break;
138 } 143 }
139 } 144 }
140 } 145 }
141 return result; 146 return result;
142 } 147 }
143 148
149 struct SearchTermsInPathResult {
150 bool search_terms_found;
151 std::string value_prefix;
152 std::string value_suffix;
153 bool ignore_ending;
154 SearchTermsInPathResult() : search_terms_found(false), ignore_ending(false) {}
155 bool found() const { return search_terms_found; }
156 };
157
144 // Extract the position of the search terms' parameter in the URL path. 158 // Extract the position of the search terms' parameter in the URL path.
145 bool FindSearchTermsInPath(const std::string& path, 159 SearchTermsInPathResult FindSearchTermsInPath(const base::StringPiece& path) {
146 url::Component* parameter_position) { 160 DCHECK(path.starts_with("/"));
147 DCHECK(parameter_position); 161
148 parameter_position->reset(); 162 const base::StringPiece search_terms_parameter(
149 const size_t begin = path.find(kSearchTermsParameterFullEscaped); 163 kSearchTermsParameterFullEscaped);
150 if (begin == std::string::npos) 164 const size_t search_terms_pos = path.find(search_terms_parameter);
151 return false; 165 const size_t search_terms_end = (search_terms_pos == std::string::npos) ?
152 parameter_position->begin = begin; 166 std::string::npos : (search_terms_pos + search_terms_parameter.length());
153 parameter_position->len = arraysize(kSearchTermsParameterFullEscaped) - 1; 167
154 return true; 168 const base::StringPiece ignore_ending_parameter(
169 kGoogleIgnorePathEndingFullEscaped);
170 const size_t ignore_ending_pos = path.find(ignore_ending_parameter);
171
172 bool search_terms_found = (search_terms_pos != std::string::npos);
173 bool ignore_ending = (ignore_ending_pos != std::string::npos);
174 size_t path_end = path.length();
175
176 if (search_terms_found && ignore_ending) {
177 if (ignore_ending_pos < search_terms_pos) {
178 // "{google:ignorePathEnding}" occurs before "{searchTerms}".
179 // Ignore "{searchTerms}".
180 search_terms_found = false;
181 } else if (search_terms_end == ignore_ending_pos) {
182 // No characters occur between "{searchTerms}" and
183 // "{google:ignorePathEnding}". Ignore "{google:ignorePathEnding}".
184 ignore_ending = false;
185 path_end = ignore_ending_pos;
186 }
187 }
188
189 base::StringPiece prefix, suffix;
190 if (search_terms_found) {
191 prefix = path.substr(0, search_terms_pos);
192 if (ignore_ending)
193 suffix = path.substr(search_terms_end,
194 ignore_ending_pos - search_terms_end);
195 else
196 suffix = path.substr(search_terms_end, path_end - search_terms_end);
197 } else {
198 if (ignore_ending)
199 prefix = path.substr(0, ignore_ending_pos);
200 else
201 prefix = path.substr(0, path_end);
202 }
203
204 SearchTermsInPathResult result;
205 result.search_terms_found = search_terms_found;
206 result.ignore_ending = ignore_ending;
207 result.value_prefix = prefix.as_string();
208 result.value_suffix = suffix.as_string();
209 DCHECK(base::StartsWith(result.value_prefix, "/",
210 base::CompareCase::SENSITIVE));
211 DCHECK(!result.search_terms_found || !result.ignore_ending ||
212 !result.value_suffix.empty());
213 return result;
155 } 214 }
156 215
157 bool IsTemplateParameterString(const std::string& param) { 216 bool IsTemplateParameterString(const std::string& param) {
158 return (param.length() > 2) && (*(param.begin()) == kStartParameter) && 217 return (param.length() > 2) && (*(param.begin()) == kStartParameter) &&
159 (*(param.rbegin()) == kEndParameter); 218 (*(param.rbegin()) == kEndParameter);
160 } 219 }
161 220
162 } // namespace 221 } // namespace
163 222
164 223
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after
233 292
234 // TemplateURLRef ------------------------------------------------------------- 293 // TemplateURLRef -------------------------------------------------------------
235 294
236 TemplateURLRef::TemplateURLRef(const TemplateURL* owner, Type type) 295 TemplateURLRef::TemplateURLRef(const TemplateURL* owner, Type type)
237 : owner_(owner), 296 : owner_(owner),
238 type_(type), 297 type_(type),
239 index_in_owner_(0), 298 index_in_owner_(0),
240 parsed_(false), 299 parsed_(false),
241 valid_(false), 300 valid_(false),
242 supports_replacements_(false), 301 supports_replacements_(false),
243 search_term_position_in_path_(std::string::npos), 302 ignore_path_ending_(false),
244 search_term_key_location_(url::Parsed::QUERY), 303 search_term_key_location_(url::Parsed::QUERY),
245 prepopulated_(false) { 304 prepopulated_(false) {
246 DCHECK(owner_); 305 DCHECK(owner_);
247 DCHECK_NE(INDEXED, type_); 306 DCHECK_NE(INDEXED, type_);
248 } 307 }
249 308
250 TemplateURLRef::TemplateURLRef(const TemplateURL* owner, size_t index_in_owner) 309 TemplateURLRef::TemplateURLRef(const TemplateURL* owner, size_t index_in_owner)
251 : owner_(owner), 310 : owner_(owner),
252 type_(INDEXED), 311 type_(INDEXED),
253 index_in_owner_(index_in_owner), 312 index_in_owner_(index_in_owner),
254 parsed_(false), 313 parsed_(false),
255 valid_(false), 314 valid_(false),
256 supports_replacements_(false), 315 supports_replacements_(false),
257 search_term_position_in_path_(std::string::npos), 316 ignore_path_ending_(false),
258 search_term_key_location_(url::Parsed::QUERY), 317 search_term_key_location_(url::Parsed::QUERY),
259 prepopulated_(false) { 318 prepopulated_(false) {
260 DCHECK(owner_); 319 DCHECK(owner_);
261 DCHECK_LT(index_in_owner_, owner_->alternate_urls().size()); 320 DCHECK_LT(index_in_owner_, owner_->alternate_urls().size());
262 } 321 }
263 322
264 TemplateURLRef::~TemplateURLRef() { 323 TemplateURLRef::~TemplateURLRef() {
265 } 324 }
266 325
267 TemplateURLRef::TemplateURLRef(const TemplateURLRef& source) = default; 326 TemplateURLRef::TemplateURLRef(const TemplateURLRef& source) = default;
(...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after
408 ParseIfNecessary(search_terms_data); 467 ParseIfNecessary(search_terms_data);
409 return host_; 468 return host_;
410 } 469 }
411 470
412 const std::string& TemplateURLRef::GetPath( 471 const std::string& TemplateURLRef::GetPath(
413 const SearchTermsData& search_terms_data) const { 472 const SearchTermsData& search_terms_data) const {
414 ParseIfNecessary(search_terms_data); 473 ParseIfNecessary(search_terms_data);
415 return path_; 474 return path_;
416 } 475 }
417 476
477 bool TemplateURLRef::GetIgnorePathEnding(
478 const SearchTermsData& search_terms_data) const {
479 ParseIfNecessary(search_terms_data);
480 return ignore_path_ending_;
481 }
482
418 const std::string& TemplateURLRef::GetSearchTermKey( 483 const std::string& TemplateURLRef::GetSearchTermKey(
419 const SearchTermsData& search_terms_data) const { 484 const SearchTermsData& search_terms_data) const {
420 ParseIfNecessary(search_terms_data); 485 ParseIfNecessary(search_terms_data);
421 return search_term_key_; 486 return search_term_key_;
422 } 487 }
423 488
424 size_t TemplateURLRef::GetSearchTermPositionInPath(
425 const SearchTermsData& search_terms_data) const {
426 ParseIfNecessary(search_terms_data);
427 return search_term_position_in_path_;
428 }
429
430 url::Parsed::ComponentType TemplateURLRef::GetSearchTermKeyLocation( 489 url::Parsed::ComponentType TemplateURLRef::GetSearchTermKeyLocation(
431 const SearchTermsData& search_terms_data) const { 490 const SearchTermsData& search_terms_data) const {
432 ParseIfNecessary(search_terms_data); 491 ParseIfNecessary(search_terms_data);
433 return search_term_key_location_; 492 return search_term_key_location_;
434 } 493 }
435 494
495 const std::string& TemplateURLRef::GetSearchTermValuePrefix(
496 const SearchTermsData& search_terms_data) const {
497 ParseIfNecessary(search_terms_data);
498 return search_term_value_prefix_;
499 }
500
501 const std::string& TemplateURLRef::GetSearchTermValueSuffix(
502 const SearchTermsData& search_terms_data) const {
503 ParseIfNecessary(search_terms_data);
504 return search_term_value_suffix_;
505 }
506
436 base::string16 TemplateURLRef::SearchTermToString16( 507 base::string16 TemplateURLRef::SearchTermToString16(
437 const std::string& term) const { 508 const std::string& term) const {
438 const std::vector<std::string>& encodings = owner_->input_encodings(); 509 const std::vector<std::string>& encodings = owner_->input_encodings();
439 base::string16 result; 510 base::string16 result;
440 511
441 net::UnescapeRule::Type unescape_rules = 512 net::UnescapeRule::Type unescape_rules =
442 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | 513 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
443 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS; 514 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS;
444 if (search_term_key_location_ != url::Parsed::PATH) 515 if (search_term_key_location_ != url::Parsed::PATH)
445 unescape_rules |= net::UnescapeRule::REPLACE_PLUS_WITH_SPACE; 516 unescape_rules |= net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
488 ParseIfNecessary(search_terms_data); 559 ParseIfNecessary(search_terms_data);
489 560
490 // We need a search term in the template URL to extract something. 561 // We need a search term in the template URL to extract something.
491 if (search_term_key_.empty() && 562 if (search_term_key_.empty() &&
492 (search_term_key_location_ != url::Parsed::PATH)) 563 (search_term_key_location_ != url::Parsed::PATH))
493 return false; 564 return false;
494 565
495 // Host, port, and path must match. 566 // Host, port, and path must match.
496 if ((url.host() != host_) || 567 if ((url.host() != host_) ||
497 (url.port() != port_) || 568 (url.port() != port_) ||
498 ((url.path() != path_) && 569 ((search_term_key_location_ != url::Parsed::PATH) &&
499 (search_term_key_location_ != url::Parsed::PATH))) { 570 !MatchPath(url.path()))) {
500 return false; 571 return false;
501 } 572 }
502 573
503 std::string source; 574 std::string source;
504 url::Component position; 575 url::Component position;
505 576
506 if (search_term_key_location_ == url::Parsed::PATH) { 577 if (search_term_key_location_ == url::Parsed::PATH) {
507 source = url.path(); 578 source = url.path();
508 579 if (!MatchPathWithSearchTerms(source, &position))
509 // Characters in the path before and after search terms must match.
510 if (source.length() < path_.length())
511 return false;
512 position.begin = search_term_position_in_path_;
513 position.len = source.length() - path_.length();
514 if (source.substr(0, position.begin) + source.substr(position.end()) !=
515 path_)
516 return false; 580 return false;
517 } else { 581 } else {
518 DCHECK(search_term_key_location_ == url::Parsed::QUERY || 582 DCHECK(search_term_key_location_ == url::Parsed::QUERY ||
519 search_term_key_location_ == url::Parsed::REF); 583 search_term_key_location_ == url::Parsed::REF);
520 source = (search_term_key_location_ == url::Parsed::QUERY) ? 584 source = (search_term_key_location_ == url::Parsed::QUERY) ?
521 url.query() : url.ref(); 585 url.query() : url.ref();
522 586
523 url::Component query, key, value; 587 url::Component query, key, value;
524 query.len = static_cast<int>(source.size()); 588 query.len = static_cast<int>(source.size());
525 bool key_found = false; 589 bool key_found = false;
(...skipping 26 matching lines...) Expand all
552 // Extract the search term. 616 // Extract the search term.
553 *search_terms = 617 *search_terms =
554 SearchTermToString16(source.substr(position.begin, position.len)); 618 SearchTermToString16(source.substr(position.begin, position.len));
555 if (search_terms_component) 619 if (search_terms_component)
556 *search_terms_component = search_term_key_location_; 620 *search_terms_component = search_term_key_location_;
557 if (search_terms_position) 621 if (search_terms_position)
558 *search_terms_position = position; 622 *search_terms_position = position;
559 return true; 623 return true;
560 } 624 }
561 625
626 bool TemplateURLRef::MatchPath(const std::string& path) const {
627 if (search_term_key_location_ == url::Parsed::PATH)
628 return false;
629 if (ignore_path_ending_)
630 return base::StartsWith(path, path_, base::CompareCase::SENSITIVE);
631 else
632 return (path == path_);
633 }
634
635 bool TemplateURLRef::MatchPathWithSearchTerms(
636 const std::string& path,
637 url::Component* search_terms_position) const {
638 if (search_term_key_location_ != url::Parsed::PATH)
639 return false;
640 if (path.length() < search_term_value_prefix_.length() +
641 search_term_value_suffix_.length())
642 return false;
643 if (!base::StartsWith(path, search_term_value_prefix_,
644 base::CompareCase::SENSITIVE))
645 return false;
646 const size_t search_terms_pos = search_term_value_prefix_.length();
647 size_t search_terms_end = std::string::npos;
648 if (ignore_path_ending_) {
649 search_terms_end = path.find(search_term_value_suffix_,
650 search_terms_pos);
651 if (search_terms_end == std::string::npos)
652 return false;
653 } else {
654 if (!base::EndsWith(path, search_term_value_suffix_,
655 base::CompareCase::SENSITIVE))
656 return false;
657 search_terms_end = path.length() - search_term_value_suffix_.length();
658 }
659 DCHECK_NE(std::string::npos, search_terms_end);
660 *search_terms_position = url::MakeRange(search_terms_pos, search_terms_end);
661 return true;
662 }
663
562 void TemplateURLRef::InvalidateCachedValues() const { 664 void TemplateURLRef::InvalidateCachedValues() const {
563 supports_replacements_ = valid_ = parsed_ = false; 665 supports_replacements_ = valid_ = parsed_ = false;
564 host_.clear(); 666 host_.clear();
565 port_.clear(); 667 port_.clear();
566 path_.clear(); 668 path_.clear();
669 ignore_path_ending_ = false;
567 search_term_key_.clear(); 670 search_term_key_.clear();
568 search_term_position_in_path_ = std::string::npos;
569 search_term_key_location_ = url::Parsed::QUERY; 671 search_term_key_location_ = url::Parsed::QUERY;
672 search_term_value_prefix_.clear();
673 search_term_value_suffix_.clear();
570 replacements_.clear(); 674 replacements_.clear();
571 post_params_.clear(); 675 post_params_.clear();
572 } 676 }
573 677
574 bool TemplateURLRef::ParseParameter(size_t start, 678 bool TemplateURLRef::ParseParameter(size_t start,
575 size_t end, 679 size_t end,
576 std::string* url, 680 std::string* url,
577 Replacements* replacements) const { 681 Replacements* replacements) const {
578 DCHECK(start != std::string::npos && 682 DCHECK(start != std::string::npos &&
579 end != std::string::npos && end > start); 683 end != std::string::npos && end > start);
(...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after
812 base::ReplaceSubstringsAfterOffset( 916 base::ReplaceSubstringsAfterOffset(
813 &url_string, 0, "{google:baseSuggestURL}", 917 &url_string, 0, "{google:baseSuggestURL}",
814 search_terms_data.GoogleBaseSuggestURLValue()); 918 search_terms_data.GoogleBaseSuggestURLValue());
815 919
816 GURL url(url_string); 920 GURL url(url_string);
817 if (!url.is_valid()) 921 if (!url.is_valid())
818 return; 922 return;
819 923
820 auto query_result = FindSearchTermsKey(url.query()); 924 auto query_result = FindSearchTermsKey(url.query());
821 auto ref_result = FindSearchTermsKey(url.ref()); 925 auto ref_result = FindSearchTermsKey(url.ref());
822 url::Component parameter_position; 926 auto path_result = FindSearchTermsInPath(url.path());
823 const bool in_query = query_result.found(); 927 const bool in_query = query_result.found();
824 const bool in_ref = ref_result.found(); 928 const bool in_ref = ref_result.found();
825 const bool in_path = FindSearchTermsInPath(url.path(), &parameter_position); 929 const bool in_path = path_result.found();
826 if (in_query ? (in_ref || in_path) : (in_ref == in_path)) 930 if (in_query ? (in_ref || in_path) : (in_ref == in_path))
827 return; // No key or multiple keys found. We only handle having one key. 931 return; // No key or multiple keys found. We only handle having one key.
828 932
829 host_ = url.host(); 933 host_ = url.host();
830 port_ = url.port(); 934 port_ = url.port();
831 path_ = url.path(); 935 path_ = path_result.value_prefix + path_result.value_suffix;
936 ignore_path_ending_ = path_result.ignore_ending;
832 if (in_query) { 937 if (in_query) {
833 search_term_key_ = query_result.key; 938 search_term_key_ = query_result.key;
834 search_term_key_location_ = url::Parsed::QUERY; 939 search_term_key_location_ = url::Parsed::QUERY;
835 search_term_value_prefix_ = query_result.value_prefix; 940 search_term_value_prefix_ = query_result.value_prefix;
836 search_term_value_suffix_ = query_result.value_suffix; 941 search_term_value_suffix_ = query_result.value_suffix;
837 } else if (in_ref) { 942 } else if (in_ref) {
838 search_term_key_ = ref_result.key; 943 search_term_key_ = ref_result.key;
839 search_term_key_location_ = url::Parsed::REF; 944 search_term_key_location_ = url::Parsed::REF;
840 search_term_value_prefix_ = ref_result.value_prefix; 945 search_term_value_prefix_ = ref_result.value_prefix;
841 search_term_value_suffix_ = ref_result.value_suffix; 946 search_term_value_suffix_ = ref_result.value_suffix;
842 } else { 947 } else {
843 DCHECK(in_path); 948 DCHECK(in_path);
844 DCHECK_GE(parameter_position.begin, 1); // Path must start with '/'.
845 search_term_key_location_ = url::Parsed::PATH; 949 search_term_key_location_ = url::Parsed::PATH;
846 search_term_position_in_path_ = parameter_position.begin; 950 search_term_value_prefix_ = path_result.value_prefix;
847 // Remove the "{searchTerms}" itself from |path_|. 951 search_term_value_suffix_ = path_result.value_suffix;
848 path_.erase(parameter_position.begin, parameter_position.len);
849 } 952 }
850 } 953 }
851 954
852 void TemplateURLRef::HandleReplacement(const std::string& name, 955 void TemplateURLRef::HandleReplacement(const std::string& name,
853 const std::string& value, 956 const std::string& value,
854 const Replacement& replacement, 957 const Replacement& replacement,
855 std::string* url) const { 958 std::string* url) const {
856 size_t pos = replacement.index; 959 size_t pos = replacement.index;
857 if (replacement.is_post_param) { 960 if (replacement.is_post_param) {
858 DCHECK_LT(pos, post_params_.size()); 961 DCHECK_LT(pos, post_params_.size());
(...skipping 676 matching lines...) Expand 10 before | Expand all | Expand 10 after
1535 // patterns. This means that given patterns 1638 // patterns. This means that given patterns
1536 // [ "http://foo/#q={searchTerms}", "http://foo/?q={searchTerms}" ], 1639 // [ "http://foo/#q={searchTerms}", "http://foo/?q={searchTerms}" ],
1537 // calling ExtractSearchTermsFromURL() on "http://foo/?q=bar#q=" would 1640 // calling ExtractSearchTermsFromURL() on "http://foo/?q=bar#q=" would
1538 // return false. This is important for at least Google, where such URLs 1641 // return false. This is important for at least Google, where such URLs
1539 // are invalid. 1642 // are invalid.
1540 return !search_terms->empty(); 1643 return !search_terms->empty();
1541 } 1644 }
1542 } 1645 }
1543 return false; 1646 return false;
1544 } 1647 }
OLDNEW
« no previous file with comments | « components/search_engines/template_url.h ('k') | components/search_engines/template_url_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698