OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/omnibox/browser/scored_history_match.h" | 5 #include "components/omnibox/browser/scored_history_match.h" |
6 | 6 |
7 #include <math.h> | 7 #include <math.h> |
8 | 8 |
9 #include <algorithm> | 9 #include <algorithm> |
10 #include <vector> | 10 #include <vector> |
11 | 11 |
12 #include "base/logging.h" | 12 #include "base/logging.h" |
13 #include "base/macros.h" | |
13 #include "base/numerics/safe_conversions.h" | 14 #include "base/numerics/safe_conversions.h" |
14 #include "base/strings/string_number_conversions.h" | 15 #include "base/strings/string_number_conversions.h" |
15 #include "base/strings/string_split.h" | 16 #include "base/strings/string_split.h" |
16 #include "base/strings/string_util.h" | 17 #include "base/strings/string_util.h" |
17 #include "base/strings/utf_offset_string_conversions.h" | 18 #include "base/strings/utf_offset_string_conversions.h" |
18 #include "base/strings/utf_string_conversions.h" | 19 #include "base/strings/utf_string_conversions.h" |
19 #include "components/bookmarks/browser/bookmark_utils.h" | 20 #include "components/bookmarks/browser/bookmark_utils.h" |
20 #include "components/omnibox/browser/history_url_provider.h" | 21 #include "components/omnibox/browser/history_url_provider.h" |
21 #include "components/omnibox/browser/omnibox_field_trial.h" | 22 #include "components/omnibox/browser/omnibox_field_trial.h" |
22 #include "components/omnibox/browser/url_prefix.h" | 23 #include "components/omnibox/browser/url_prefix.h" |
23 | 24 |
24 namespace { | 25 namespace { |
25 | 26 |
26 // The number of days of recency scores to precompute. | 27 // The number of days of recency scores to precompute. |
27 const int kDaysToPrecomputeRecencyScoresFor = 366; | 28 const int kDaysToPrecomputeRecencyScoresFor = 366; |
28 | 29 |
29 // The number of raw term score buckets use; raw term scores greater this are | 30 // The number of raw term score buckets use; raw term scores greater this are |
30 // capped at the score of the largest bucket. | 31 // capped at the score of the largest bucket. |
31 const int kMaxRawTermScore = 30; | 32 const int kMaxRawTermScore = 30; |
32 | 33 |
34 // Default relevance buckets. See GetFinalRelevancyScore() for more details on | |
35 // these numbers. | |
36 const char kDefaultRelevanceBucketsString[] = | |
37 "0.0:400,1.5:600,5.0:900,10.5:1203,15.0:1300,20.0:1399"; | |
38 | |
33 // Pre-computed information to speed up calculating recency scores. | 39 // Pre-computed information to speed up calculating recency scores. |
34 // |days_ago_to_recency_score| is a simple array mapping how long ago a page was | 40 // |days_ago_to_recency_score| is a simple array mapping how long ago a page was |
35 // visited (in days) to the recency score we should assign it. This allows easy | 41 // visited (in days) to the recency score we should assign it. This allows easy |
36 // lookups of scores without requiring math. This is initialized by | 42 // lookups of scores without requiring math. This is initialized by |
37 // InitDaysAgoToRecencyScoreArray called by | 43 // InitDaysAgoToRecencyScoreArray called by |
38 // ScoredHistoryMatch::Init(). | 44 // ScoredHistoryMatch::Init(). |
39 float days_ago_to_recency_score[kDaysToPrecomputeRecencyScoresFor]; | 45 float days_ago_to_recency_score[kDaysToPrecomputeRecencyScoresFor]; |
40 | 46 |
41 // Pre-computed information to speed up calculating topicality scores. | 47 // Pre-computed information to speed up calculating topicality scores. |
42 // |raw_term_score_to_topicality_score| is a simple array mapping how raw terms | 48 // |raw_term_score_to_topicality_score| is a simple array mapping how raw terms |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
106 // static | 112 // static |
107 bool ScoredHistoryMatch::also_do_hup_like_scoring_; | 113 bool ScoredHistoryMatch::also_do_hup_like_scoring_; |
108 float ScoredHistoryMatch::bookmark_value_; | 114 float ScoredHistoryMatch::bookmark_value_; |
109 float ScoredHistoryMatch::typed_value_; | 115 float ScoredHistoryMatch::typed_value_; |
110 bool ScoredHistoryMatch::fix_few_visits_bug_; | 116 bool ScoredHistoryMatch::fix_few_visits_bug_; |
111 bool ScoredHistoryMatch::frequency_uses_sum_; | 117 bool ScoredHistoryMatch::frequency_uses_sum_; |
112 size_t ScoredHistoryMatch::max_visits_to_score_; | 118 size_t ScoredHistoryMatch::max_visits_to_score_; |
113 bool ScoredHistoryMatch::allow_tld_matches_; | 119 bool ScoredHistoryMatch::allow_tld_matches_; |
114 bool ScoredHistoryMatch::allow_scheme_matches_; | 120 bool ScoredHistoryMatch::allow_scheme_matches_; |
115 size_t ScoredHistoryMatch::num_title_words_to_allow_; | 121 size_t ScoredHistoryMatch::num_title_words_to_allow_; |
116 bool ScoredHistoryMatch::hqp_experimental_scoring_enabled_; | 122 float ScoredHistoryMatch::topicality_threshold_; |
117 | 123 ScoredHistoryMatch::ScoreMaxRelevances* |
118 // Default topicality threshold. See GetTopicalityScore() for details. | 124 ScoredHistoryMatch::relevance_buckets_override_ = nullptr; |
119 float ScoredHistoryMatch::topicality_threshold_ = 0.8f; | |
120 | |
121 // Default HQP relevance buckets. See GetFinalRelevancyScore() for more details | |
122 // on these numbers. | |
123 char ScoredHistoryMatch::hqp_relevance_buckets_str_[] = | |
124 "0.0:400,1.5:600,5.0:900,10.5:1203,15.0:1300,20.0:1399"; | |
125 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>* | |
126 ScoredHistoryMatch::hqp_relevance_buckets_ = nullptr; | |
127 | 125 |
128 ScoredHistoryMatch::ScoredHistoryMatch() | 126 ScoredHistoryMatch::ScoredHistoryMatch() |
129 : ScoredHistoryMatch(history::URLRow(), | 127 : ScoredHistoryMatch(history::URLRow(), |
130 VisitInfoVector(), | 128 VisitInfoVector(), |
131 base::string16(), | 129 base::string16(), |
132 String16Vector(), | 130 String16Vector(), |
133 WordStarts(), | 131 WordStarts(), |
134 RowWordStarts(), | 132 RowWordStarts(), |
135 false, | 133 false, |
136 base::Time::Max()) { | 134 base::Time::Max()) { |
(...skipping 117 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
254 likely_can_inline = true; | 252 likely_can_inline = true; |
255 innermost_match = (best_inlineable_prefix->num_components == | 253 innermost_match = (best_inlineable_prefix->num_components == |
256 best_prefix->num_components); | 254 best_prefix->num_components); |
257 } | 255 } |
258 } | 256 } |
259 } | 257 } |
260 | 258 |
261 const float topicality_score = GetTopicalityScore( | 259 const float topicality_score = GetTopicalityScore( |
262 terms_vector.size(), url, terms_to_word_starts_offsets, word_starts); | 260 terms_vector.size(), url, terms_to_word_starts_offsets, word_starts); |
263 const float frequency_score = GetFrequency(now, is_url_bookmarked, visits); | 261 const float frequency_score = GetFrequency(now, is_url_bookmarked, visits); |
264 raw_score = base::saturated_cast<int>(GetFinalRelevancyScore( | 262 raw_score = base::saturated_cast<int>( |
265 topicality_score, frequency_score, *hqp_relevance_buckets_)); | 263 GetFinalRelevancyScore(topicality_score, frequency_score)); |
266 | 264 |
267 if (also_do_hup_like_scoring_ && likely_can_inline) { | 265 if (also_do_hup_like_scoring_ && likely_can_inline) { |
268 // HistoryURL-provider-like scoring gives any match that is | 266 // HistoryURL-provider-like scoring gives any match that is |
269 // capable of being inlined a certain minimum score. Some of these | 267 // capable of being inlined a certain minimum score. Some of these |
270 // are given a higher score that lets them be shown in inline. | 268 // are given a higher score that lets them be shown in inline. |
271 // This test here derives from the test in | 269 // This test here derives from the test in |
272 // HistoryURLProvider::PromoteMatchForInlineAutocomplete(). | 270 // HistoryURLProvider::PromoteMatchForInlineAutocomplete(). |
273 const bool promote_to_inline = | 271 const bool promote_to_inline = |
274 (row.typed_count() > 1) || (IsHostOnly() && (row.typed_count() == 1)); | 272 (row.typed_count() > 1) || (IsHostOnly() && (row.typed_count() == 1)); |
275 int hup_like_score = | 273 int hup_like_score = |
(...skipping 132 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
408 initialized = true; | 406 initialized = true; |
409 also_do_hup_like_scoring_ = OmniboxFieldTrial::HQPAlsoDoHUPLikeScoring(); | 407 also_do_hup_like_scoring_ = OmniboxFieldTrial::HQPAlsoDoHUPLikeScoring(); |
410 bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue(); | 408 bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue(); |
411 typed_value_ = OmniboxFieldTrial::HQPTypedValue(); | 409 typed_value_ = OmniboxFieldTrial::HQPTypedValue(); |
412 max_visits_to_score_ = OmniboxFieldTrial::HQPMaxVisitsToScore(); | 410 max_visits_to_score_ = OmniboxFieldTrial::HQPMaxVisitsToScore(); |
413 frequency_uses_sum_ = OmniboxFieldTrial::HQPFreqencyUsesSum(); | 411 frequency_uses_sum_ = OmniboxFieldTrial::HQPFreqencyUsesSum(); |
414 fix_few_visits_bug_ = OmniboxFieldTrial::HQPFixFewVisitsBug(); | 412 fix_few_visits_bug_ = OmniboxFieldTrial::HQPFixFewVisitsBug(); |
415 allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue(); | 413 allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue(); |
416 allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue(); | 414 allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue(); |
417 num_title_words_to_allow_ = OmniboxFieldTrial::HQPNumTitleWordsToAllow(); | 415 num_title_words_to_allow_ = OmniboxFieldTrial::HQPNumTitleWordsToAllow(); |
416 topicality_threshold_ = | |
417 OmniboxFieldTrial::HQPExperimentalTopicalityThreshold(); | |
418 | 418 |
419 InitRawTermScoreToTopicalityScoreArray(); | 419 InitRawTermScoreToTopicalityScoreArray(); |
420 InitDaysAgoToRecencyScoreArray(); | 420 InitDaysAgoToRecencyScoreArray(); |
421 InitHQPExperimentalParams(); | |
422 } | 421 } |
423 | 422 |
424 float ScoredHistoryMatch::GetTopicalityScore( | 423 float ScoredHistoryMatch::GetTopicalityScore( |
425 const int num_terms, | 424 const int num_terms, |
426 const base::string16& url, | 425 const base::string16& url, |
427 const WordStarts& terms_to_word_starts_offsets, | 426 const WordStarts& terms_to_word_starts_offsets, |
428 const RowWordStarts& word_starts) { | 427 const RowWordStarts& word_starts) { |
429 // A vector that accumulates per-term scores. The strongest match--a | 428 // A vector that accumulates per-term scores. The strongest match--a |
430 // match in the hostname at a word boundary--is worth 10 points. | 429 // match in the hostname at a word boundary--is worth 10 points. |
431 // Everything else is less. In general, a match that's not at a word | 430 // Everything else is less. In general, a match that's not at a word |
(...skipping 178 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
610 // Use |max_visits_to_score_| as the denominator for the average regardless of | 609 // Use |max_visits_to_score_| as the denominator for the average regardless of |
611 // how many visits there were in order to penalize a match that has | 610 // how many visits there were in order to penalize a match that has |
612 // fewer visits than kMaxVisitsToScore. | 611 // fewer visits than kMaxVisitsToScore. |
613 if (fix_few_visits_bug_) | 612 if (fix_few_visits_bug_) |
614 return summed_visit_points / ScoredHistoryMatch::max_visits_to_score_; | 613 return summed_visit_points / ScoredHistoryMatch::max_visits_to_score_; |
615 return visits.size() * summed_visit_points / | 614 return visits.size() * summed_visit_points / |
616 ScoredHistoryMatch::max_visits_to_score_; | 615 ScoredHistoryMatch::max_visits_to_score_; |
617 } | 616 } |
618 | 617 |
619 // static | 618 // static |
620 float ScoredHistoryMatch::GetFinalRelevancyScore( | 619 float ScoredHistoryMatch::GetFinalRelevancyScore(float topicality_score, |
621 float topicality_score, | 620 float frequency_score) { |
622 float frequency_score, | 621 // |relevance_buckets| gives a mapping from intemerdiate score to the final |
623 const std::vector<ScoreMaxRelevance>& hqp_relevance_buckets) { | 622 // relevance score. |
624 DCHECK(hqp_relevance_buckets.size() > 0); | 623 CR_DEFINE_STATIC_LOCAL(ScoreMaxRelevances, default_relevance_buckets, |
625 DCHECK_EQ(hqp_relevance_buckets[0].first, 0.0); | 624 (GetHQPBuckets())); |
625 ScoreMaxRelevances* relevance_buckets = relevance_buckets_override_ | |
626 ? relevance_buckets_override_ : &default_relevance_buckets; | |
Peter Kasting
2016/12/10 02:31:15
Nit: Bad indenting (See what git cl format would d
Mark P
2016/12/11 04:57:22
Done.
| |
627 DCHECK(!relevance_buckets->empty()); | |
628 DCHECK_EQ(0.0, (*relevance_buckets)[0].first); | |
626 | 629 |
627 if (topicality_score == 0) | 630 if (topicality_score == 0) |
628 return 0; | 631 return 0; |
629 // Here's how to interpret intermediate_score: Suppose the omnibox | 632 // Here's how to interpret intermediate_score: Suppose the omnibox |
630 // has one input term. Suppose we have a URL for which the omnibox | 633 // has one input term. Suppose we have a URL for which the omnibox |
631 // input term has a single URL hostname hit at a word boundary. (This | 634 // input term has a single URL hostname hit at a word boundary. (This |
632 // implies topicality_score = 1.0.). Then the intermediate_score for | 635 // implies topicality_score = 1.0.). Then the intermediate_score for |
633 // this URL will depend entirely on the frequency_score with | 636 // this URL will depend entirely on the frequency_score with |
634 // this interpretation: | 637 // this interpretation: |
635 // - a single typed visit more than three months ago, no other visits -> 0.2 | 638 // - a single typed visit more than three months ago, no other visits -> 0.2 |
636 // - a visit every three days, no typed visits -> 0.706 | 639 // - a visit every three days, no typed visits -> 0.706 |
637 // - a visit every day, no typed visits -> 0.916 | 640 // - a visit every day, no typed visits -> 0.916 |
638 // - a single typed visit yesterday, no other visits -> 2.0 | 641 // - a single typed visit yesterday, no other visits -> 2.0 |
639 // - a typed visit once a week -> 11.77 | 642 // - a typed visit once a week -> 11.77 |
640 // - a typed visit every three days -> 14.12 | 643 // - a typed visit every three days -> 14.12 |
641 // - at least ten typed visits today -> 20.0 (maximum score) | 644 // - at least ten typed visits today -> 20.0 (maximum score) |
642 // | 645 // |
643 // The below code maps intermediate_score to the range [0, 1399]. | 646 // The below code maps intermediate_score to the range [0, 1399]. |
644 // For example: | 647 // For example: |
645 // HQP default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399" | 648 // The default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399" |
646 // We will linearly interpolate the scores between: | 649 // We will linearly interpolate the scores between: |
647 // 0 to 1.5 --> 400 to 600 | 650 // 0 to 1.5 --> 400 to 600 |
648 // 1.5 to 12.0 --> 600 to 1300 | 651 // 1.5 to 12.0 --> 600 to 1300 |
649 // 12.0 to 20.0 --> 1300 to 1399 | 652 // 12.0 to 20.0 --> 1300 to 1399 |
650 // >= 20.0 --> 1399 | 653 // >= 20.0 --> 1399 |
651 // | 654 // |
652 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result | 655 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result |
653 // from HistoryURL provider). | 656 // from HistoryURL provider). |
654 const float intermediate_score = topicality_score * frequency_score; | 657 const float intermediate_score = topicality_score * frequency_score; |
655 | 658 |
656 // Find the threshold where intermediate score is greater than bucket. | 659 // Find the threshold where intermediate score is greater than bucket. |
657 size_t i = 1; | 660 size_t i = 1; |
658 for (; i < hqp_relevance_buckets.size(); ++i) { | 661 for (; i < relevance_buckets->size(); ++i) { |
659 const ScoreMaxRelevance& hqp_bucket = hqp_relevance_buckets[i]; | 662 const ScoreMaxRelevance& bucket = (*relevance_buckets)[i]; |
660 if (intermediate_score >= hqp_bucket.first) { | 663 if (intermediate_score >= bucket.first) { |
661 continue; | 664 continue; |
662 } | 665 } |
663 const ScoreMaxRelevance& previous_bucket = hqp_relevance_buckets[i - 1]; | 666 const ScoreMaxRelevance& previous_bucket = (*relevance_buckets)[i - 1]; |
664 const float slope = ((hqp_bucket.second - previous_bucket.second) / | 667 const float slope = ((bucket.second - previous_bucket.second) / |
665 (hqp_bucket.first - previous_bucket.first)); | 668 (bucket.first - previous_bucket.first)); |
666 return (previous_bucket.second + | 669 return (previous_bucket.second + |
667 (slope * (intermediate_score - previous_bucket.first))); | 670 (slope * (intermediate_score - previous_bucket.first))); |
668 } | 671 } |
669 // It will reach this stage when the score is > highest bucket score. | 672 // It will reach this stage when the score is > highest bucket score. |
670 // Return the highest bucket score. | 673 // Return the highest bucket score. |
671 return hqp_relevance_buckets[i - 1].second; | 674 return (*relevance_buckets)[i - 1].second; |
672 } | 675 } |
673 | 676 |
674 // static | 677 // static |
675 void ScoredHistoryMatch::InitHQPExperimentalParams() { | 678 std::vector<ScoredHistoryMatch::ScoreMaxRelevance> |
676 // These are default HQP relevance scoring buckets. | 679 ScoredHistoryMatch::GetHQPBuckets() { |
677 // See GetFinalRelevancyScore() for details. | 680 std::string relevance_buckets_str(kDefaultRelevanceBucketsString); |
Peter Kasting
2016/12/10 02:31:15
Nit: Prefer = to () for strings; see https://www.c
Mark P
2016/12/11 04:57:22
Done.
| |
678 std::string hqp_relevance_buckets_str = std::string( | 681 std::string experimental_scoring_buckets = |
679 hqp_relevance_buckets_str_); | 682 OmniboxFieldTrial::HQPExperimentalScoringBuckets(); |
680 | 683 if (!experimental_scoring_buckets.empty()) |
681 // Fetch the experiment params if they are any. | 684 relevance_buckets_str = experimental_scoring_buckets; |
682 hqp_experimental_scoring_enabled_ = | 685 return GetHQPBucketsFromString(relevance_buckets_str); |
683 OmniboxFieldTrial::HQPExperimentalScoringEnabled(); | |
684 | |
685 if (hqp_experimental_scoring_enabled_) { | |
686 // Add the topicality threshold from experiment params. | |
687 float hqp_experimental_topicality_threhold = | |
688 OmniboxFieldTrial::HQPExperimentalTopicalityThreshold(); | |
689 topicality_threshold_ = hqp_experimental_topicality_threhold; | |
690 | |
691 // Add the HQP experimental scoring buckets. | |
692 std::string hqp_experimental_scoring_buckets = | |
693 OmniboxFieldTrial::HQPExperimentalScoringBuckets(); | |
694 if (!hqp_experimental_scoring_buckets.empty()) | |
695 hqp_relevance_buckets_str = hqp_experimental_scoring_buckets; | |
696 } | |
697 | |
698 // Parse the hqp_relevance_buckets_str string once and store them in vector | |
699 // which is easy to access. | |
700 hqp_relevance_buckets_ = | |
701 new std::vector<ScoredHistoryMatch::ScoreMaxRelevance>(); | |
702 | |
703 bool is_valid_bucket_str = GetHQPBucketsFromString(hqp_relevance_buckets_str, | |
704 hqp_relevance_buckets_); | |
705 DCHECK(is_valid_bucket_str); | |
706 } | 686 } |
707 | 687 |
708 // static | 688 // static |
709 bool ScoredHistoryMatch::GetHQPBucketsFromString( | 689 ScoredHistoryMatch::ScoreMaxRelevances |
710 const std::string& buckets_str, | 690 ScoredHistoryMatch::GetHQPBucketsFromString(const std::string& buckets_str) { |
711 std::vector<ScoreMaxRelevance>* hqp_buckets) { | |
712 DCHECK(hqp_buckets != NULL); | |
713 DCHECK(!buckets_str.empty()); | 691 DCHECK(!buckets_str.empty()); |
714 | |
715 base::StringPairs kv_pairs; | 692 base::StringPairs kv_pairs; |
716 if (base::SplitStringIntoKeyValuePairs(buckets_str, ':', ',', &kv_pairs)) { | 693 if (!base::SplitStringIntoKeyValuePairs(buckets_str, ':', ',', &kv_pairs)) |
717 for (base::StringPairs::const_iterator it = kv_pairs.begin(); | 694 return ScoreMaxRelevances(); |
718 it != kv_pairs.end(); ++it) { | 695 ScoreMaxRelevances hqp_buckets; |
719 ScoreMaxRelevance bucket; | 696 for (base::StringPairs::const_iterator it = kv_pairs.begin(); |
720 bool is_valid_intermediate_score = | 697 it != kv_pairs.end(); ++it) { |
721 base::StringToDouble(it->first, &bucket.first); | 698 ScoreMaxRelevance bucket; |
722 DCHECK(is_valid_intermediate_score); | 699 bool is_valid_intermediate_score = |
723 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second); | 700 base::StringToDouble(it->first, &bucket.first); |
724 DCHECK(is_valid_hqp_score); | 701 DCHECK(is_valid_intermediate_score); |
725 hqp_buckets->push_back(bucket); | 702 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second); |
726 } | 703 DCHECK(is_valid_hqp_score); |
727 return true; | 704 hqp_buckets.push_back(bucket); |
728 } | 705 } |
729 return false; | 706 return hqp_buckets; |
730 } | 707 } |
OLD | NEW |