components/omnibox/browser/scored_history_match.cc - Issue 2541143002: Omnibox - Boost Frequency Scores Based on Number of Matching Pages

Side by Side Diff: components/omnibox/browser/scored_history_match.cc

Issue 2541143002: Omnibox - Boost Frequency Scores Based on Number of Matching Pages (Closed)

Patch Set: improved comments and formatting Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« components/omnibox/browser/scored_history_match.h ('K') | « components/omnibox/browser/scored_history_match.h ('k') | components/omnibox/browser/scored_history_match_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/scored_history_match.h"	5 #include "components/omnibox/browser/scored_history_match.h"

6	6

7 #include <math.h>	7 #include <math.h>

8	8

9 #include <algorithm>	9 #include <algorithm>

10 #include <vector>	10 #include <vector>

(...skipping 91 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
102 }	102 }

103	103

104 } // namespace	104 } // namespace

105	105

106 // static	106 // static

107 bool ScoredHistoryMatch::also_do_hup_like_scoring_;	107 bool ScoredHistoryMatch::also_do_hup_like_scoring_;

108 float ScoredHistoryMatch::bookmark_value_;	108 float ScoredHistoryMatch::bookmark_value_;

109 float ScoredHistoryMatch::typed_value_;	109 float ScoredHistoryMatch::typed_value_;

110 bool ScoredHistoryMatch::fix_few_visits_bug_;	110 bool ScoredHistoryMatch::fix_few_visits_bug_;

111 bool ScoredHistoryMatch::frequency_uses_sum_;	111 bool ScoredHistoryMatch::frequency_uses_sum_;

	112 OmniboxFieldTrial::NumMatchesScores*

	113 ScoredHistoryMatch::num_matches_to_document_specificity_score_ = nullptr;

112 size_t ScoredHistoryMatch::max_visits_to_score_;	114 size_t ScoredHistoryMatch::max_visits_to_score_;

113 bool ScoredHistoryMatch::allow_tld_matches_;	115 bool ScoredHistoryMatch::allow_tld_matches_;

114 bool ScoredHistoryMatch::allow_scheme_matches_;	116 bool ScoredHistoryMatch::allow_scheme_matches_;

115 size_t ScoredHistoryMatch::num_title_words_to_allow_;	117 size_t ScoredHistoryMatch::num_title_words_to_allow_;

116 bool ScoredHistoryMatch::hqp_experimental_scoring_enabled_;	118 bool ScoredHistoryMatch::hqp_experimental_scoring_enabled_;

117	119

118 // Default topicality threshold. See GetTopicalityScore() for details.	120 // Default topicality threshold. See GetTopicalityScore() for details.

119 float ScoredHistoryMatch::topicality_threshold_ = 0.8f;	121 float ScoredHistoryMatch::topicality_threshold_ = 0.8f;

120	122

121 // Default HQP relevance buckets. See GetFinalRelevancyScore() for more details	123 // Default HQP relevance buckets. See GetFinalRelevancyScore() for more details

122 // on these numbers.	124 // on these numbers.

123 char ScoredHistoryMatch::hqp_relevance_buckets_str_[] =	125 char ScoredHistoryMatch::hqp_relevance_buckets_str_[] =

124 "0.0:400,1.5:600,5.0:900,10.5:1203,15.0:1300,20.0:1399";	126 "0.0:400,1.5:600,5.0:900,10.5:1203,15.0:1300,20.0:1399";

125 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>*	127 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>*

126 ScoredHistoryMatch::hqp_relevance_buckets_ = nullptr;	128 ScoredHistoryMatch::hqp_relevance_buckets_ = nullptr;

127	129

128 ScoredHistoryMatch::ScoredHistoryMatch()	130 ScoredHistoryMatch::ScoredHistoryMatch()

129 : ScoredHistoryMatch(history::URLRow(),	131 : ScoredHistoryMatch(history::URLRow(),

130 VisitInfoVector(),	132 VisitInfoVector(),

131 base::string16(),	133 base::string16(),

132 String16Vector(),	134 String16Vector(),

133 WordStarts(),	135 WordStarts(),

134 RowWordStarts(),	136 RowWordStarts(),

135 false,	137 false,

136 base::Time::Max()) {	138 1,

137 }	139 base::Time::Max()) {}

138	140

139 ScoredHistoryMatch::ScoredHistoryMatch(	141 ScoredHistoryMatch::ScoredHistoryMatch(

140 const history::URLRow& row,	142 const history::URLRow& row,

141 const VisitInfoVector& visits,	143 const VisitInfoVector& visits,

142 const base::string16& lower_string,	144 const base::string16& lower_string,

143 const String16Vector& terms_vector,	145 const String16Vector& terms_vector,

144 const WordStarts& terms_to_word_starts_offsets,	146 const WordStarts& terms_to_word_starts_offsets,

145 const RowWordStarts& word_starts,	147 const RowWordStarts& word_starts,

146 bool is_url_bookmarked,	148 bool is_url_bookmarked,

	149 size_t num_matching_pages,

147 base::Time now)	150 base::Time now)

148 : HistoryMatch(row, 0, false, false), raw_score(0) {	151 : HistoryMatch(row, 0, false, false), raw_score(0) {

149 // NOTE: Call Init() before doing any validity checking to ensure that the	152 // NOTE: Call Init() before doing any validity checking to ensure that the

150 // class is always initialized after an instance has been constructed. In	153 // class is always initialized after an instance has been constructed. In

151 // particular, this ensures that the class is initialized after an instance	154 // particular, this ensures that the class is initialized after an instance

152 // has been constructed via the no-args constructor.	155 // has been constructed via the no-args constructor.

153 ScoredHistoryMatch::Init();	156 ScoredHistoryMatch::Init();

154	157

155 // Figure out where each search term appears in the URL and/or page title	158 // Figure out where each search term appears in the URL and/or page title

156 // so that we can score as well as provide autocomplete highlighting.	159 // so that we can score as well as provide autocomplete highlighting.

(...skipping 97 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
254 likely_can_inline = true;	257 likely_can_inline = true;

255 innermost_match = (best_inlineable_prefix->num_components ==	258 innermost_match = (best_inlineable_prefix->num_components ==

256 best_prefix->num_components);	259 best_prefix->num_components);

257 }	260 }

258 }	261 }

259 }	262 }

260	263

261 const float topicality_score = GetTopicalityScore(	264 const float topicality_score = GetTopicalityScore(

262 terms_vector.size(), url, terms_to_word_starts_offsets, word_starts);	265 terms_vector.size(), url, terms_to_word_starts_offsets, word_starts);

263 const float frequency_score = GetFrequency(now, is_url_bookmarked, visits);	266 const float frequency_score = GetFrequency(now, is_url_bookmarked, visits);

264 raw_score = base::saturated_cast<int>(GetFinalRelevancyScore(	267 const float specificity_score =

265 topicality_score, frequency_score, *hqp_relevance_buckets_));	268 GetDocumentSpecificityScore(num_matching_pages);

	269 raw_score = base::saturated_cast<int>(

	270 GetFinalRelevancyScore(topicality_score, frequency_score,

	271 specificity_score, *hqp_relevance_buckets_));

266	272

267 if (also_do_hup_like_scoring_ && likely_can_inline) {	273 if (also_do_hup_like_scoring_ && likely_can_inline) {

268 // HistoryURL-provider-like scoring gives any match that is	274 // HistoryURL-provider-like scoring gives any match that is

269 // capable of being inlined a certain minimum score. Some of these	275 // capable of being inlined a certain minimum score. Some of these

270 // are given a higher score that lets them be shown in inline.	276 // are given a higher score that lets them be shown in inline.

271 // This test here derives from the test in	277 // This test here derives from the test in

272 // HistoryURLProvider::PromoteMatchForInlineAutocomplete().	278 // HistoryURLProvider::PromoteMatchForInlineAutocomplete().

273 const bool promote_to_inline =	279 const bool promote_to_inline =

274 (row.typed_count() > 1) \|\| (IsHostOnly() && (row.typed_count() == 1));	280 (row.typed_count() > 1) \|\| (IsHostOnly() && (row.typed_count() == 1));

275 int hup_like_score =	281 int hup_like_score =

(...skipping 129 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
405 if (initialized)	411 if (initialized)

406 return;	412 return;

407	413

408 initialized = true;	414 initialized = true;

409 also_do_hup_like_scoring_ = OmniboxFieldTrial::HQPAlsoDoHUPLikeScoring();	415 also_do_hup_like_scoring_ = OmniboxFieldTrial::HQPAlsoDoHUPLikeScoring();

410 bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue();	416 bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue();

411 typed_value_ = OmniboxFieldTrial::HQPTypedValue();	417 typed_value_ = OmniboxFieldTrial::HQPTypedValue();

412 max_visits_to_score_ = OmniboxFieldTrial::HQPMaxVisitsToScore();	418 max_visits_to_score_ = OmniboxFieldTrial::HQPMaxVisitsToScore();

413 frequency_uses_sum_ = OmniboxFieldTrial::HQPFreqencyUsesSum();	419 frequency_uses_sum_ = OmniboxFieldTrial::HQPFreqencyUsesSum();

414 fix_few_visits_bug_ = OmniboxFieldTrial::HQPFixFewVisitsBug();	420 fix_few_visits_bug_ = OmniboxFieldTrial::HQPFixFewVisitsBug();

	421 num_matches_to_document_specificity_score_ =

	422 new OmniboxFieldTrial::NumMatchesScores();

	423 (*num_matches_to_document_specificity_score_) =

	424 OmniboxFieldTrial::HQPNumMatchesScores();

415 allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue();	425 allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue();

416 allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue();	426 allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue();

417 num_title_words_to_allow_ = OmniboxFieldTrial::HQPNumTitleWordsToAllow();	427 num_title_words_to_allow_ = OmniboxFieldTrial::HQPNumTitleWordsToAllow();

418	428

419 InitRawTermScoreToTopicalityScoreArray();	429 InitRawTermScoreToTopicalityScoreArray();

420 InitDaysAgoToRecencyScoreArray();	430 InitDaysAgoToRecencyScoreArray();

421 InitHQPExperimentalParams();	431 InitHQPExperimentalParams();

422 }	432 }

423	433

424 float ScoredHistoryMatch::GetTopicalityScore(	434 float ScoredHistoryMatch::GetTopicalityScore(

(...skipping 184 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
609 // Compute the average weighted value_of_transition and return it.	619 // Compute the average weighted value_of_transition and return it.

610 // Use \|max_visits_to_score_\| as the denominator for the average regardless of	620 // Use \|max_visits_to_score_\| as the denominator for the average regardless of

611 // how many visits there were in order to penalize a match that has	621 // how many visits there were in order to penalize a match that has

612 // fewer visits than kMaxVisitsToScore.	622 // fewer visits than kMaxVisitsToScore.

613 if (fix_few_visits_bug_)	623 if (fix_few_visits_bug_)

614 return summed_visit_points / ScoredHistoryMatch::max_visits_to_score_;	624 return summed_visit_points / ScoredHistoryMatch::max_visits_to_score_;

615 return visits.size() * summed_visit_points /	625 return visits.size() * summed_visit_points /

616 ScoredHistoryMatch::max_visits_to_score_;	626 ScoredHistoryMatch::max_visits_to_score_;

617 }	627 }

618	628

	629 float ScoredHistoryMatch::GetDocumentSpecificityScore(

	630 const size_t num_matching_pages) const {

	631 // The floating point value below doesn't matter.
	Peter Kasting 2016/12/06 05:19:30: I'm not sure that's true. Since upper_bound() returns the first element greater than the given value, if the server sends a score of 1.0 or less for a particular number of pages, this will fail to match. A more accurate comment would probably be "The floating point value below must be less than the lowest score the server would send down" or similar, and a safer value might be -1.
	Mark P 2016/12/08 00:21:31:
	On 2016/12/06 05:19:30, Peter Kasting wrote:
	> I'm not sure that's true.
	>
	> Since upper_bound() returns the first element greater than the given value, if
	> the server sends a score of 1.0 or less for a particular number of pages, this
	> will fail to match.
	>
	> A more accurate comment would probably be "The floating point value below must
	> be less than the lowest score the server would send down" or similar, and a
	> safer value might be -1.
	Good point. I never imagined a field trial specifying a score of less than 1, but having an appropriate comment and defensive code makes sense. Who knows what I'll want in the future? :-) Done.
	632 OmniboxFieldTrial::NumMatchesScores::const_iterator it =

	633 std::upper_bound(num_matches_to_document_specificity_score_->begin(),

	634 num_matches_to_document_specificity_score_->end(),

	635 std::pair<size_t, double>{num_matching_pages, 1.0});

	636 if (it == num_matches_to_document_specificity_score_->end())

	637 return 1.0;

	638 return it->second;
	Peter Kasting 2016/12/06 05:19:30: Nit: Could use ?:
	Mark P 2016/12/08 00:21:31:
	On 2016/12/06 05:19:30, Peter Kasting wrote:
	> Nit: Could use ?:
	Done (after reversing the order, as that read better to me).
	Peter Kasting 2016/12/08 00:51:40:
	On 2016/12/08 00:21:31, Mark P wrote:
	> On 2016/12/06 05:19:30, Peter Kasting wrote:
	> > Nit: Could use ?:
	>
	> Done (after reversing the order, as that read better to me).
	My concern with the order reversal is that it sort of violates "don't have 'else' with a negated condition, or the 'else' reads like a double-negative". Except here the "else" is the ':'. So I'd put it back the other way, but up to you.
	Mark P 2016/12/08 04:37:35:
	On 2016/12/08 00:51:40, Peter Kasting wrote:
	> On 2016/12/08 00:21:31, Mark P wrote:
	> > On 2016/12/06 05:19:30, Peter Kasting wrote:
	> > > Nit: Could use ?:
	> >
	> > Done (after reversing the order, as that read better to me).
	>
	> My concern with the order reversal is that it sort of violates "don't have
	> 'else' with a negated condition, or the 'else' reads like a double-negative".
	> Except here the "else" is the ':'.
	>
	> So I'd put it back the other way, but up to you.
	I understand the point, but I find != end to read to me like a positive (exists), and when I read == end, I have to mentally reverse it (!exists).
	639 };

	640

619 // static	641 // static

620 float ScoredHistoryMatch::GetFinalRelevancyScore(	642 float ScoredHistoryMatch::GetFinalRelevancyScore(

621 float topicality_score,	643 float topicality_score,

622 float frequency_score,	644 float frequency_score,

	645 float specificity_score,

623 const std::vector<ScoreMaxRelevance>& hqp_relevance_buckets) {	646 const std::vector<ScoreMaxRelevance>& hqp_relevance_buckets) {

624 DCHECK(hqp_relevance_buckets.size() > 0);	647 DCHECK(hqp_relevance_buckets.size() > 0);

625 DCHECK_EQ(hqp_relevance_buckets[0].first, 0.0);	648 DCHECK_EQ(hqp_relevance_buckets[0].first, 0.0);

626	649

627 if (topicality_score == 0)	650 if (topicality_score == 0)

628 return 0;	651 return 0;

629 // Here's how to interpret intermediate_score: Suppose the omnibox	652 // Here's how to interpret intermediate_score: Suppose the omnibox has one

630 // has one input term. Suppose we have a URL for which the omnibox	653 // input term. Suppose the input matches many documents. (This implies

	654 // specificity_score == 1.0.) Suppose we have a URL for which the omnibox

631 // input term has a single URL hostname hit at a word boundary. (This	655 // input term has a single URL hostname hit at a word boundary. (This

632 // implies topicality_score = 1.0.). Then the intermediate_score for	656 // implies topicality_score = 1.0.). Then the intermediate_score for

633 // this URL will depend entirely on the frequency_score with	657 // this URL will depend entirely on the frequency_score with

634 // this interpretation:	658 // this interpretation:

635 // - a single typed visit more than three months ago, no other visits -> 0.2	659 // - a single typed visit more than three months ago, no other visits -> 0.2

636 // - a visit every three days, no typed visits -> 0.706	660 // - a visit every three days, no typed visits -> 0.706

637 // - a visit every day, no typed visits -> 0.916	661 // - a visit every day, no typed visits -> 0.916

638 // - a single typed visit yesterday, no other visits -> 2.0	662 // - a single typed visit yesterday, no other visits -> 2.0

639 // - a typed visit once a week -> 11.77	663 // - a typed visit once a week -> 11.77

640 // - a typed visit every three days -> 14.12	664 // - a typed visit every three days -> 14.12

641 // - at least ten typed visits today -> 20.0 (maximum score)	665 // - at least ten typed visits today -> 20.0 (maximum score)

642 //	666 //

643 // The below code maps intermediate_score to the range [0, 1399].	667 // The below code maps intermediate_score to the range [0, 1399].

644 // For example:	668 // For example:

645 // HQP default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399"	669 // HQP default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399"

646 // We will linearly interpolate the scores between:	670 // We will linearly interpolate the scores between:

647 // 0 to 1.5 --> 400 to 600	671 // 0 to 1.5 --> 400 to 600

648 // 1.5 to 12.0 --> 600 to 1300	672 // 1.5 to 12.0 --> 600 to 1300

649 // 12.0 to 20.0 --> 1300 to 1399	673 // 12.0 to 20.0 --> 1300 to 1399

650 // >= 20.0 --> 1399	674 // >= 20.0 --> 1399

651 //	675 //

652 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result	676 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result

653 // from HistoryURL provider).	677 // from HistoryURL provider).

654 const float intermediate_score = topicality_score * frequency_score;	678 const float intermediate_score =

	679 topicality_score * frequency_score * specificity_score;

655	680

656 // Find the threshold where intermediate score is greater than bucket.	681 // Find the threshold where intermediate score is greater than bucket.

657 size_t i = 1;	682 size_t i = 1;

658 for (; i < hqp_relevance_buckets.size(); ++i) {	683 for (; i < hqp_relevance_buckets.size(); ++i) {

659 const ScoreMaxRelevance& hqp_bucket = hqp_relevance_buckets[i];	684 const ScoreMaxRelevance& hqp_bucket = hqp_relevance_buckets[i];

660 if (intermediate_score >= hqp_bucket.first) {	685 if (intermediate_score >= hqp_bucket.first) {

661 continue;	686 continue;

662 }	687 }

663 const ScoreMaxRelevance& previous_bucket = hqp_relevance_buckets[i - 1];	688 const ScoreMaxRelevance& previous_bucket = hqp_relevance_buckets[i - 1];

664 const float slope = ((hqp_bucket.second - previous_bucket.second) /	689 const float slope = ((hqp_bucket.second - previous_bucket.second) /

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
721 base::StringToDouble(it->first, &bucket.first);	746 base::StringToDouble(it->first, &bucket.first);

722 DCHECK(is_valid_intermediate_score);	747 DCHECK(is_valid_intermediate_score);

723 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);	748 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);

724 DCHECK(is_valid_hqp_score);	749 DCHECK(is_valid_hqp_score);

725 hqp_buckets->push_back(bucket);	750 hqp_buckets->push_back(bucket);

726 }	751 }

727 return true;	752 return true;

728 }	753 }

729 return false;	754 return false;

730 }	755 }

OLD	NEW