components/omnibox/browser/scored_history_match.cc - Issue 2541143002: Omnibox - Boost Frequency Scores Based on Number of Matching Pages

Side by Side Diff: components/omnibox/browser/scored_history_match.cc

Issue 2541143002: Omnibox - Boost Frequency Scores Based on Number of Matching Pages (Closed)

Patch Set: fix leak Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« components/omnibox/browser/scored_history_match.h ('K') | « components/omnibox/browser/scored_history_match.h ('k') | components/omnibox/browser/scored_history_match_unittest.cc » ('j') | components/omnibox/browser/scored_history_match_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/scored_history_match.h"	5 #include "components/omnibox/browser/scored_history_match.h"

6	6

7 #include <math.h>	7 #include <math.h>

8	8

9 #include <algorithm>	9 #include <algorithm>

10 #include <vector>	10 #include <vector>

11	11

12 #include "base/logging.h"	12 #include "base/logging.h"

	13 #include "base/macros.h"

13 #include "base/numerics/safe_conversions.h"	14 #include "base/numerics/safe_conversions.h"

14 #include "base/strings/string_number_conversions.h"	15 #include "base/strings/string_number_conversions.h"

15 #include "base/strings/string_split.h"	16 #include "base/strings/string_split.h"

16 #include "base/strings/string_util.h"	17 #include "base/strings/string_util.h"

17 #include "base/strings/utf_offset_string_conversions.h"	18 #include "base/strings/utf_offset_string_conversions.h"

18 #include "base/strings/utf_string_conversions.h"	19 #include "base/strings/utf_string_conversions.h"

19 #include "components/bookmarks/browser/bookmark_utils.h"	20 #include "components/bookmarks/browser/bookmark_utils.h"

20 #include "components/omnibox/browser/history_url_provider.h"	21 #include "components/omnibox/browser/history_url_provider.h"

21 #include "components/omnibox/browser/omnibox_field_trial.h"	22 #include "components/omnibox/browser/omnibox_field_trial.h"

22 #include "components/omnibox/browser/url_prefix.h"	23 #include "components/omnibox/browser/url_prefix.h"

(...skipping 95 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
118 // Default topicality threshold. See GetTopicalityScore() for details.	119 // Default topicality threshold. See GetTopicalityScore() for details.

119 float ScoredHistoryMatch::topicality_threshold_ = 0.8f;	120 float ScoredHistoryMatch::topicality_threshold_ = 0.8f;

120	121

121 // Default HQP relevance buckets. See GetFinalRelevancyScore() for more details	122 // Default HQP relevance buckets. See GetFinalRelevancyScore() for more details

122 // on these numbers.	123 // on these numbers.

123 char ScoredHistoryMatch::hqp_relevance_buckets_str_[] =	124 char ScoredHistoryMatch::hqp_relevance_buckets_str_[] =

124 "0.0:400,1.5:600,5.0:900,10.5:1203,15.0:1300,20.0:1399";	125 "0.0:400,1.5:600,5.0:900,10.5:1203,15.0:1300,20.0:1399";

125 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>*	126 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>*

126 ScoredHistoryMatch::hqp_relevance_buckets_ = nullptr;	127 ScoredHistoryMatch::hqp_relevance_buckets_ = nullptr;

127	128

	129 OmniboxFieldTrial::NumMatchesScores*

	130 ScoredHistoryMatch::num_matches_to_document_specificity_score_override_ =

	131 nullptr;

	132

128 ScoredHistoryMatch::ScoredHistoryMatch()	133 ScoredHistoryMatch::ScoredHistoryMatch()

129 : ScoredHistoryMatch(history::URLRow(),	134 : ScoredHistoryMatch(history::URLRow(),

130 VisitInfoVector(),	135 VisitInfoVector(),

131 base::string16(),	136 base::string16(),

132 String16Vector(),	137 String16Vector(),

133 WordStarts(),	138 WordStarts(),

134 RowWordStarts(),	139 RowWordStarts(),

135 false,	140 false,

136 base::Time::Max()) {	141 1,

137 }	142 base::Time::Max()) {}

138	143

139 ScoredHistoryMatch::ScoredHistoryMatch(	144 ScoredHistoryMatch::ScoredHistoryMatch(

140 const history::URLRow& row,	145 const history::URLRow& row,

141 const VisitInfoVector& visits,	146 const VisitInfoVector& visits,

142 const base::string16& lower_string,	147 const base::string16& lower_string,

143 const String16Vector& terms_vector,	148 const String16Vector& terms_vector,

144 const WordStarts& terms_to_word_starts_offsets,	149 const WordStarts& terms_to_word_starts_offsets,

145 const RowWordStarts& word_starts,	150 const RowWordStarts& word_starts,

146 bool is_url_bookmarked,	151 bool is_url_bookmarked,

	152 size_t num_matching_pages,

147 base::Time now)	153 base::Time now)

148 : HistoryMatch(row, 0, false, false), raw_score(0) {	154 : HistoryMatch(row, 0, false, false), raw_score(0) {

149 // NOTE: Call Init() before doing any validity checking to ensure that the	155 // NOTE: Call Init() before doing any validity checking to ensure that the

150 // class is always initialized after an instance has been constructed. In	156 // class is always initialized after an instance has been constructed. In

151 // particular, this ensures that the class is initialized after an instance	157 // particular, this ensures that the class is initialized after an instance

152 // has been constructed via the no-args constructor.	158 // has been constructed via the no-args constructor.

153 ScoredHistoryMatch::Init();	159 ScoredHistoryMatch::Init();

154	160

155 // Figure out where each search term appears in the URL and/or page title	161 // Figure out where each search term appears in the URL and/or page title

156 // so that we can score as well as provide autocomplete highlighting.	162 // so that we can score as well as provide autocomplete highlighting.

(...skipping 97 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
254 likely_can_inline = true;	260 likely_can_inline = true;

255 innermost_match = (best_inlineable_prefix->num_components ==	261 innermost_match = (best_inlineable_prefix->num_components ==

256 best_prefix->num_components);	262 best_prefix->num_components);

257 }	263 }

258 }	264 }

259 }	265 }

260	266

261 const float topicality_score = GetTopicalityScore(	267 const float topicality_score = GetTopicalityScore(

262 terms_vector.size(), url, terms_to_word_starts_offsets, word_starts);	268 terms_vector.size(), url, terms_to_word_starts_offsets, word_starts);

263 const float frequency_score = GetFrequency(now, is_url_bookmarked, visits);	269 const float frequency_score = GetFrequency(now, is_url_bookmarked, visits);

264 raw_score = base::saturated_cast<int>(GetFinalRelevancyScore(	270 const float specificity_score =

265 topicality_score, frequency_score, *hqp_relevance_buckets_));	271 GetDocumentSpecificityScore(num_matching_pages);

	272 raw_score = base::saturated_cast<int>(

	273 GetFinalRelevancyScore(topicality_score, frequency_score,

	274 specificity_score, *hqp_relevance_buckets_));

266	275

267 if (also_do_hup_like_scoring_ && likely_can_inline) {	276 if (also_do_hup_like_scoring_ && likely_can_inline) {

268 // HistoryURL-provider-like scoring gives any match that is	277 // HistoryURL-provider-like scoring gives any match that is

269 // capable of being inlined a certain minimum score. Some of these	278 // capable of being inlined a certain minimum score. Some of these

270 // are given a higher score that lets them be shown in inline.	279 // are given a higher score that lets them be shown in inline.

271 // This test here derives from the test in	280 // This test here derives from the test in

272 // HistoryURLProvider::PromoteMatchForInlineAutocomplete().	281 // HistoryURLProvider::PromoteMatchForInlineAutocomplete().

273 const bool promote_to_inline =	282 const bool promote_to_inline =

274 (row.typed_count() > 1) \|\| (IsHostOnly() && (row.typed_count() == 1));	283 (row.typed_count() > 1) \|\| (IsHostOnly() && (row.typed_count() == 1));

275 int hup_like_score =	284 int hup_like_score =

(...skipping 333 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
609 // Compute the average weighted value_of_transition and return it.	618 // Compute the average weighted value_of_transition and return it.

610 // Use \|max_visits_to_score_\| as the denominator for the average regardless of	619 // Use \|max_visits_to_score_\| as the denominator for the average regardless of

611 // how many visits there were in order to penalize a match that has	620 // how many visits there were in order to penalize a match that has

612 // fewer visits than kMaxVisitsToScore.	621 // fewer visits than kMaxVisitsToScore.

613 if (fix_few_visits_bug_)	622 if (fix_few_visits_bug_)

614 return summed_visit_points / ScoredHistoryMatch::max_visits_to_score_;	623 return summed_visit_points / ScoredHistoryMatch::max_visits_to_score_;

615 return visits.size() * summed_visit_points /	624 return visits.size() * summed_visit_points /

616 ScoredHistoryMatch::max_visits_to_score_;	625 ScoredHistoryMatch::max_visits_to_score_;

617 }	626 }

618	627

	628 float ScoredHistoryMatch::GetDocumentSpecificityScore(

	629 size_t num_matching_pages) const {

	630 // A mapping from the number of matching pages to their associated document

	631 // specificity scores. See omnibox_field_trial.h for more details.

	632 CR_DEFINE_STATIC_LOCAL(OmniboxFieldTrial::NumMatchesScores,

	633 default_num_matches_to_document_specificity_score,

	634 (OmniboxFieldTrial::HQPNumMatchesScores()));

	635 OmniboxFieldTrial::NumMatchesScores*

	636 num_matches_to_document_specificity_score =

	637 num_matches_to_document_specificity_score_override_

	638 ? num_matches_to_document_specificity_score_override_

	639 : &default_num_matches_to_document_specificity_score;
	Peter Kasting 2016/12/10 02:22:26 I feel like there might be a way to shorten all th I feel like there might be a way to shorten all the names here and not have the same text repeat over and over... \|matches_to_specificity\|? I dunno. Mark P 2016/12/11 05:11:37 I was wondering if there was a good way to shorten Show quoted text On 2016/12/10 02:22:26, Peter Kasting wrote: > I feel like there might be a way to shorten all the names here and not have the > same text repeat over and over... \|matches_to_specificity\|? I dunno. I was wondering if there was a good way to shorten the name that we'd both be comfortable with. It wasn't obvious. Thank you for coming up with one. Done.
	640

	641 // The floating point value below must be less than the lowest score the

	642 // server would send down.

	643 OmniboxFieldTrial::NumMatchesScores::const_iterator it =

	644 std::upper_bound(num_matches_to_document_specificity_score->begin(),

	645 num_matches_to_document_specificity_score->end(),

	646 std::pair<size_t, double>{num_matching_pages, -1});

	647 return (it != num_matches_to_document_specificity_score->end()) ? it->second

	648 : 1.0;

	649 };

	650

619 // static	651 // static

620 float ScoredHistoryMatch::GetFinalRelevancyScore(	652 float ScoredHistoryMatch::GetFinalRelevancyScore(

621 float topicality_score,	653 float topicality_score,

622 float frequency_score,	654 float frequency_score,

	655 float specificity_score,

623 const std::vector<ScoreMaxRelevance>& hqp_relevance_buckets) {	656 const std::vector<ScoreMaxRelevance>& hqp_relevance_buckets) {

624 DCHECK(hqp_relevance_buckets.size() > 0);	657 DCHECK(hqp_relevance_buckets.size() > 0);

625 DCHECK_EQ(hqp_relevance_buckets[0].first, 0.0);	658 DCHECK_EQ(hqp_relevance_buckets[0].first, 0.0);

626	659

627 if (topicality_score == 0)	660 if (topicality_score == 0)

628 return 0;	661 return 0;

629 // Here's how to interpret intermediate_score: Suppose the omnibox	662 // Here's how to interpret intermediate_score: Suppose the omnibox has one

630 // has one input term. Suppose we have a URL for which the omnibox	663 // input term. Suppose the input matches many documents. (This implies

	664 // specificity_score == 1.0.) Suppose we have a URL for which the omnibox

631 // input term has a single URL hostname hit at a word boundary. (This	665 // input term has a single URL hostname hit at a word boundary. (This

632 // implies topicality_score = 1.0.). Then the intermediate_score for	666 // implies topicality_score = 1.0.). Then the intermediate_score for

633 // this URL will depend entirely on the frequency_score with	667 // this URL will depend entirely on the frequency_score with

634 // this interpretation:	668 // this interpretation:

635 // - a single typed visit more than three months ago, no other visits -> 0.2	669 // - a single typed visit more than three months ago, no other visits -> 0.2

636 // - a visit every three days, no typed visits -> 0.706	670 // - a visit every three days, no typed visits -> 0.706

637 // - a visit every day, no typed visits -> 0.916	671 // - a visit every day, no typed visits -> 0.916

638 // - a single typed visit yesterday, no other visits -> 2.0	672 // - a single typed visit yesterday, no other visits -> 2.0

639 // - a typed visit once a week -> 11.77	673 // - a typed visit once a week -> 11.77

640 // - a typed visit every three days -> 14.12	674 // - a typed visit every three days -> 14.12

641 // - at least ten typed visits today -> 20.0 (maximum score)	675 // - at least ten typed visits today -> 20.0 (maximum score)

642 //	676 //

643 // The below code maps intermediate_score to the range [0, 1399].	677 // The below code maps intermediate_score to the range [0, 1399].

644 // For example:	678 // For example:

645 // HQP default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399"	679 // HQP default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399"

646 // We will linearly interpolate the scores between:	680 // We will linearly interpolate the scores between:

647 // 0 to 1.5 --> 400 to 600	681 // 0 to 1.5 --> 400 to 600

648 // 1.5 to 12.0 --> 600 to 1300	682 // 1.5 to 12.0 --> 600 to 1300

649 // 12.0 to 20.0 --> 1300 to 1399	683 // 12.0 to 20.0 --> 1300 to 1399

650 // >= 20.0 --> 1399	684 // >= 20.0 --> 1399

651 //	685 //

652 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result	686 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result

653 // from HistoryURL provider).	687 // from HistoryURL provider).

654 const float intermediate_score = topicality_score * frequency_score;	688 const float intermediate_score =

	689 topicality_score * frequency_score * specificity_score;

655	690

656 // Find the threshold where intermediate score is greater than bucket.	691 // Find the threshold where intermediate score is greater than bucket.

657 size_t i = 1;	692 size_t i = 1;

658 for (; i < hqp_relevance_buckets.size(); ++i) {	693 for (; i < hqp_relevance_buckets.size(); ++i) {

659 const ScoreMaxRelevance& hqp_bucket = hqp_relevance_buckets[i];	694 const ScoreMaxRelevance& hqp_bucket = hqp_relevance_buckets[i];

660 if (intermediate_score >= hqp_bucket.first) {	695 if (intermediate_score >= hqp_bucket.first) {

661 continue;	696 continue;

662 }	697 }

663 const ScoreMaxRelevance& previous_bucket = hqp_relevance_buckets[i - 1];	698 const ScoreMaxRelevance& previous_bucket = hqp_relevance_buckets[i - 1];

664 const float slope = ((hqp_bucket.second - previous_bucket.second) /	699 const float slope = ((hqp_bucket.second - previous_bucket.second) /

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
721 base::StringToDouble(it->first, &bucket.first);	756 base::StringToDouble(it->first, &bucket.first);

722 DCHECK(is_valid_intermediate_score);	757 DCHECK(is_valid_intermediate_score);

723 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);	758 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);

724 DCHECK(is_valid_hqp_score);	759 DCHECK(is_valid_hqp_score);

725 hqp_buckets->push_back(bucket);	760 hqp_buckets->push_back(bucket);

726 }	761 }

727 return true;	762 return true;

728 }	763 }

729 return false;	764 return false;

730 }	765 }

OLD	NEW