chrome/browser/history/scored_history_match.cc - Issue 905023003: Adding knobs on HQP provider.

Side by Side Diff: chrome/browser/history/scored_history_match.cc

Issue 905023003: Adding knobs on HQP provider. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Addressing mark comments. Created 5 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« chrome/browser/history/scored_history_match.h ('K') | « chrome/browser/history/scored_history_match.h ('k') | chrome/browser/history/scored_history_match_unittest.cc » ('j') | components/omnibox/omnibox_field_trial.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/history/scored_history_match.h"	5 #include "chrome/browser/history/scored_history_match.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <functional>	8 #include <functional>

9 #include <iterator>	9 #include <iterator>

10 #include <numeric>	10 #include <numeric>

11 #include <set>	11 #include <set>

12	12

13 #include <math.h>	13 #include <math.h>

14	14

15 #include "base/logging.h"	15 #include "base/logging.h"

16 #include "base/metrics/histogram.h"	16 #include "base/metrics/histogram.h"

	17 #include "base/strings/string_number_conversions.h"

	18 #include "base/strings/string_split.h"

17 #include "base/strings/string_util.h"	19 #include "base/strings/string_util.h"

18 #include "base/strings/utf_string_conversions.h"	20 #include "base/strings/utf_string_conversions.h"

19 #include "chrome/browser/autocomplete/history_url_provider.h"	21 #include "chrome/browser/autocomplete/history_url_provider.h"

20 #include "components/bookmarks/browser/bookmark_utils.h"	22 #include "components/bookmarks/browser/bookmark_utils.h"

21 #include "components/history/core/browser/history_client.h"	23 #include "components/history/core/browser/history_client.h"

22 #include "components/omnibox/omnibox_field_trial.h"	24 #include "components/omnibox/omnibox_field_trial.h"

23 #include "components/omnibox/url_prefix.h"	25 #include "components/omnibox/url_prefix.h"

24 #include "content/public/browser/browser_thread.h"	26 #include "content/public/browser/browser_thread.h"

25	27

26 namespace history {	28 namespace history {

27	29

28 // ScoredHistoryMatch ----------------------------------------------------------	30 // ScoredHistoryMatch ----------------------------------------------------------

29	31

30 // static	32 // static

31 const size_t ScoredHistoryMatch::kMaxVisitsToScore = 10;	33 const size_t ScoredHistoryMatch::kMaxVisitsToScore = 10;

32 const int ScoredHistoryMatch::kDaysToPrecomputeRecencyScoresFor = 366;	34 const int ScoredHistoryMatch::kDaysToPrecomputeRecencyScoresFor = 366;

33 const int ScoredHistoryMatch::kMaxRawTermScore = 30;	35 const int ScoredHistoryMatch::kMaxRawTermScore = 30;

34 float* ScoredHistoryMatch::raw_term_score_to_topicality_score_ = NULL;	36 float* ScoredHistoryMatch::raw_term_score_to_topicality_score_ = NULL;

35 float* ScoredHistoryMatch::days_ago_to_recency_score_ = NULL;	37 float* ScoredHistoryMatch::days_ago_to_recency_score_ = NULL;

36 bool ScoredHistoryMatch::initialized_ = false;	38 bool ScoredHistoryMatch::initialized_ = false;

37 int ScoredHistoryMatch::bookmark_value_ = 1;	39 int ScoredHistoryMatch::bookmark_value_ = 1;

38 bool ScoredHistoryMatch::allow_tld_matches_ = false;	40 bool ScoredHistoryMatch::allow_tld_matches_ = false;

39 bool ScoredHistoryMatch::allow_scheme_matches_ = false;	41 bool ScoredHistoryMatch::allow_scheme_matches_ = false;

40 bool ScoredHistoryMatch::also_do_hup_like_scoring_ = false;	42 bool ScoredHistoryMatch::also_do_hup_like_scoring_ = false;

41 int ScoredHistoryMatch::max_assigned_score_for_non_inlineable_matches_ = -1;	43 int ScoredHistoryMatch::max_assigned_score_for_non_inlineable_matches_ = -1;

	44 bool ScoredHistoryMatch::hqp_experimental_scoring_enabled_ = false;

	45 float ScoredHistoryMatch::topicality_threshold_ = -1;

	46 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>*

	47 ScoredHistoryMatch::hqp_relevance_buckets_ = NULL;

42	48

43 ScoredHistoryMatch::ScoredHistoryMatch()	49 ScoredHistoryMatch::ScoredHistoryMatch()

44 : raw_score_(0),	50 : raw_score_(0),

45 can_inline_(false) {	51 can_inline_(false) {

46 Init();	52 Init();

47 }	53 }

48	54

49 ScoredHistoryMatch::ScoredHistoryMatch(	55 ScoredHistoryMatch::ScoredHistoryMatch(

50 const URLRow& row,	56 const URLRow& row,

51 const VisitInfoVector& visits,	57 const VisitInfoVector& visits,

(...skipping 95 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
147 const int num_components_in_best_inlineable_prefix =	153 const int num_components_in_best_inlineable_prefix =

148 best_inlineable_prefix->num_components;	154 best_inlineable_prefix->num_components;

149 innermost_match = (num_components_in_best_inlineable_prefix ==	155 innermost_match = (num_components_in_best_inlineable_prefix ==

150 num_components_in_best_prefix);	156 num_components_in_best_prefix);

151 }	157 }

152	158

153 const float topicality_score = GetTopicalityScore(	159 const float topicality_score = GetTopicalityScore(

154 terms.size(), url, terms_to_word_starts_offsets, word_starts);	160 terms.size(), url, terms_to_word_starts_offsets, word_starts);

155 const float frequency_score = GetFrequency(	161 const float frequency_score = GetFrequency(

156 now, (history_client && history_client->IsBookmarked(gurl)), visits);	162 now, (history_client && history_client->IsBookmarked(gurl)), visits);

157 raw_score_ = GetFinalRelevancyScore(topicality_score, frequency_score);	163 raw_score_ = GetFinalRelevancyScore(topicality_score, frequency_score,

	164 *hqp_relevance_buckets_);

158 raw_score_ =	165 raw_score_ =

159 (raw_score_ <= kint32max) ? static_cast<int>(raw_score_) : kint32max;	166 (raw_score_ <= kint32max) ? static_cast<int>(raw_score_) : kint32max;

160	167

161 if (also_do_hup_like_scoring_ && can_inline_) {	168 if (also_do_hup_like_scoring_ && can_inline_) {

162 // HistoryURL-provider-like scoring gives any match that is	169 // HistoryURL-provider-like scoring gives any match that is

163 // capable of being inlined a certain minimum score. Some of these	170 // capable of being inlined a certain minimum score. Some of these

164 // are given a higher score that lets them be shown in inline.	171 // are given a higher score that lets them be shown in inline.

165 // This test here derives from the test in	172 // This test here derives from the test in

166 // HistoryURLProvider::PromoteMatchForInlineAutocomplete().	173 // HistoryURLProvider::PromoteMatchForInlineAutocomplete().

167 const bool promote_to_inline = (row.typed_count() > 1) \|\|	174 const bool promote_to_inline = (row.typed_count() > 1) \|\|

(...skipping 263 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
431 // due to this test would look stupid if shown to the user.	438 // due to this test would look stupid if shown to the user.

432 if (term_scores[i] == 0)	439 if (term_scores[i] == 0)

433 return 0;	440 return 0;

434 topicality_score += raw_term_score_to_topicality_score_[	441 topicality_score += raw_term_score_to_topicality_score_[

435 (term_scores[i] >= kMaxRawTermScore) ? (kMaxRawTermScore - 1) :	442 (term_scores[i] >= kMaxRawTermScore) ? (kMaxRawTermScore - 1) :

436 term_scores[i]];	443 term_scores[i]];

437 }	444 }

438 // TODO(mpearson): If there are multiple terms, consider taking the	445 // TODO(mpearson): If there are multiple terms, consider taking the

439 // geometric mean of per-term scores rather than the arithmetic mean.	446 // geometric mean of per-term scores rather than the arithmetic mean.

440	447

441 return topicality_score / num_terms;	448 float final_topicality_score = topicality_score / num_terms;

	449

	450 // Demote all the URLs if the topicality score is less than threshold.

	451 if (hqp_experimental_scoring_enabled_ &&

	452 (final_topicality_score < topicality_threshold_)) {

	453 return 0.0;

	454 }

	455

	456 return final_topicality_score;

442 }	457 }

443	458

444 // static	459 // static

445 void ScoredHistoryMatch::FillInTermScoreToTopicalityScoreArray() {	460 void ScoredHistoryMatch::FillInTermScoreToTopicalityScoreArray() {

446 for (int term_score = 0; term_score < kMaxRawTermScore; ++term_score) {	461 for (int term_score = 0; term_score < kMaxRawTermScore; ++term_score) {

447 float topicality_score;	462 float topicality_score;

448 if (term_score < 10) {	463 if (term_score < 10) {

449 // If the term scores less than 10 points (no full-credit hit, or	464 // If the term scores less than 10 points (no full-credit hit, or

450 // no combination of hits that score that well), then the topicality	465 // no combination of hits that score that well), then the topicality

451 // score is linear in the term score.	466 // score is linear in the term score.

(...skipping 83 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
535 if (bookmarked)	550 if (bookmarked)

536 value_of_transition = std::max(value_of_transition, bookmark_value_);	551 value_of_transition = std::max(value_of_transition, bookmark_value_);

537 const float bucket_weight =	552 const float bucket_weight =

538 GetRecencyScore((now - visits[i].first).InDays());	553 GetRecencyScore((now - visits[i].first).InDays());

539 summed_visit_points += (value_of_transition * bucket_weight);	554 summed_visit_points += (value_of_transition * bucket_weight);

540 }	555 }

541 return visits.size() * summed_visit_points / kMaxVisitsToScore;	556 return visits.size() * summed_visit_points / kMaxVisitsToScore;

542 }	557 }

543	558

544 // static	559 // static

545 float ScoredHistoryMatch::GetFinalRelevancyScore(float topicality_score,	560 float ScoredHistoryMatch::GetFinalRelevancyScore(

546 float frequency_score) {	561 float topicality_score,

	562 float frequency_score,

	563 const std::vector<ScoreMaxRelevance>& hqp_relevance_buckets) {

	564 DCHECK(hqp_relevance_buckets.size() > 0);

	565

547 if (topicality_score == 0)	566 if (topicality_score == 0)

548 return 0;	567 return 0;

549 // Here's how to interpret intermediate_score: Suppose the omnibox	568 // Here's how to interpret intermediate_score: Suppose the omnibox

550 // has one input term. Suppose we have a URL for which the omnibox	569 // has one input term. Suppose we have a URL for which the omnibox

551 // input term has a single URL hostname hit at a word boundary. (This	570 // input term has a single URL hostname hit at a word boundary. (This

552 // implies topicality_score = 1.0.). Then the intermediate_score for	571 // implies topicality_score = 1.0.). Then the intermediate_score for

553 // this URL will depend entirely on the frequency_score with	572 // this URL will depend entirely on the frequency_score with

554 // this interpretation:	573 // this interpretation:

555 // - a single typed visit more than three months ago, no other visits -> 0.2	574 // - a single typed visit more than three months ago, no other visits -> 0.2

556 // - a visit every three days, no typed visits -> 0.706	575 // - a visit every three days, no typed visits -> 0.706

557 // - a visit every day, no typed visits -> 0.916	576 // - a visit every day, no typed visits -> 0.916

558 // - a single typed visit yesterday, no other visits -> 2.0	577 // - a single typed visit yesterday, no other visits -> 2.0

559 // - a typed visit once a week -> 11.77	578 // - a typed visit once a week -> 11.77

560 // - a typed visit every three days -> 14.12	579 // - a typed visit every three days -> 14.12

561 // - at least ten typed visits today -> 20.0 (maximum score)	580 // - at least ten typed visits today -> 20.0 (maximum score)

	581 //

	582 // The below code maps intermediate_score to the range [0, 1399].

	583 // For example:

	584 // HQP default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399"

	585 // We will linearly interpolate the scores between:

	586 // 0 to 1.5 --> 400 to 600

	587 // 1.5 to 12.0 --> 600 to 1300

	588 // 12.0 to 20.0 --> 1300 to 1399

	589 // >= 20.0 --> 1399

	590 //

	591 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result

	592 // from HistoryURL provider)
	Mark P 2015/02/18 00:03:32 nit: period. nit: period. Ashok vardhan 2015/02/18 01:11:21 Done. Show quoted text On 2015/02/18 00:03:32, Mark P wrote: > nit: period. Done.
562 const float intermediate_score = topicality_score * frequency_score;	593 const float intermediate_score = topicality_score * frequency_score;

563 // The below code maps intermediate_score to the range [0, 1399].	594

564 // The score maxes out at 1400 (i.e., cannot beat a good inline result).	595 double min_intermediate_score;
	Mark P 2015/02/18 00:03:32 If intermediate_score == 0, you end up using this If intermediate_score == 0, you end up using this value which you never explicitly initialize. Ditto with the variable below. Is that intended? Should these be explicitly initialized, or should that ">" in line 600 be a >=? Also, perhaps we should DCHECK that hqp_relevance_buckets[0].first is 0.0. Finally, I still think these two variables need a comment. Ashok vardhan 2015/02/18 01:11:21 Yes, i have added ">=" after refactoring. You can Show quoted text On 2015/02/18 00:03:32, Mark P wrote: > If intermediate_score == 0, you end up using this value which you never > explicitly initialize. Ditto with the variable below. Is that intended? > Should these be explicitly initialized, or should that ">" in line 600 be a >=? > > Also, perhaps we should DCHECK that hqp_relevance_buckets[0].first is 0.0. > > Finally, I still think these two variables need a comment. Yes, i have added ">=" after refactoring. You can see in the immediate-patch-set. Added comment and check.
565 if (intermediate_score <= 1) {	596 float min_hqp_score;

566 // Linearly extrapolate between 0 and 1.5 so 0 has a score of 400	597 // Find the threshold where intermediate score is greater than bucket.

567 // and 1.5 has a score of 600.	598 for (size_t i = 0; i < hqp_relevance_buckets.size(); ++i) {

568 const float slope = (600 - 400) / (1.5f - 0.0f);	599 const ScoreMaxRelevance& hqp_bucket = hqp_relevance_buckets[i];

569 return 400 + slope * intermediate_score;	600 if (intermediate_score > hqp_bucket.first) {

	601 min_intermediate_score = hqp_bucket.first;

	602 min_hqp_score = hqp_bucket.second;

	603 } else {

	604 const float slope = ((hqp_bucket.second - min_hqp_score) /

	605 (hqp_bucket.first - min_intermediate_score));

	606 const float final_hqp_score = (min_hqp_score +

	607 (slope * (intermediate_score -

	608 min_intermediate_score)));

	609 return final_hqp_score;
	Mark P 2015/02/18 00:03:32 nit: simply return final_hqp_score; no need to use nit: simply return final_hqp_score; no need to use a temporary variable here. Ashok vardhan 2015/02/18 01:11:21 Done. Show quoted text On 2015/02/18 00:03:32, Mark P wrote: > nit: simply return final_hqp_score; no need to use a temporary variable here. Done.
	610 }

570 }	611 }

571 if (intermediate_score <= 12.0) {	612 // It will reach this stage when the score is > highest bucket score.

572 // Linearly extrapolate up to 12 so 12 has a score of 1300.	613 return min_hqp_score;

573 const float slope = (1300 - 600) / (12.0f - 1.5f);

574 return 600 + slope * (intermediate_score - 1.5);

575 }

576 // Linearly extrapolate so a score of 20 (or more) has a score of 1399.

577 // (Scores above 20 are possible for URLs that have multiple term hits

578 // in the URL and/or title and that are visited practically all

579 // the time using typed visits. We don't attempt to distinguish

580 // between these very good results.)

581 const float slope = (1399 - 1300) / (20.0f - 12.0f);

582 return std::min(1399.0, 1300 + slope * (intermediate_score - 12.0));

583 }	614 }

584	615

	616 // static

	617 void ScoredHistoryMatch::InitializeHQPExperimentalParams() {

	618 // These are default HQP relevance scoring buckets.

	619 // See GetFinalRelevancyScore() for details.

	620 std::string hqp_relevance_buckets_str = "0.0:400,1.5:600,12.0:1300,20.0:1399";

	621

	622 // Fetch the experiment params if they are any.

	623 hqp_experimental_scoring_enabled_ =

	624 OmniboxFieldTrial::HQPExperimentalScoringEnabled();

	625

	626 if (hqp_experimental_scoring_enabled_) {

	627 // Add the topicality threshold from experiment params.

	628 float hqp_experimental_topicality_threhold =

	629 OmniboxFieldTrial::HQPExperimentalTopicalityThreshold();

	630 topicality_threshold_ = hqp_experimental_topicality_threhold;

	631

	632 // Add the HQP experimental scoring buckets.

	633 std::string hqp_experimental_scoring_buckets =

	634 OmniboxFieldTrial::HQPExperimentalScoringBuckets();

	635 if (!hqp_experimental_scoring_buckets.empty())

	636 hqp_relevance_buckets_str = hqp_experimental_scoring_buckets;

	637 }

	638

	639 // Parse the hqp_relevance_buckets_str string once and store them in vector

	640 // which is easy to access.

	641 hqp_relevance_buckets_ =

	642 new std::vector<ScoredHistoryMatch::ScoreMaxRelevance>();

	643 base::StringPairs kv_pairs;

	644 if (base::SplitStringIntoKeyValuePairs(hqp_relevance_buckets_str,

	645 ':', ',', &kv_pairs)) {

	646 for (base::StringPairs::const_iterator it = kv_pairs.begin();

	647 it != kv_pairs.end(); ++it) {

	648 ScoreMaxRelevance bucket;

	649 base::StringToDouble(it->first, &bucket.first);
	Mark P 2015/02/18 00:03:32 DCHECKS are not executed in non-debug builds, mean Show quoted text On 2015/02/17 01:23:52, ashokvardhan wrote: > On 2015/02/14 01:27:14, Mark P wrote: > > minor nit: consider DCHECKING the return value for these > > DCHECK seems to be not working here. > > std::string x = "1"; > int a = 0; > int b = 0; > base::StringToInt(x, &a); > DCHECK(base::StringToInt(x, &b)); > LOG(INFO) << "a is : " << a << std::endl; > LOG(INFO) << "b is : " << b << std::endl; > > Logs: > a is : 1 > b is : 0 > > Am i missing anything here ? I used CHECK in our code-base, but this is strange. > I can use if condition checkings but not sure why this is not working. DCHECKS are not executed in non-debug builds, meaning base::StringToInt(x, &b) is not executed. That's why b remains 0. I guess perhaps you can save the return type of the function, then DCHECK(that_value). Ashok vardhan 2015/02/18 01:11:21 I see, thanks.! Show quoted text On 2015/02/18 00:03:32, Mark P wrote: > On 2015/02/17 01:23:52, ashokvardhan wrote: > > On 2015/02/14 01:27:14, Mark P wrote: > > > minor nit: consider DCHECKING the return value for these > > > > DCHECK seems to be not working here. > > > > std::string x = "1"; > > int a = 0; > > int b = 0; > > base::StringToInt(x, &a); > > DCHECK(base::StringToInt(x, &b)); > > > LOG(INFO) << "a is : " << a << std::endl; > > LOG(INFO) << "b is : " << b << std::endl; > > > > Logs: > > a is : 1 > > b is : 0 > > > > Am i missing anything here ? I used CHECK in our code-base, but this is > strange. > > I can use if condition checkings but not sure why this is not working. > > DCHECKS are not executed in non-debug builds, meaning > base::StringToInt(x, &b) > is not executed. That's why b remains 0. I see, thanks.! Show quoted text > > I guess perhaps you can save the return type of the function, then > DCHECK(that_value). Done.
	650 base::StringToInt(it->second, &bucket.second);

	651 hqp_relevance_buckets_->push_back(bucket);

	652 }

	653 }

	654 }

	655

	656 // static

585 void ScoredHistoryMatch::Init() {	657 void ScoredHistoryMatch::Init() {

586 if (initialized_)	658 if (initialized_)

587 return;	659 return;

588 also_do_hup_like_scoring_ = false;	660 also_do_hup_like_scoring_ = false;

589 // When doing HUP-like scoring, don't allow a non-inlineable match	661 // When doing HUP-like scoring, don't allow a non-inlineable match

590 // to beat the score of good inlineable matches. This is a problem	662 // to beat the score of good inlineable matches. This is a problem

591 // because if a non-inlineable match ends up with the highest score	663 // because if a non-inlineable match ends up with the highest score

592 // from HistoryQuick provider, all HistoryQuick matches get demoted	664 // from HistoryQuick provider, all HistoryQuick matches get demoted

593 // to non-inlineable scores (scores less than 1200). Without	665 // to non-inlineable scores (scores less than 1200). Without

594 // HUP-like-scoring, these results would actually come from the HUP	666 // HUP-like-scoring, these results would actually come from the HUP

595 // and not be demoted, thus outscoring the demoted HQP results.	667 // and not be demoted, thus outscoring the demoted HQP results.

596 // When the HQP provides these, we need to clamp the non-inlineable	668 // When the HQP provides these, we need to clamp the non-inlineable

597 // results to preserve this behavior.	669 // results to preserve this behavior.

598 if (also_do_hup_like_scoring_) {	670 if (also_do_hup_like_scoring_) {

599 max_assigned_score_for_non_inlineable_matches_ =	671 max_assigned_score_for_non_inlineable_matches_ =

600 HistoryURLProvider::kScoreForBestInlineableResult - 1;	672 HistoryURLProvider::kScoreForBestInlineableResult - 1;

601 }	673 }

602 bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue();	674 bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue();

603 allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue();	675 allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue();

604 allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue();	676 allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue();

	677

	678 // Initialize the HQP Experimental scoring params.

	679 InitializeHQPExperimentalParams();

	680

605 initialized_ = true;	681 initialized_ = true;

606 }	682 }

607	683

608 } // namespace history	684 } // namespace history

OLD	NEW