Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(229)

Side by Side Diff: components/omnibox/browser/scored_history_match.cc

Issue 2548363010: Omnibox - Refactor |relevance_buckets| to Remove Memory Leak on Exit (Closed)
Patch Set: polish Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/omnibox/browser/scored_history_match.h" 5 #include "components/omnibox/browser/scored_history_match.h"
6 6
7 #include <math.h> 7 #include <math.h>
8 8
9 #include <algorithm> 9 #include <algorithm>
10 #include <vector> 10 #include <vector>
11 11
12 #include "base/logging.h" 12 #include "base/logging.h"
13 #include "base/macros.h"
13 #include "base/numerics/safe_conversions.h" 14 #include "base/numerics/safe_conversions.h"
14 #include "base/strings/string_number_conversions.h" 15 #include "base/strings/string_number_conversions.h"
15 #include "base/strings/string_split.h" 16 #include "base/strings/string_split.h"
16 #include "base/strings/string_util.h" 17 #include "base/strings/string_util.h"
17 #include "base/strings/utf_offset_string_conversions.h" 18 #include "base/strings/utf_offset_string_conversions.h"
18 #include "base/strings/utf_string_conversions.h" 19 #include "base/strings/utf_string_conversions.h"
19 #include "components/bookmarks/browser/bookmark_utils.h" 20 #include "components/bookmarks/browser/bookmark_utils.h"
20 #include "components/omnibox/browser/history_url_provider.h" 21 #include "components/omnibox/browser/history_url_provider.h"
21 #include "components/omnibox/browser/omnibox_field_trial.h" 22 #include "components/omnibox/browser/omnibox_field_trial.h"
22 #include "components/omnibox/browser/url_prefix.h" 23 #include "components/omnibox/browser/url_prefix.h"
23 24
24 namespace { 25 namespace {
25 26
26 // The number of days of recency scores to precompute. 27 // The number of days of recency scores to precompute.
27 const int kDaysToPrecomputeRecencyScoresFor = 366; 28 const int kDaysToPrecomputeRecencyScoresFor = 366;
28 29
29 // The number of raw term score buckets use; raw term scores greater this are 30 // The number of raw term score buckets use; raw term scores greater this are
30 // capped at the score of the largest bucket. 31 // capped at the score of the largest bucket.
31 const int kMaxRawTermScore = 30; 32 const int kMaxRawTermScore = 30;
32 33
34 // Default topicality threshold. See GetTopicalityScore() for details.
35 float kDefaultTopicalityThreshold = 0.8;
36
37 // Default relevance buckets. See GetFinalRelevancyScore() for more details on
38 // these numbers.
39 const char kDefaultRelevanceBucketsString[] =
40 "0.0:400,1.5:600,5.0:900,10.5:1203,15.0:1300,20.0:1399";
Peter Kasting 2016/12/08 01:29:21 Nit: Since both these constants are only used in o
Mark P 2016/12/09 20:40:56 I prefer meaningful, significant constants be decl
Peter Kasting 2016/12/10 02:31:15 You mean, like, up here? :) It seems harder to r
Mark P 2016/12/11 04:57:22 Touché.
41
33 // Pre-computed information to speed up calculating recency scores. 42 // Pre-computed information to speed up calculating recency scores.
34 // |days_ago_to_recency_score| is a simple array mapping how long ago a page was 43 // |days_ago_to_recency_score| is a simple array mapping how long ago a page was
35 // visited (in days) to the recency score we should assign it. This allows easy 44 // visited (in days) to the recency score we should assign it. This allows easy
36 // lookups of scores without requiring math. This is initialized by 45 // lookups of scores without requiring math. This is initialized by
37 // InitDaysAgoToRecencyScoreArray called by 46 // InitDaysAgoToRecencyScoreArray called by
38 // ScoredHistoryMatch::Init(). 47 // ScoredHistoryMatch::Init().
39 float days_ago_to_recency_score[kDaysToPrecomputeRecencyScoresFor]; 48 float days_ago_to_recency_score[kDaysToPrecomputeRecencyScoresFor];
40 49
41 // Pre-computed information to speed up calculating topicality scores. 50 // Pre-computed information to speed up calculating topicality scores.
42 // |raw_term_score_to_topicality_score| is a simple array mapping how raw terms 51 // |raw_term_score_to_topicality_score| is a simple array mapping how raw terms
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
106 // static 115 // static
107 bool ScoredHistoryMatch::also_do_hup_like_scoring_; 116 bool ScoredHistoryMatch::also_do_hup_like_scoring_;
108 float ScoredHistoryMatch::bookmark_value_; 117 float ScoredHistoryMatch::bookmark_value_;
109 float ScoredHistoryMatch::typed_value_; 118 float ScoredHistoryMatch::typed_value_;
110 bool ScoredHistoryMatch::fix_few_visits_bug_; 119 bool ScoredHistoryMatch::fix_few_visits_bug_;
111 bool ScoredHistoryMatch::frequency_uses_sum_; 120 bool ScoredHistoryMatch::frequency_uses_sum_;
112 size_t ScoredHistoryMatch::max_visits_to_score_; 121 size_t ScoredHistoryMatch::max_visits_to_score_;
113 bool ScoredHistoryMatch::allow_tld_matches_; 122 bool ScoredHistoryMatch::allow_tld_matches_;
114 bool ScoredHistoryMatch::allow_scheme_matches_; 123 bool ScoredHistoryMatch::allow_scheme_matches_;
115 size_t ScoredHistoryMatch::num_title_words_to_allow_; 124 size_t ScoredHistoryMatch::num_title_words_to_allow_;
116 bool ScoredHistoryMatch::hqp_experimental_scoring_enabled_; 125 float ScoredHistoryMatch::topicality_threshold_;
117
118 // Default topicality threshold. See GetTopicalityScore() for details.
119 float ScoredHistoryMatch::topicality_threshold_ = 0.8f;
120
121 // Default HQP relevance buckets. See GetFinalRelevancyScore() for more details
122 // on these numbers.
123 char ScoredHistoryMatch::hqp_relevance_buckets_str_[] =
124 "0.0:400,1.5:600,5.0:900,10.5:1203,15.0:1300,20.0:1399";
125 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>* 126 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>*
126 ScoredHistoryMatch::hqp_relevance_buckets_ = nullptr; 127 ScoredHistoryMatch::relevance_buckets_ = nullptr;
127 128
128 ScoredHistoryMatch::ScoredHistoryMatch() 129 ScoredHistoryMatch::ScoredHistoryMatch()
129 : ScoredHistoryMatch(history::URLRow(), 130 : ScoredHistoryMatch(history::URLRow(),
130 VisitInfoVector(), 131 VisitInfoVector(),
131 base::string16(), 132 base::string16(),
132 String16Vector(), 133 String16Vector(),
133 WordStarts(), 134 WordStarts(),
134 RowWordStarts(), 135 RowWordStarts(),
135 false, 136 false,
136 base::Time::Max()) { 137 base::Time::Max()) {
(...skipping 117 matching lines...) Expand 10 before | Expand all | Expand 10 after
254 likely_can_inline = true; 255 likely_can_inline = true;
255 innermost_match = (best_inlineable_prefix->num_components == 256 innermost_match = (best_inlineable_prefix->num_components ==
256 best_prefix->num_components); 257 best_prefix->num_components);
257 } 258 }
258 } 259 }
259 } 260 }
260 261
261 const float topicality_score = GetTopicalityScore( 262 const float topicality_score = GetTopicalityScore(
262 terms_vector.size(), url, terms_to_word_starts_offsets, word_starts); 263 terms_vector.size(), url, terms_to_word_starts_offsets, word_starts);
263 const float frequency_score = GetFrequency(now, is_url_bookmarked, visits); 264 const float frequency_score = GetFrequency(now, is_url_bookmarked, visits);
264 raw_score = base::saturated_cast<int>(GetFinalRelevancyScore( 265 raw_score = base::saturated_cast<int>(
265 topicality_score, frequency_score, *hqp_relevance_buckets_)); 266 GetFinalRelevancyScore(topicality_score, frequency_score));
266 267
267 if (also_do_hup_like_scoring_ && likely_can_inline) { 268 if (also_do_hup_like_scoring_ && likely_can_inline) {
268 // HistoryURL-provider-like scoring gives any match that is 269 // HistoryURL-provider-like scoring gives any match that is
269 // capable of being inlined a certain minimum score. Some of these 270 // capable of being inlined a certain minimum score. Some of these
270 // are given a higher score that lets them be shown in inline. 271 // are given a higher score that lets them be shown in inline.
271 // This test here derives from the test in 272 // This test here derives from the test in
272 // HistoryURLProvider::PromoteMatchForInlineAutocomplete(). 273 // HistoryURLProvider::PromoteMatchForInlineAutocomplete().
273 const bool promote_to_inline = 274 const bool promote_to_inline =
274 (row.typed_count() > 1) || (IsHostOnly() && (row.typed_count() == 1)); 275 (row.typed_count() > 1) || (IsHostOnly() && (row.typed_count() == 1));
275 int hup_like_score = 276 int hup_like_score =
(...skipping 132 matching lines...) Expand 10 before | Expand all | Expand 10 after
408 initialized = true; 409 initialized = true;
409 also_do_hup_like_scoring_ = OmniboxFieldTrial::HQPAlsoDoHUPLikeScoring(); 410 also_do_hup_like_scoring_ = OmniboxFieldTrial::HQPAlsoDoHUPLikeScoring();
410 bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue(); 411 bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue();
411 typed_value_ = OmniboxFieldTrial::HQPTypedValue(); 412 typed_value_ = OmniboxFieldTrial::HQPTypedValue();
412 max_visits_to_score_ = OmniboxFieldTrial::HQPMaxVisitsToScore(); 413 max_visits_to_score_ = OmniboxFieldTrial::HQPMaxVisitsToScore();
413 frequency_uses_sum_ = OmniboxFieldTrial::HQPFreqencyUsesSum(); 414 frequency_uses_sum_ = OmniboxFieldTrial::HQPFreqencyUsesSum();
414 fix_few_visits_bug_ = OmniboxFieldTrial::HQPFixFewVisitsBug(); 415 fix_few_visits_bug_ = OmniboxFieldTrial::HQPFixFewVisitsBug();
415 allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue(); 416 allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue();
416 allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue(); 417 allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue();
417 num_title_words_to_allow_ = OmniboxFieldTrial::HQPNumTitleWordsToAllow(); 418 num_title_words_to_allow_ = OmniboxFieldTrial::HQPNumTitleWordsToAllow();
419 topicality_threshold_ =
420 OmniboxFieldTrial::HQPExperimentalScoringEnabled()
421 ? OmniboxFieldTrial::HQPExperimentalTopicalityThreshold()
422 : kDefaultTopicalityThreshold;
Peter Kasting 2016/12/08 01:29:21 Nit: Should HQPExperimentalTopicalityThreshold() r
Mark P 2016/12/09 20:40:56 Sure. I had some concerns about current experimen
418 423
419 InitRawTermScoreToTopicalityScoreArray(); 424 InitRawTermScoreToTopicalityScoreArray();
420 InitDaysAgoToRecencyScoreArray(); 425 InitDaysAgoToRecencyScoreArray();
421 InitHQPExperimentalParams();
422 } 426 }
423 427
424 float ScoredHistoryMatch::GetTopicalityScore( 428 float ScoredHistoryMatch::GetTopicalityScore(
425 const int num_terms, 429 const int num_terms,
426 const base::string16& url, 430 const base::string16& url,
427 const WordStarts& terms_to_word_starts_offsets, 431 const WordStarts& terms_to_word_starts_offsets,
428 const RowWordStarts& word_starts) { 432 const RowWordStarts& word_starts) {
429 // A vector that accumulates per-term scores. The strongest match--a 433 // A vector that accumulates per-term scores. The strongest match--a
430 // match in the hostname at a word boundary--is worth 10 points. 434 // match in the hostname at a word boundary--is worth 10 points.
431 // Everything else is less. In general, a match that's not at a word 435 // Everything else is less. In general, a match that's not at a word
(...skipping 178 matching lines...) Expand 10 before | Expand all | Expand 10 after
610 // Use |max_visits_to_score_| as the denominator for the average regardless of 614 // Use |max_visits_to_score_| as the denominator for the average regardless of
611 // how many visits there were in order to penalize a match that has 615 // how many visits there were in order to penalize a match that has
612 // fewer visits than kMaxVisitsToScore. 616 // fewer visits than kMaxVisitsToScore.
613 if (fix_few_visits_bug_) 617 if (fix_few_visits_bug_)
614 return summed_visit_points / ScoredHistoryMatch::max_visits_to_score_; 618 return summed_visit_points / ScoredHistoryMatch::max_visits_to_score_;
615 return visits.size() * summed_visit_points / 619 return visits.size() * summed_visit_points /
616 ScoredHistoryMatch::max_visits_to_score_; 620 ScoredHistoryMatch::max_visits_to_score_;
617 } 621 }
618 622
619 // static 623 // static
620 float ScoredHistoryMatch::GetFinalRelevancyScore( 624 float ScoredHistoryMatch::GetFinalRelevancyScore(float topicality_score,
621 float topicality_score, 625 float frequency_score) {
622 float frequency_score, 626 // |relevance_buckets| gives a mapping from intemerdiate score to the final
623 const std::vector<ScoreMaxRelevance>& hqp_relevance_buckets) { 627 // relevance score.
624 DCHECK(hqp_relevance_buckets.size() > 0); 628 CR_DEFINE_STATIC_LOCAL(std::vector<ScoreMaxRelevance>, relevance_buckets,
625 DCHECK_EQ(hqp_relevance_buckets[0].first, 0.0); 629 (GetHQPBuckets()));
Peter Kasting 2016/12/08 01:29:21 This has the danger (that you called out in the he
Mark P 2016/12/09 20:40:56 Yeah, that's cleaner. Done.
630 relevance_buckets_ = &relevance_buckets;
631 DCHECK(relevance_buckets.size() > 0);
Peter Kasting 2016/12/08 01:29:21 Nit: DCHECK(!relevance_buckets.empty())
Mark P 2016/12/09 20:40:56 Done.
632 DCHECK_EQ(relevance_buckets[0].first, 0.0);
Peter Kasting 2016/12/08 01:29:21 Nit: (expected, actual)
Mark P 2016/12/09 20:40:56 Done.
626 633
627 if (topicality_score == 0) 634 if (topicality_score == 0)
628 return 0; 635 return 0;
629 // Here's how to interpret intermediate_score: Suppose the omnibox 636 // Here's how to interpret intermediate_score: Suppose the omnibox
630 // has one input term. Suppose we have a URL for which the omnibox 637 // has one input term. Suppose we have a URL for which the omnibox
631 // input term has a single URL hostname hit at a word boundary. (This 638 // input term has a single URL hostname hit at a word boundary. (This
632 // implies topicality_score = 1.0.). Then the intermediate_score for 639 // implies topicality_score = 1.0.). Then the intermediate_score for
633 // this URL will depend entirely on the frequency_score with 640 // this URL will depend entirely on the frequency_score with
634 // this interpretation: 641 // this interpretation:
635 // - a single typed visit more than three months ago, no other visits -> 0.2 642 // - a single typed visit more than three months ago, no other visits -> 0.2
636 // - a visit every three days, no typed visits -> 0.706 643 // - a visit every three days, no typed visits -> 0.706
637 // - a visit every day, no typed visits -> 0.916 644 // - a visit every day, no typed visits -> 0.916
638 // - a single typed visit yesterday, no other visits -> 2.0 645 // - a single typed visit yesterday, no other visits -> 2.0
639 // - a typed visit once a week -> 11.77 646 // - a typed visit once a week -> 11.77
640 // - a typed visit every three days -> 14.12 647 // - a typed visit every three days -> 14.12
641 // - at least ten typed visits today -> 20.0 (maximum score) 648 // - at least ten typed visits today -> 20.0 (maximum score)
642 // 649 //
643 // The below code maps intermediate_score to the range [0, 1399]. 650 // The below code maps intermediate_score to the range [0, 1399].
644 // For example: 651 // For example:
645 // HQP default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399" 652 // The default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399"
646 // We will linearly interpolate the scores between: 653 // We will linearly interpolate the scores between:
647 // 0 to 1.5 --> 400 to 600 654 // 0 to 1.5 --> 400 to 600
648 // 1.5 to 12.0 --> 600 to 1300 655 // 1.5 to 12.0 --> 600 to 1300
649 // 12.0 to 20.0 --> 1300 to 1399 656 // 12.0 to 20.0 --> 1300 to 1399
650 // >= 20.0 --> 1399 657 // >= 20.0 --> 1399
651 // 658 //
652 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result 659 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result
653 // from HistoryURL provider). 660 // from HistoryURL provider).
654 const float intermediate_score = topicality_score * frequency_score; 661 const float intermediate_score = topicality_score * frequency_score;
655 662
656 // Find the threshold where intermediate score is greater than bucket. 663 // Find the threshold where intermediate score is greater than bucket.
657 size_t i = 1; 664 size_t i = 1;
658 for (; i < hqp_relevance_buckets.size(); ++i) { 665 for (; i < relevance_buckets.size(); ++i) {
659 const ScoreMaxRelevance& hqp_bucket = hqp_relevance_buckets[i]; 666 const ScoreMaxRelevance& bucket = relevance_buckets[i];
660 if (intermediate_score >= hqp_bucket.first) { 667 if (intermediate_score >= bucket.first) {
661 continue; 668 continue;
662 } 669 }
663 const ScoreMaxRelevance& previous_bucket = hqp_relevance_buckets[i - 1]; 670 const ScoreMaxRelevance& previous_bucket = relevance_buckets[i - 1];
664 const float slope = ((hqp_bucket.second - previous_bucket.second) / 671 const float slope = ((bucket.second - previous_bucket.second) /
665 (hqp_bucket.first - previous_bucket.first)); 672 (bucket.first - previous_bucket.first));
666 return (previous_bucket.second + 673 return (previous_bucket.second +
667 (slope * (intermediate_score - previous_bucket.first))); 674 (slope * (intermediate_score - previous_bucket.first)));
668 } 675 }
669 // It will reach this stage when the score is > highest bucket score. 676 // It will reach this stage when the score is > highest bucket score.
670 // Return the highest bucket score. 677 // Return the highest bucket score.
671 return hqp_relevance_buckets[i - 1].second; 678 return relevance_buckets[i - 1].second;
672 } 679 }
673 680
674 // static 681 // static
675 void ScoredHistoryMatch::InitHQPExperimentalParams() { 682 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>
676 // These are default HQP relevance scoring buckets. 683 ScoredHistoryMatch::GetHQPBuckets() {
677 // See GetFinalRelevancyScore() for details. 684 std::string relevance_buckets_str(kDefaultRelevanceBucketsString);
678 std::string hqp_relevance_buckets_str = std::string( 685 if (OmniboxFieldTrial::HQPExperimentalScoringEnabled()) {
679 hqp_relevance_buckets_str_); 686 std::string experimental_scoring_buckets =
680
681 // Fetch the experiment params if they are any.
682 hqp_experimental_scoring_enabled_ =
683 OmniboxFieldTrial::HQPExperimentalScoringEnabled();
684
685 if (hqp_experimental_scoring_enabled_) {
686 // Add the topicality threshold from experiment params.
687 float hqp_experimental_topicality_threhold =
688 OmniboxFieldTrial::HQPExperimentalTopicalityThreshold();
689 topicality_threshold_ = hqp_experimental_topicality_threhold;
690
691 // Add the HQP experimental scoring buckets.
692 std::string hqp_experimental_scoring_buckets =
693 OmniboxFieldTrial::HQPExperimentalScoringBuckets(); 687 OmniboxFieldTrial::HQPExperimentalScoringBuckets();
694 if (!hqp_experimental_scoring_buckets.empty()) 688 if (!experimental_scoring_buckets.empty())
695 hqp_relevance_buckets_str = hqp_experimental_scoring_buckets; 689 relevance_buckets_str = experimental_scoring_buckets;
696 } 690 }
697 691 return GetHQPBucketsFromString(relevance_buckets_str);
698 // Parse the hqp_relevance_buckets_str string once and store them in vector
699 // which is easy to access.
700 hqp_relevance_buckets_ =
701 new std::vector<ScoredHistoryMatch::ScoreMaxRelevance>();
702
703 bool is_valid_bucket_str = GetHQPBucketsFromString(hqp_relevance_buckets_str,
704 hqp_relevance_buckets_);
705 DCHECK(is_valid_bucket_str);
706 } 692 }
707 693
708 // static 694 // static
709 bool ScoredHistoryMatch::GetHQPBucketsFromString( 695 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>
710 const std::string& buckets_str, 696 ScoredHistoryMatch::GetHQPBucketsFromString(const std::string& buckets_str) {
711 std::vector<ScoreMaxRelevance>* hqp_buckets) {
712 DCHECK(hqp_buckets != NULL);
713 DCHECK(!buckets_str.empty()); 697 DCHECK(!buckets_str.empty());
714
715 base::StringPairs kv_pairs; 698 base::StringPairs kv_pairs;
716 if (base::SplitStringIntoKeyValuePairs(buckets_str, ':', ',', &kv_pairs)) { 699 if (!base::SplitStringIntoKeyValuePairs(buckets_str, ':', ',', &kv_pairs)) {
Peter Kasting 2016/12/08 01:29:21 Nit: No {}
Mark P 2016/12/09 20:40:56 Done.
717 for (base::StringPairs::const_iterator it = kv_pairs.begin(); 700 return std::vector<ScoreMaxRelevance>();
718 it != kv_pairs.end(); ++it) {
719 ScoreMaxRelevance bucket;
720 bool is_valid_intermediate_score =
721 base::StringToDouble(it->first, &bucket.first);
722 DCHECK(is_valid_intermediate_score);
723 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);
724 DCHECK(is_valid_hqp_score);
725 hqp_buckets->push_back(bucket);
726 }
727 return true;
728 } 701 }
729 return false; 702 std::vector<ScoreMaxRelevance> hqp_buckets;
703 for (base::StringPairs::const_iterator it = kv_pairs.begin();
Mark P 2016/12/07 22:39:27 Despite the highlighting, this loop and its interi
704 it != kv_pairs.end(); ++it) {
705 ScoreMaxRelevance bucket;
706 bool is_valid_intermediate_score =
707 base::StringToDouble(it->first, &bucket.first);
708 DCHECK(is_valid_intermediate_score);
709 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);
710 DCHECK(is_valid_hqp_score);
711 hqp_buckets.push_back(bucket);
712 }
713 return hqp_buckets;
730 } 714 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698