Index: components/history/core/browser/scored_history_match.cc |
diff --git a/components/history/core/browser/scored_history_match.cc b/components/history/core/browser/scored_history_match.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..f9c0396095b6003928464884c710cec3750ab2e0 |
--- /dev/null |
+++ b/components/history/core/browser/scored_history_match.cc |
@@ -0,0 +1,399 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "components/history/core/browser/scored_history_match.h" |
+ |
+#include <algorithm> |
+ |
+#include "base/strings/string_util.h" |
+#include "base/strings/utf_offset_string_conversions.h" |
+#include "components/history/core/browser/scored_history_match_client.h" |
+#include "url/gurl.h" |
+ |
+namespace history { |
+ |
+// static |
+const size_t ScoredHistoryMatch::kMaxVisitsToScore = 10; |
+ |
+ScoredHistoryMatch::ScoredHistoryMatch() : raw_score_(0), can_inline_(false) { |
+} |
+ |
+ScoredHistoryMatch::ScoredHistoryMatch( |
+ const URLRow& row, |
+ const VisitInfoVector& visits, |
+ const std::string& languages, |
+ const base::string16& lower_string, |
+ const String16Vector& terms, |
+ const WordStarts& terms_to_word_starts_offsets, |
+ const RowWordStarts& word_starts, |
+ const base::Time now, |
+ const ScoredHistoryMatchClient* client) |
+ : HistoryMatch(row, 0, false, false), raw_score_(0), can_inline_(false) { |
+ GURL gurl = row.url(); |
+ if (!gurl.is_valid()) |
+ return; |
+ |
+ // Figure out where each search term appears in the URL and/or page title |
+ // so that we can score as well as provide autocomplete highlighting. |
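+  // For example, given terms {"go", "new"} and the URL
+  // http://google.com/news, "go" matches in the hostname and "new" in the
+  // path; any term found in neither URL nor title rejects the match.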
+ base::OffsetAdjuster::Adjustments adjustments; |
+ base::string16 title = row.title(); |
+ base::string16 url = client->CleanUpUrlAndTitleForMatching( |
+ gurl, languages, &adjustments, &title); |
+ int term_num = 0; |
+ for (const auto& term : terms) { |
+ TermMatches url_term_matches = MatchTermInString(term, url, term_num); |
+ TermMatches title_term_matches = MatchTermInString(term, title, term_num); |
+ if (url_term_matches.empty() && title_term_matches.empty()) |
+ return; // A term was not found in either URL or title - reject. |
+ url_matches_.insert(url_matches_.end(), url_term_matches.begin(), |
+ url_term_matches.end()); |
+ title_matches_.insert(title_matches_.end(), title_term_matches.begin(), |
+ title_term_matches.end()); |
+ ++term_num; |
+ } |
+ |
+ // Sort matches by offset and eliminate any which overlap. |
+ // TODO(mpearson): Investigate whether this has any meaningful |
+ // effect on scoring. (It's necessary at some point: removing |
+ // overlaps and sorting is needed to decide what to highlight in the |
+ // suggestion string. But this sort and de-overlap doesn't have to |
+ // be done before scoring.) |
+ url_matches_ = SortAndDeoverlapMatches(url_matches_); |
+ title_matches_ = SortAndDeoverlapMatches(title_matches_); |
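+  // (For instance, terms "foo" and "oo" both hit "food" at overlapping
+  // offsets 0 and 1; after de-overlapping, only the match starting at
+  // offset 0 survives for highlighting.)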
+ |
+  // Only try to inline-autocomplete the match if we have a single term, at
+  // least one URL match, and no trailing whitespace in the input (trailing
+  // whitespace means the user has started a second, still-empty term).
+  if (!url_matches_.empty() && terms.size() == 1 &&
+      !IsWhitespace(*lower_string.rbegin())) {
+    can_inline_ =
+        client->CanInlineAutocompleteMatch(gurl, terms[0], &innermost_match);
+  }
+ |
+ const float topicality_score = GetTopicalityScore( |
+ terms.size(), url, terms_to_word_starts_offsets, word_starts, client); |
+  const float frequency_score =
+      GetFrequency(now, client->IsBookmarked(gurl), visits, client);
+  const float final_score =
+      GetFinalRelevancyScore(topicality_score, frequency_score);
+  raw_score_ = (final_score <= kint32max) ? static_cast<int>(final_score)
+                                          : kint32max;
+ |
+ // Now that we're done processing this entry, correct the offsets of the |
+ // matches in |url_matches_| so they point to offsets in the original URL |
+ // spec, not the cleaned-up URL string that we used for matching. |
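+  // (For example, if clean-up stripped "http://", a match at offset 0 in
+  // the cleaned string maps back to offset 7 in the original spec.)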
+ std::vector<size_t> offsets = OffsetsFromTermMatches(url_matches_); |
+ base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets); |
+ url_matches_ = ReplaceOffsetsInTermMatches(url_matches_, offsets); |
+} |
+ |
+ScoredHistoryMatch::~ScoredHistoryMatch() {} |
+ |
+// Comparison function for sorting ScoredHistoryMatches by their scores with
+// intelligent tie-breaking.
+bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1, |
+ const ScoredHistoryMatch& m2) { |
+ if (m1.raw_score() != m2.raw_score()) |
+ return m1.raw_score() > m2.raw_score(); |
+ |
+ // This tie-breaking logic is inspired by / largely copied from the |
+ // ordering logic in history_url_provider.cc CompareHistoryMatch(). |
+ |
+ // A URL that has been typed at all is better than one that has never been |
+ // typed. (Note "!"s on each side.) |
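+  // (E.g., a URL typed once beats one visited dozens of times but never
+  // typed.)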
+ if (!m1.url_info.typed_count() != !m2.url_info.typed_count()) |
+ return m1.url_info.typed_count() > m2.url_info.typed_count(); |
+ |
+ // Innermost matches (matches after any scheme or "www.") are better than |
+ // non-innermost matches. |
+ if (m1.innermost_match != m2.innermost_match) |
+ return m1.innermost_match; |
+ |
+ // URLs that have been typed more often are better. |
+ if (m1.url_info.typed_count() != m2.url_info.typed_count()) |
+ return m1.url_info.typed_count() > m2.url_info.typed_count(); |
+ |
+  // For URLs that have each been typed once, a bare host is better than a
+  // page inside the host.
+ if (m1.url_info.typed_count() == 1) { |
+ if (m1.IsHostOnly() != m2.IsHostOnly()) |
+ return m1.IsHostOnly(); |
+ } |
+ |
+ // URLs that have been visited more often are better. |
+ if (m1.url_info.visit_count() != m2.url_info.visit_count()) |
+ return m1.url_info.visit_count() > m2.url_info.visit_count(); |
+ |
+ // URLs that have been visited more recently are better. |
+ return m1.url_info.last_visit() > m2.url_info.last_visit(); |
+} |
+ |
+// static |
+TermMatches ScoredHistoryMatch::FilterTermMatchesByWordStarts( |
+ const TermMatches& term_matches, |
+ const WordStarts& terms_to_word_starts_offsets, |
+ const WordStarts& word_starts, |
+ size_t start_pos, |
+ size_t end_pos) { |
+ // Return early if no filtering is needed. |
+ if (start_pos == std::string::npos) |
+ return term_matches; |
+ TermMatches filtered_matches; |
+ WordStarts::const_iterator next_word_starts = word_starts.begin(); |
+ WordStarts::const_iterator end_word_starts = word_starts.end(); |
+ for (const auto& term_match : term_matches) { |
+ const size_t term_offset = |
+ terms_to_word_starts_offsets[term_match.term_num]; |
+ // Advance next_word_starts until it's >= the position of the term we're |
+ // considering (adjusted for where the word begins within the term). |
+ while ((next_word_starts != end_word_starts) && |
+ (*next_word_starts < (term_match.offset + term_offset))) |
+ ++next_word_starts; |
+ // Add the match if it's before the position we start filtering at or |
+ // after the position we stop filtering at (assuming we have a position |
+ // to stop filtering at) or if it's at a word boundary. |
+ if ((term_match.offset < start_pos) || |
+ ((end_pos != std::string::npos) && (term_match.offset >= end_pos)) || |
+ ((next_word_starts != end_word_starts) && |
+ (*next_word_starts == term_match.offset + term_offset))) |
+ filtered_matches.push_back(term_match); |
+ } |
+ return filtered_matches; |
+} |
+ |
+float ScoredHistoryMatch::GetTopicalityScore( |
+ const int num_terms, |
+ const base::string16& url, |
+ const WordStarts& terms_to_word_starts_offsets, |
+ const RowWordStarts& word_starts, |
+ const ScoredHistoryMatchClient* client) { |
+ // A vector that accumulates per-term scores. The strongest match--a |
+ // match in the hostname at a word boundary--is worth 10 points. |
+ // Everything else is less. In general, a match that's not at a word |
+ // boundary is worth about 1/4th or 1/5th of a match at the word boundary |
+ // in the same part of the URL/title. |
+ DCHECK_GT(num_terms, 0); |
+ std::vector<int> term_scores(num_terms, 0); |
+ WordStarts::const_iterator next_word_starts = |
+ word_starts.url_word_starts_.begin(); |
+ WordStarts::const_iterator end_word_starts = |
+ word_starts.url_word_starts_.end(); |
+ const size_t question_mark_pos = url.find('?'); |
+ const size_t colon_pos = url.find(':'); |
+ // The + 3 skips the // that probably appears in the protocol |
+ // after the colon. If the protocol doesn't have two slashes after |
+ // the colon, that's okay--all this ends up doing is starting our |
+ // search for the next / a few characters into the hostname. The |
+  // only time this can cause a problem is when we have a protocol without
+  // a // after the colon and the hostname is only one or two characters.
+ // This isn't worth worrying about. |
+ const size_t end_of_hostname_pos = (colon_pos != std::string::npos) ? |
+ url.find('/', colon_pos + 3) : url.find('/'); |
+ size_t last_part_of_hostname_pos = |
+ (end_of_hostname_pos != std::string::npos) ? |
+ url.rfind('.', end_of_hostname_pos) : url.rfind('.'); |
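+  // For example, if the cleaned-up URL were "http://www.example.com/path",
+  // colon_pos would be 4, end_of_hostname_pos 22 (the '/' before "path"),
+  // and last_part_of_hostname_pos 18 (the '.' before "com").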
+  // Loop through all URL matches and score them appropriately.
+  // First, filter out matches in the path (or later) that are not at word
+  // boundaries.
+  url_matches_ = FilterTermMatchesByWordStarts(
+      url_matches_, terms_to_word_starts_offsets, word_starts.url_word_starts_,
+      end_of_hostname_pos, std::string::npos);
+ if (colon_pos != std::string::npos) { |
+ // Also filter matches not at a word boundary and in the scheme. |
+ url_matches_ = FilterTermMatchesByWordStarts( |
+ url_matches_, terms_to_word_starts_offsets, |
+ word_starts.url_word_starts_, 0, colon_pos); |
+ } |
+ const bool allow_tld_matches = client->AllowTldMatches(); |
+ const bool allow_scheme_matches = client->AllowSchemeMatches(); |
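+  // Per-occurrence weights below: a word-boundary hit scores 10 in the
+  // hostname, 8 in the path, 5 in the query; a mid-word hostname hit scores
+  // 2; TLD and scheme hits count only when the client allows them.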
+ for (const auto& url_match : url_matches_) { |
+ const size_t term_offset = terms_to_word_starts_offsets[url_match.term_num]; |
+ // Advance next_word_starts until it's >= the position of the term we're |
+ // considering (adjusted for where the word begins within the term). |
+ while ((next_word_starts != end_word_starts) && |
+ (*next_word_starts < (url_match.offset + term_offset))) { |
+ ++next_word_starts; |
+ } |
+ const bool at_word_boundary = |
+ (next_word_starts != end_word_starts) && |
+ (*next_word_starts == url_match.offset + term_offset); |
+ if ((question_mark_pos != std::string::npos) && |
+ (url_match.offset > question_mark_pos)) { |
+      // The match is in the query (the CGI ?... portion).
+ DCHECK(at_word_boundary); |
+ term_scores[url_match.term_num] += 5; |
+ } else if ((end_of_hostname_pos != std::string::npos) && |
+ (url_match.offset > end_of_hostname_pos)) { |
+ // The match is in the path. |
+ DCHECK(at_word_boundary); |
+ term_scores[url_match.term_num] += 8; |
+ } else if ((colon_pos == std::string::npos) || |
+ (url_match.offset > colon_pos)) { |
+ // The match is in the hostname. |
+ if ((last_part_of_hostname_pos == std::string::npos) || |
+ (url_match.offset < last_part_of_hostname_pos)) { |
+ // Either there are no dots in the hostname or this match isn't |
+ // the last dotted component. |
+ term_scores[url_match.term_num] += at_word_boundary ? 10 : 2; |
+ } else { |
+ // The match is in the last part of a dotted hostname (usually this |
+ // is the top-level domain .com, .net, etc.). |
+ if (allow_tld_matches) |
+ term_scores[url_match.term_num] += at_word_boundary ? 10 : 0; |
+ } |
+ } else { |
+ // The match is in the protocol (a.k.a. scheme). |
+ // Matches not at a word boundary should have been filtered already. |
+ DCHECK(at_word_boundary); |
+ match_in_scheme = true; |
+ if (allow_scheme_matches) |
+ term_scores[url_match.term_num] += 10; |
+ } |
+ } |
+ // Now do the analogous loop over all matches in the title. |
+ next_word_starts = word_starts.title_word_starts_.begin(); |
+ end_word_starts = word_starts.title_word_starts_.end(); |
+ int word_num = 0; |
+ title_matches_ = FilterTermMatchesByWordStarts( |
+ title_matches_, terms_to_word_starts_offsets, |
+ word_starts.title_word_starts_, 0, std::string::npos); |
+ for (const auto& title_match : title_matches_) { |
+ const size_t term_offset = |
+ terms_to_word_starts_offsets[title_match.term_num]; |
+ // Advance next_word_starts until it's >= the position of the term we're |
+ // considering (adjusted for where the word begins within the term). |
+ while ((next_word_starts != end_word_starts) && |
+ (*next_word_starts < (title_match.offset + term_offset))) { |
+ ++next_word_starts; |
+ ++word_num; |
+ } |
+    if (word_num >= 10)
+      break;  // Only score matches in the first ten title words.
+ DCHECK(next_word_starts != end_word_starts); |
+ DCHECK_EQ(*next_word_starts, title_match.offset + term_offset) |
+ << "not at word boundary"; |
+ term_scores[title_match.term_num] += 8; |
+ } |
+ // TODO(mpearson): Restore logic for penalizing out-of-order matches. |
+ // (Perhaps discount them by 0.8?) |
+ // TODO(mpearson): Consider: if the earliest match occurs late in the string, |
+ // should we discount it? |
+ // TODO(mpearson): Consider: do we want to score based on how much of the |
+ // input string the input covers? (I'm leaning toward no.) |
+ |
+ // Compute the topicality_score as the sum of transformed term_scores. |
+ float topicality_score = 0; |
+ for (int term_score : term_scores) { |
+ // Drop this URL if it seems like a term didn't appear or, more precisely, |
+ // didn't appear in a part of the URL or title that we trust enough |
+ // to give it credit for. For instance, terms that appear in the middle |
+ // of a CGI parameter get no credit. Almost all the matches dropped |
+ // due to this test would look stupid if shown to the user. |
+ if (term_score == 0) |
+ return 0; |
+ topicality_score += |
+ client->GetTopicalityScoreFromRawScore(term_score); |
+ } |
+ // TODO(mpearson): If there are multiple terms, consider taking the |
+ // geometric mean of per-term scores rather than the arithmetic mean. |
+ |
+ return topicality_score / num_terms; |
+} |
+ |
+// static |
+float ScoredHistoryMatch::GetFrequency(const base::Time& now, |
+ const bool bookmarked, |
+ const VisitInfoVector& visits, |
+ const ScoredHistoryMatchClient* client) { |
+ // Compute the weighted average |value_of_transition| over the last at |
+ // most kMaxVisitsToScore visits, where each visit is weighted using |
+ // GetRecencyScore() based on how many days ago it happened. Use |
+ // kMaxVisitsToScore as the denominator for the average regardless of |
+ // how many visits there were in order to penalize a match that has |
+ // fewer visits than kMaxVisitsToScore. |
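+  // For example, a single typed visit yesterday contributes roughly
+  // 20 * 1.0 = 20 points, for a frequency score of 20 / 10 = 2.0; ten
+  // plain visits today contribute about 1 * 1.0 each, for 10 / 10 = 1.0.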
+ float summed_visit_points = 0; |
+ const int bookmark_value = client->BookmarkValue(); |
+ size_t visits_to_score = std::min(visits.size(), kMaxVisitsToScore); |
+ for (size_t i = 0; i < visits_to_score; ++i) { |
+ int value_of_transition = |
+ (visits[i].second == ui::PAGE_TRANSITION_TYPED) ? 20 : 1; |
+ if (bookmarked) |
+ value_of_transition = std::max(value_of_transition, bookmark_value); |
+ const float bucket_weight = |
+ client->GetRecencyScore((now - visits[i].first).InDays()); |
+ summed_visit_points += (value_of_transition * bucket_weight); |
+ } |
+  return summed_visit_points / kMaxVisitsToScore;
+} |
+ |
+// static |
+float ScoredHistoryMatch::GetFinalRelevancyScore(float topicality_score, |
+ float frequency_score) { |
+ if (topicality_score == 0) |
+ return 0; |
+ // Here's how to interpret intermediate_score: Suppose the omnibox |
+ // has one input term. Suppose we have a URL for which the omnibox |
+ // input term has a single URL hostname hit at a word boundary. (This |
+ // implies topicality_score = 1.0.). Then the intermediate_score for |
+ // this URL will depend entirely on the frequency_score with |
+ // this interpretation: |
+ // - a single typed visit more than three months ago, no other visits -> 0.2 |
+ // - a visit every three days, no typed visits -> 0.706 |
+ // - a visit every day, no typed visits -> 0.916 |
+ // - a single typed visit yesterday, no other visits -> 2.0 |
+ // - a typed visit once a week -> 11.77 |
+ // - a typed visit every three days -> 14.12 |
+ // - at least ten typed visits today -> 20.0 (maximum score) |
+ const float intermediate_score = topicality_score * frequency_score; |
+ // The below code maps intermediate_score to the range [0, 1399]. |
+ // The score maxes out at 1400 (i.e., cannot beat a good inline result). |
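+  // For example, an intermediate_score of 2.0 (a perfect single-term
+  // hostname hit plus one typed visit yesterday) falls in the second
+  // segment below: 600 + (700 / 10.5) * (2.0 - 1.5), roughly 633.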
+  if (intermediate_score <= 1.5f) {
+    // Linearly extrapolate between 0 and 1.5 so 0 has a score of 400
+    // and 1.5 has a score of 600.
+    const float slope = (600 - 400) / (1.5f - 0.0f);
+    return 400 + slope * intermediate_score;
+  }
+  if (intermediate_score <= 12.0f) {
+    // Linearly extrapolate up to 12 so 12 has a score of 1300.
+    const float slope = (1300 - 600) / (12.0f - 1.5f);
+    return 600 + slope * (intermediate_score - 1.5f);
+  }
+ } |
+ // Linearly extrapolate so a score of 20 (or more) has a score of 1399. |
+ // (Scores above 20 are possible for URLs that have multiple term hits |
+ // in the URL and/or title and that are visited practically all |
+ // the time using typed visits. We don't attempt to distinguish |
+ // between these very good results.) |
+ const float slope = (1399 - 1300) / (20.0f - 12.0f); |
+  return std::min(1399.0f, 1300 + slope * (intermediate_score - 12.0f));
+} |
+ |
+} // namespace history |