Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(213)

Unified Diff: components/history/core/browser/scored_history_match.cc

Issue 903493002: Componentize ScoredHistoryMatch (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Rebase Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/history/core/browser/scored_history_match.cc
diff --git a/components/history/core/browser/scored_history_match.cc b/components/history/core/browser/scored_history_match.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9c0396095b6003928464884c710cec3750ab2e0
--- /dev/null
+++ b/components/history/core/browser/scored_history_match.cc
@@ -0,0 +1,371 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/history/core/browser/scored_history_match.h"
+
+#include <algorithm>
+
+#include "base/strings/string_util.h"
+#include "base/strings/utf_offset_string_conversions.h"
+#include "components/history/core/browser/scored_history_match_client.h"
+#include "url/gurl.h"
+
+namespace history {
+
// static
// Upper bound on how many (most recent) visits GetFrequency() examines when
// computing the frequency component of the score.
const size_t ScoredHistoryMatch::kMaxVisitsToScore = 10;
+
// Default constructor: an unscored match (raw_score_ == 0) that cannot be
// inline-autocompleted.
ScoredHistoryMatch::ScoredHistoryMatch() : raw_score_(0), can_inline_(false) {
}
+
// Scoring constructor.  Matches every element of |terms| against |row|'s
// cleaned-up URL and title; if any term misses both, the constructor returns
// early and the match keeps the default raw_score_ of 0 (i.e. the row is
// rejected).  Otherwise |raw_score_| combines a topicality score (where in
// the URL/title the terms hit) with a frequency score (how often, how
// recently, and how the URL was visited).
//
// |row|: the history entry being scored.
// |visits|: (visit time, transition) pairs for |row| -- presumably ordered
//     most-recent-first; confirm against the caller.
// |languages|: accept-languages string used to clean up the URL for matching.
// |lower_string|: the user's input, lowercased.
// |terms|: the individual search terms extracted from the input.
// |terms_to_word_starts_offsets|: for each term, the offset within the term
//     at which its first word begins.
// |word_starts|: word-start offsets within |row|'s URL and title.
// |now|: the "current" time, injected by the caller.
// |client|: embedder callbacks (URL cleanup, bookmark lookup, score tables).
ScoredHistoryMatch::ScoredHistoryMatch(
    const URLRow& row,
    const VisitInfoVector& visits,
    const std::string& languages,
    const base::string16& lower_string,
    const String16Vector& terms,
    const WordStarts& terms_to_word_starts_offsets,
    const RowWordStarts& word_starts,
    const base::Time now,
    const ScoredHistoryMatchClient* client)
    : HistoryMatch(row, 0, false, false), raw_score_(0), can_inline_(false) {
  // An invalid URL can never be offered as a match; leave raw_score_ at 0.
  GURL gurl = row.url();
  if (!gurl.is_valid())
    return;

  // Figure out where each search term appears in the URL and/or page title
  // so that we can score as well as provide autocomplete highlighting.
  base::OffsetAdjuster::Adjustments adjustments;
  base::string16 title = row.title();
  base::string16 url = client->CleanUpUrlAndTitleForMatching(
      gurl, languages, &adjustments, &title);
  int term_num = 0;
  for (const auto& term : terms) {
    TermMatches url_term_matches = MatchTermInString(term, url, term_num);
    TermMatches title_term_matches = MatchTermInString(term, title, term_num);
    if (url_term_matches.empty() && title_term_matches.empty())
      return;  // A term was not found in either URL or title - reject.
    url_matches_.insert(url_matches_.end(), url_term_matches.begin(),
                        url_term_matches.end());
    title_matches_.insert(title_matches_.end(), title_term_matches.begin(),
                          title_term_matches.end());
    ++term_num;
  }

  // Sort matches by offset and eliminate any which overlap.
  // TODO(mpearson): Investigate whether this has any meaningful
  // effect on scoring.  (It's necessary at some point: removing
  // overlaps and sorting is needed to decide what to highlight in the
  // suggestion string.  But this sort and de-overlap doesn't have to
  // be done before scoring.)
  url_matches_ = SortAndDeoverlapMatches(url_matches_);
  title_matches_ = SortAndDeoverlapMatches(title_matches_);

  // Only try to inline-autocomplete the match if we have a single term, some
  // URL matches, and no trailing whitespace in the request.
  // NOTE(review): *lower_string.rbegin() assumes |lower_string| is non-empty
  // here -- presumably guaranteed because |terms| (split from it) is
  // non-empty; confirm at the call site.
  if (!url_matches_.empty() && terms.size() == 1 &&
      !IsWhitespace(*lower_string.rbegin())) {
    can_inline_ =
        client->CanInlineAutocompleteMatch(gurl, terms[0], &innermost_match);
  }

  // Combine topicality and frequency into the final relevance, then clamp
  // the result into int range before storing it in |raw_score_|.
  // NOTE(review): "IsBoomarked" looks like a typo for "IsBookmarked" in the
  // client interface -- confirm against scored_history_match_client.h.
  const float topicality_score = GetTopicalityScore(
      terms.size(), url, terms_to_word_starts_offsets, word_starts, client);
  const float frequency_score =
      GetFrequency(now, client->IsBoomarked(gurl), visits, client);
  raw_score_ = GetFinalRelevancyScore(topicality_score, frequency_score);
  raw_score_ =
      (raw_score_ <= kint32max) ? static_cast<int>(raw_score_) : kint32max;

  // Now that we're done processing this entry, correct the offsets of the
  // matches in |url_matches_| so they point to offsets in the original URL
  // spec, not the cleaned-up URL string that we used for matching.
  std::vector<size_t> offsets = OffsetsFromTermMatches(url_matches_);
  base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
  url_matches_ = ReplaceOffsetsInTermMatches(url_matches_, offsets);
}
+
+ScoredHistoryMatch::~ScoredHistoryMatch() {}
+
+// Comparison function for sorting ScoredMatches by their scores with
+// intelligent tie-breaking.
+bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1,
+ const ScoredHistoryMatch& m2) {
+ if (m1.raw_score() != m2.raw_score())
+ return m1.raw_score() > m2.raw_score();
+
+ // This tie-breaking logic is inspired by / largely copied from the
+ // ordering logic in history_url_provider.cc CompareHistoryMatch().
+
+ // A URL that has been typed at all is better than one that has never been
+ // typed. (Note "!"s on each side.)
+ if (!m1.url_info.typed_count() != !m2.url_info.typed_count())
+ return m1.url_info.typed_count() > m2.url_info.typed_count();
+
+ // Innermost matches (matches after any scheme or "www.") are better than
+ // non-innermost matches.
+ if (m1.innermost_match != m2.innermost_match)
+ return m1.innermost_match;
+
+ // URLs that have been typed more often are better.
+ if (m1.url_info.typed_count() != m2.url_info.typed_count())
+ return m1.url_info.typed_count() > m2.url_info.typed_count();
+
+ // For URLs that have each been typed once, a host (alone) is better
+ // than a page inside.
+ if (m1.url_info.typed_count() == 1) {
+ if (m1.IsHostOnly() != m2.IsHostOnly())
+ return m1.IsHostOnly();
+ }
+
+ // URLs that have been visited more often are better.
+ if (m1.url_info.visit_count() != m2.url_info.visit_count())
+ return m1.url_info.visit_count() > m2.url_info.visit_count();
+
+ // URLs that have been visited more recently are better.
+ return m1.url_info.last_visit() > m2.url_info.last_visit();
+}
+
+// static
+TermMatches ScoredHistoryMatch::FilterTermMatchesByWordStarts(
+ const TermMatches& term_matches,
+ const WordStarts& terms_to_word_starts_offsets,
+ const WordStarts& word_starts,
+ size_t start_pos,
+ size_t end_pos) {
+ // Return early if no filtering is needed.
+ if (start_pos == std::string::npos)
+ return term_matches;
+ TermMatches filtered_matches;
+ WordStarts::const_iterator next_word_starts = word_starts.begin();
+ WordStarts::const_iterator end_word_starts = word_starts.end();
+ for (const auto& term_match : term_matches) {
+ const size_t term_offset =
+ terms_to_word_starts_offsets[term_match.term_num];
+ // Advance next_word_starts until it's >= the position of the term we're
+ // considering (adjusted for where the word begins within the term).
+ while ((next_word_starts != end_word_starts) &&
+ (*next_word_starts < (term_match.offset + term_offset)))
+ ++next_word_starts;
+ // Add the match if it's before the position we start filtering at or
+ // after the position we stop filtering at (assuming we have a position
+ // to stop filtering at) or if it's at a word boundary.
+ if ((term_match.offset < start_pos) ||
+ ((end_pos != std::string::npos) && (term_match.offset >= end_pos)) ||
+ ((next_word_starts != end_word_starts) &&
+ (*next_word_starts == term_match.offset + term_offset)))
+ filtered_matches.push_back(term_match);
+ }
+ return filtered_matches;
+}
+
// Computes the topicality component of the score: how strongly the terms hit
// the "important" parts of the URL (hostname > path > query) and the title,
// with word-boundary hits worth far more than mid-word hits.  Side effects:
// filters |url_matches_| and |title_matches_| down to the matches that earn
// credit, and sets |match_in_scheme| when a term hits the protocol.  Returns
// the mean transformed per-term score, or 0 if any term earned no credit
// (which rejects the whole match).
float ScoredHistoryMatch::GetTopicalityScore(
    const int num_terms,
    const base::string16& url,
    const WordStarts& terms_to_word_starts_offsets,
    const RowWordStarts& word_starts,
    const ScoredHistoryMatchClient* client) {
  // DCHECK(raw_term_score_to_topicality_score_);
  // A vector that accumulates per-term scores.  The strongest match--a
  // match in the hostname at a word boundary--is worth 10 points.
  // Everything else is less.  In general, a match that's not at a word
  // boundary is worth about 1/4th or 1/5th of a match at the word boundary
  // in the same part of the URL/title.
  DCHECK_GT(num_terms, 0);
  std::vector<int> term_scores(num_terms, 0);
  WordStarts::const_iterator next_word_starts =
      word_starts.url_word_starts_.begin();
  WordStarts::const_iterator end_word_starts =
      word_starts.url_word_starts_.end();
  // Structural landmarks of the cleaned-up URL: start of the query string
  // and end of the scheme.
  const size_t question_mark_pos = url.find('?');
  const size_t colon_pos = url.find(':');
  // The + 3 skips the // that probably appears in the protocol
  // after the colon.  If the protocol doesn't have two slashes after
  // the colon, that's okay--all this ends up doing is starting our
  // search for the next / a few characters into the hostname.  The
  // only times this can cause problems is if we have a protocol without
  // a // after the colon and the hostname is only one or two characters.
  // This isn't worth worrying about.
  const size_t end_of_hostname_pos = (colon_pos != std::string::npos) ?
      url.find('/', colon_pos + 3) : url.find('/');
  // Start of the last dotted component of the hostname (usually the TLD).
  size_t last_part_of_hostname_pos =
      (end_of_hostname_pos != std::string::npos) ?
      url.rfind('.', end_of_hostname_pos) : url.rfind('.');
  // Loop through all URL matches and score them appropriately.
  // First, filter all matches not at a word boundary and in the path (or
  // later).
  url_matches_ = FilterTermMatchesByWordStarts(
      url_matches_, terms_to_word_starts_offsets, word_starts.url_word_starts_,
      end_of_hostname_pos,
      std::string::npos);
  if (colon_pos != std::string::npos) {
    // Also filter matches not at a word boundary and in the scheme.
    url_matches_ = FilterTermMatchesByWordStarts(
        url_matches_, terms_to_word_starts_offsets,
        word_starts.url_word_starts_, 0, colon_pos);
  }
  const bool allow_tld_matches = client->AllowTldMatches();
  const bool allow_scheme_matches = client->AllowSchemeMatches();
  for (const auto& url_match : url_matches_) {
    const size_t term_offset = terms_to_word_starts_offsets[url_match.term_num];
    // Advance next_word_starts until it's >= the position of the term we're
    // considering (adjusted for where the word begins within the term).
    while ((next_word_starts != end_word_starts) &&
           (*next_word_starts < (url_match.offset + term_offset))) {
      ++next_word_starts;
    }
    const bool at_word_boundary =
        (next_word_starts != end_word_starts) &&
        (*next_word_starts == url_match.offset + term_offset);
    if ((question_mark_pos != std::string::npos) &&
        (url_match.offset > question_mark_pos)) {
      // The match is in a CGI ?... fragment.  Non-word-boundary matches
      // here were already removed by the filtering above.
      DCHECK(at_word_boundary);
      term_scores[url_match.term_num] += 5;
    } else if ((end_of_hostname_pos != std::string::npos) &&
               (url_match.offset > end_of_hostname_pos)) {
      // The match is in the path.
      DCHECK(at_word_boundary);
      term_scores[url_match.term_num] += 8;
    } else if ((colon_pos == std::string::npos) ||
               (url_match.offset > colon_pos)) {
      // The match is in the hostname.
      if ((last_part_of_hostname_pos == std::string::npos) ||
          (url_match.offset < last_part_of_hostname_pos)) {
        // Either there are no dots in the hostname or this match isn't
        // the last dotted component.
        term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;
      } else {
        // The match is in the last part of a dotted hostname (usually this
        // is the top-level domain .com, .net, etc.).
        if (allow_tld_matches)
          term_scores[url_match.term_num] += at_word_boundary ? 10 : 0;
      }
    } else {
      // The match is in the protocol (a.k.a. scheme).
      // Matches not at a word boundary should have been filtered already.
      DCHECK(at_word_boundary);
      match_in_scheme = true;
      if (allow_scheme_matches)
        term_scores[url_match.term_num] += 10;
    }
  }
  // Now do the analogous loop over all matches in the title.
  next_word_starts = word_starts.title_word_starts_.begin();
  end_word_starts = word_starts.title_word_starts_.end();
  int word_num = 0;
  title_matches_ = FilterTermMatchesByWordStarts(
      title_matches_, terms_to_word_starts_offsets,
      word_starts.title_word_starts_, 0, std::string::npos);
  for (const auto& title_match : title_matches_) {
    const size_t term_offset =
        terms_to_word_starts_offsets[title_match.term_num];
    // Advance next_word_starts until it's >= the position of the term we're
    // considering (adjusted for where the word begins within the term).
    // |word_num| counts how many title words we've skipped past so far.
    while ((next_word_starts != end_word_starts) &&
           (*next_word_starts < (title_match.offset + term_offset))) {
      ++next_word_starts;
      ++word_num;
    }
    if (word_num >= 10) break;  // only count the first ten words
    // Filtering above removed non-word-boundary title matches, so the
    // current word start must line up with this match exactly.
    DCHECK(next_word_starts != end_word_starts);
    DCHECK_EQ(*next_word_starts, title_match.offset + term_offset)
        << "not at word boundary";
    term_scores[title_match.term_num] += 8;
  }
  // TODO(mpearson): Restore logic for penalizing out-of-order matches.
  // (Perhaps discount them by 0.8?)
  // TODO(mpearson): Consider: if the earliest match occurs late in the string,
  // should we discount it?
  // TODO(mpearson): Consider: do we want to score based on how much of the
  // input string the input covers?  (I'm leaning toward no.)

  // Compute the topicality_score as the sum of transformed term_scores.
  float topicality_score = 0;
  for (int term_score : term_scores) {
    // Drop this URL if it seems like a term didn't appear or, more precisely,
    // didn't appear in a part of the URL or title that we trust enough
    // to give it credit for.  For instance, terms that appear in the middle
    // of a CGI parameter get no credit.  Almost all the matches dropped
    // due to this test would look stupid if shown to the user.
    if (term_score == 0)
      return 0;
    topicality_score +=
        client->GetTopicalityScoreFromRawScore(term_score);
  }
  // TODO(mpearson): If there are multiple terms, consider taking the
  // geometric mean of per-term scores rather than the arithmetic mean.

  return topicality_score / num_terms;
}
+
+// static
+float ScoredHistoryMatch::GetFrequency(const base::Time& now,
+ const bool bookmarked,
+ const VisitInfoVector& visits,
+ const ScoredHistoryMatchClient* client) {
+ // Compute the weighted average |value_of_transition| over the last at
+ // most kMaxVisitsToScore visits, where each visit is weighted using
+ // GetRecencyScore() based on how many days ago it happened. Use
+ // kMaxVisitsToScore as the denominator for the average regardless of
+ // how many visits there were in order to penalize a match that has
+ // fewer visits than kMaxVisitsToScore.
+ float summed_visit_points = 0;
+ const int bookmark_value = client->BookmarkValue();
+ size_t visits_to_score = std::min(visits.size(), kMaxVisitsToScore);
+ for (size_t i = 0; i < visits_to_score; ++i) {
+ int value_of_transition =
+ (visits[i].second == ui::PAGE_TRANSITION_TYPED) ? 20 : 1;
+ if (bookmarked)
+ value_of_transition = std::max(value_of_transition, bookmark_value);
+ const float bucket_weight =
+ client->GetRecencyScore((now - visits[i].first).InDays());
+ summed_visit_points += (value_of_transition * bucket_weight);
+ }
+ return visits.size() * summed_visit_points / kMaxVisitsToScore;
+}
+
+// static
+float ScoredHistoryMatch::GetFinalRelevancyScore(float topicality_score,
+ float frequency_score) {
+ if (topicality_score == 0)
+ return 0;
+ // Here's how to interpret intermediate_score: Suppose the omnibox
+ // has one input term. Suppose we have a URL for which the omnibox
+ // input term has a single URL hostname hit at a word boundary. (This
+ // implies topicality_score = 1.0.). Then the intermediate_score for
+ // this URL will depend entirely on the frequency_score with
+ // this interpretation:
+ // - a single typed visit more than three months ago, no other visits -> 0.2
+ // - a visit every three days, no typed visits -> 0.706
+ // - a visit every day, no typed visits -> 0.916
+ // - a single typed visit yesterday, no other visits -> 2.0
+ // - a typed visit once a week -> 11.77
+ // - a typed visit every three days -> 14.12
+ // - at least ten typed visits today -> 20.0 (maximum score)
+ const float intermediate_score = topicality_score * frequency_score;
+ // The below code maps intermediate_score to the range [0, 1399].
+ // The score maxes out at 1400 (i.e., cannot beat a good inline result).
+ if (intermediate_score <= 1) {
+ // Linearly extrapolate between 0 and 1.5 so 0 has a score of 400
+ // and 1.5 has a score of 600.
+ const float slope = (600 - 400) / (1.5f - 0.0f);
+ return 400 + slope * intermediate_score;
+ }
+ if (intermediate_score <= 12.0) {
+ // Linearly extrapolate up to 12 so 12 has a score of 1300.
+ const float slope = (1300 - 600) / (12.0f - 1.5f);
+ return 600 + slope * (intermediate_score - 1.5);
+ }
+ // Linearly extrapolate so a score of 20 (or more) has a score of 1399.
+ // (Scores above 20 are possible for URLs that have multiple term hits
+ // in the URL and/or title and that are visited practically all
+ // the time using typed visits. We don't attempt to distinguish
+ // between these very good results.)
+ const float slope = (1399 - 1300) / (20.0f - 12.0f);
+ return std::min(1399.0, 1300 + slope * (intermediate_score - 12.0));
+}
+
+} // namespace history
« no previous file with comments | « components/history/core/browser/scored_history_match.h ('k') | components/history/core/browser/scored_history_match_client.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698