chrome/browser/autocomplete/scored_history_match.cc - Issue 1181113005: Componentize HistoryURLProvider/ScoredHistoryMatch.

Side by Side Diff: chrome/browser/autocomplete/scored_history_match.cc

Issue 1181113005: Componentize HistoryURLProvider/ScoredHistoryMatch. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@prepare_history_url_provider_for_c14n

Patch Set: Rebase Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "chrome/browser/autocomplete/scored_history_match.h"

6

7 #include <math.h>

8

9 #include <algorithm>

10 #include <vector>

11

12 #include "base/logging.h"

13 #include "base/numerics/safe_conversions.h"

14 #include "base/strings/string_number_conversions.h"

15 #include "base/strings/string_split.h"

16 #include "base/strings/string_util.h"

17 #include "base/strings/utf_offset_string_conversions.h"

18 #include "base/strings/utf_string_conversions.h"

19 #include "chrome/browser/autocomplete/history_url_provider.h"

20 #include "components/bookmarks/browser/bookmark_utils.h"

21 #include "components/omnibox/omnibox_field_trial.h"

22 #include "components/omnibox/url_prefix.h"

23

24 namespace {

25

26 // The number of days of recency scores to precompute.

27 const int kDaysToPrecomputeRecencyScoresFor = 366;

28

29 // The number of raw term score buckets use; raw term scores greater this are

30 // capped at the score of the largest bucket.

31 const int kMaxRawTermScore = 30;

32

33 // Pre-computed information to speed up calculating recency scores.

34 // \|days_ago_to_recency_score\| is a simple array mapping how long ago a page was

35 // visited (in days) to the recency score we should assign it. This allows easy

36 // lookups of scores without requiring math. This is initialized by

37 // InitDaysAgoToRecencyScoreArray called by

38 // ScoredHistoryMatch::Init().

39 float days_ago_to_recency_score[kDaysToPrecomputeRecencyScoresFor];

40

41 // Pre-computed information to speed up calculating topicality scores.

42 // \|raw_term_score_to_topicality_score\| is a simple array mapping how raw terms

43 // scores (a weighted sum of the number of hits for the term, weighted by how

44 // important the hit is: hostname, path, etc.) to the topicality score we should

45 // assign it. This allows easy lookups of scores without requiring math. This

46 // is initialized by InitRawTermScoreToTopicalityScoreArray() called from

47 // ScoredHistoryMatch::Init().

48 float raw_term_score_to_topicality_score[kMaxRawTermScore];

49

50 // Precalculates raw_term_score_to_topicality_score, used in

51 // GetTopicalityScore().

52 void InitRawTermScoreToTopicalityScoreArray() {

53 for (int term_score = 0; term_score < kMaxRawTermScore; ++term_score) {

54 float topicality_score;

55 if (term_score < 10) {

56 // If the term scores less than 10 points (no full-credit hit, or

57 // no combination of hits that score that well), then the topicality

58 // score is linear in the term score.

59 topicality_score = 0.1 * term_score;

60 } else {

61 // For term scores of at least ten points, pass them through a log

62 // function so a score of 10 points gets a 1.0 (to meet up exactly

63 // with the linear component) and increases logarithmically until

64 // maxing out at 30 points, with computes to a score around 2.1.

65 topicality_score = (1.0 + 2.25 * log10(0.1 * term_score));

66 }

67 raw_term_score_to_topicality_score[term_score] = topicality_score;

68 }

69 }

70

71 // Pre-calculates days_ago_to_recency_score, used in GetRecencyScore().

72 void InitDaysAgoToRecencyScoreArray() {

73 for (int days_ago = 0; days_ago < kDaysToPrecomputeRecencyScoresFor;

74 days_ago++) {

75 int unnormalized_recency_score;

76 if (days_ago <= 4) {

77 unnormalized_recency_score = 100;

78 } else if (days_ago <= 14) {

79 // Linearly extrapolate between 4 and 14 days so 14 days has a score

80 // of 70.

81 unnormalized_recency_score = 70 + (14 - days_ago) * (100 - 70) / (14 - 4);

82 } else if (days_ago <= 31) {

83 // Linearly extrapolate between 14 and 31 days so 31 days has a score

84 // of 50.

85 unnormalized_recency_score = 50 + (31 - days_ago) * (70 - 50) / (31 - 14);

86 } else if (days_ago <= 90) {

87 // Linearly extrapolate between 30 and 90 days so 90 days has a score

88 // of 30.

89 unnormalized_recency_score = 30 + (90 - days_ago) * (50 - 30) / (90 - 30);

90 } else {

91 // Linearly extrapolate between 90 and 365 days so 365 days has a score

92 // of 10.

93 unnormalized_recency_score =

94 10 + (365 - days_ago) * (20 - 10) / (365 - 90);

95 }

96 days_ago_to_recency_score[days_ago] = unnormalized_recency_score / 100.0;

97 if (days_ago > 0) {

98 DCHECK_LE(days_ago_to_recency_score[days_ago],

99 days_ago_to_recency_score[days_ago - 1]);

100 }

101 }

102 }

103

104 } // namespace

105

106 // static

107 const size_t ScoredHistoryMatch::kMaxVisitsToScore = 10;

108 bool ScoredHistoryMatch::also_do_hup_like_scoring_ = false;

109 int ScoredHistoryMatch::bookmark_value_ = 1;

110 bool ScoredHistoryMatch::fix_frequency_bugs_ = false;

111 bool ScoredHistoryMatch::allow_tld_matches_ = false;

112 bool ScoredHistoryMatch::allow_scheme_matches_ = false;

113 size_t ScoredHistoryMatch::num_title_words_to_allow_ = 10u;

114 bool ScoredHistoryMatch::hqp_experimental_scoring_enabled_ = false;

115 float ScoredHistoryMatch::topicality_threshold_ = -1;

116 std::vector<ScoredHistoryMatch::ScoreMaxRelevance>*

117 ScoredHistoryMatch::hqp_relevance_buckets_ = nullptr;

118

119 ScoredHistoryMatch::ScoredHistoryMatch()

120 : ScoredHistoryMatch(history::URLRow(),

121 VisitInfoVector(),

122 std::string(),

123 base::string16(),

124 String16Vector(),

125 WordStarts(),

126 RowWordStarts(),

127 false,

128 base::Time::Max()) {

129 }

130

131 ScoredHistoryMatch::ScoredHistoryMatch(

132 const history::URLRow& row,

133 const VisitInfoVector& visits,

134 const std::string& languages,

135 const base::string16& lower_string,

136 const String16Vector& terms_vector,

137 const WordStarts& terms_to_word_starts_offsets,

138 const RowWordStarts& word_starts,

139 bool is_url_bookmarked,

140 base::Time now)

141 : HistoryMatch(row, 0, false, false), raw_score(0), can_inline(false) {

142 // NOTE: Call Init() before doing any validity checking to ensure that the

143 // class is always initialized after an instance has been constructed. In

144 // particular, this ensures that the class is initialized after an instance

145 // has been constructed via the no-args constructor.

146 ScoredHistoryMatch::Init();

147

148 GURL gurl = row.url();

149 if (!gurl.is_valid())

150 return;

151

152 // Figure out where each search term appears in the URL and/or page title

153 // so that we can score as well as provide autocomplete highlighting.

154 base::OffsetAdjuster::Adjustments adjustments;

155 base::string16 url =

156 bookmarks::CleanUpUrlForMatching(gurl, languages, &adjustments);

157 base::string16 title = bookmarks::CleanUpTitleForMatching(row.title());

158 int term_num = 0;

159 for (const auto& term : terms_vector) {

160 TermMatches url_term_matches = MatchTermInString(term, url, term_num);

161 TermMatches title_term_matches = MatchTermInString(term, title, term_num);

162 if (url_term_matches.empty() && title_term_matches.empty()) {

163 // A term was not found in either URL or title - reject.

164 return;

165 }

166 url_matches.insert(url_matches.end(), url_term_matches.begin(),

167 url_term_matches.end());

168 title_matches.insert(title_matches.end(), title_term_matches.begin(),

169 title_term_matches.end());

170 ++term_num;

171 }

172

173 // Sort matches by offset and eliminate any which overlap.

174 // TODO(mpearson): Investigate whether this has any meaningful

175 // effect on scoring. (It's necessary at some point: removing

176 // overlaps and sorting is needed to decide what to highlight in the

177 // suggestion string. But this sort and de-overlap doesn't have to

178 // be done before scoring.)

179 url_matches = SortAndDeoverlapMatches(url_matches);

180 title_matches = SortAndDeoverlapMatches(title_matches);

181

182 // We can inline autocomplete a match if:

183 // 1) there is only one search term

184 // 2) AND the match begins immediately after one of the prefixes in

185 // URLPrefix such as http://www and https:// (note that one of these

186 // is the empty prefix, for cases where the user has typed the scheme)

187 // 3) AND the search string does not end in whitespace (making it look to

188 // the IMUI as though there is a single search term when actually there

189 // is a second, empty term).

190 // \|best_inlineable_prefix\| stores the inlineable prefix computed in

191 // clause (2) or NULL if no such prefix exists. (The URL is not inlineable.)

192 // Note that using the best prefix here means that when multiple

193 // prefixes match, we'll choose to inline following the longest one.

194 // For a URL like "http://www.washingtonmutual.com", this means

195 // typing "w" will inline "ashington..." instead of "ww.washington...".

196 if (!url_matches.empty() && (terms_vector.size() == 1) &&

197 !IsWhitespace(*lower_string.rbegin())) {

198 const base::string16 gurl_spec = base::UTF8ToUTF16(gurl.spec());

199 const URLPrefix* best_inlineable_prefix =

200 URLPrefix::BestURLPrefix(gurl_spec, terms_vector[0]);

201 if (best_inlineable_prefix) {

202 // When inline autocompleting this match, we're going to use the part of

203 // the URL following the end of the matching text. However, it's possible

204 // that FormatUrl(), when formatting this suggestion for display,

205 // mucks with the text. We need to ensure that the text we're thinking

206 // about highlighting isn't in the middle of a mucked sequence. In

207 // particular, for the omnibox input of "x" or "xn", we may get a match

208 // in a punycoded domain name such as http://www.xn--blahblah.com/.

209 // When FormatUrl() processes the xn--blahblah part of the hostname, it'll

210 // transform the whole thing into a series of unicode characters. It's

211 // impossible to give the user an inline autocompletion of the text

212 // following "x" or "xn" in this case because those characters no longer

213 // exist in the displayed URL string.

214 size_t offset =

215 best_inlineable_prefix->prefix.length() + terms_vector[0].length();

216 base::OffsetAdjuster::UnadjustOffset(adjustments, &offset);

217 if (offset != base::string16::npos) {

218 // Initialize innermost_match.

219 // The idea here is that matches that occur in the scheme or

220 // "www." are worse than matches which don't. For the URLs

221 // "http://www.google.com" and "http://wellsfargo.com", we want

222 // the omnibox input "w" to cause the latter URL to rank higher

223 // than the former. Note that this is not the same as checking

224 // whether one match's inlinable prefix has more components than

225 // the other match's, since in this example, both matches would

226 // have an inlinable prefix of "http://", which is one component.

227 //

228 // Instead, we look for the overall best (i.e., most components)

229 // prefix of the current URL, and then check whether the inlinable

230 // prefix has that many components. If it does, this is an

231 // "innermost" match, and should be boosted. In the example

232 // above, the best prefixes for the two URLs have two and one

233 // components respectively, while the inlinable prefixes each

234 // have one component; this means the first match is not innermost

235 // and the second match is innermost, resulting in us boosting the

236 // second match.

237 //

238 // Now, the code that implements this.

239 // The deepest prefix for this URL regardless of where the match is.

240 const URLPrefix* best_prefix =

241 URLPrefix::BestURLPrefix(gurl_spec, base::string16());

242 DCHECK(best_prefix);

243 // If the URL is inlineable, we must have a match. Note the prefix that

244 // makes it inlineable may be empty.

245 can_inline = true;

246 innermost_match = (best_inlineable_prefix->num_components ==

247 best_prefix->num_components);

248 }

249 }

250 }

251

252 const float topicality_score = GetTopicalityScore(

253 terms_vector.size(), url, terms_to_word_starts_offsets, word_starts);

254 const float frequency_score = GetFrequency(now, is_url_bookmarked, visits);

255 raw_score = base::saturated_cast<int>(GetFinalRelevancyScore(

256 topicality_score, frequency_score, *hqp_relevance_buckets_));

257

258 if (also_do_hup_like_scoring_ && can_inline) {

259 // HistoryURL-provider-like scoring gives any match that is

260 // capable of being inlined a certain minimum score. Some of these

261 // are given a higher score that lets them be shown in inline.

262 // This test here derives from the test in

263 // HistoryURLProvider::PromoteMatchForInlineAutocomplete().

264 const bool promote_to_inline =

265 (row.typed_count() > 1) \|\| (IsHostOnly() && (row.typed_count() == 1));

266 int hup_like_score =

267 promote_to_inline

268 ? HistoryURLProvider::kScoreForBestInlineableResult

269 : HistoryURLProvider::kBaseScoreForNonInlineableResult;

270

271 // Also, if the user types the hostname of a host with a typed

272 // visit, then everything from that host get given inlineable scores

273 // (because the URL-that-you-typed will go first and everything

274 // else will be assigned one minus the previous score, as coded

275 // at the end of HistoryURLProvider::DoAutocomplete().

276 if (base::UTF8ToUTF16(gurl.host()) == terms_vector[0])

277 hup_like_score = HistoryURLProvider::kScoreForBestInlineableResult;

278

279 // HistoryURLProvider has the function PromoteOrCreateShorterSuggestion()

280 // that's meant to promote prefixes of the best match (if they've

281 // been visited enough related to the best match) or

282 // create/promote host-only suggestions (even if they've never

283 // been typed). The code is complicated and we don't try to

284 // duplicate the logic here. Instead, we handle a simple case: in

285 // low-typed-count ranges, give host-only matches (i.e.,

286 // http://www.foo.com/ vs. http://www.foo.com/bar.html) a boost so

287 // that the host-only match outscores all the other matches that

288 // would normally have the same base score. This behavior is not

289 // identical to what happens in HistoryURLProvider even in these

290 // low typed count ranges--sometimes it will create/promote when

291 // this test does not (indeed, we cannot create matches like HUP

292 // can) and vice versa--but the underlying philosophy is similar.

293 if (!promote_to_inline && IsHostOnly())

294 hup_like_score++;

295

296 // All the other logic to goes into hup-like-scoring happens in

297 // the tie-breaker case of MatchScoreGreater().

298

299 // Incorporate hup_like_score into raw_score.

300 raw_score = std::max(raw_score, hup_like_score);

301 }

302

303 // Now that we're done processing this entry, correct the offsets of the

304 // matches in \|url_matches\| so they point to offsets in the original URL

305 // spec, not the cleaned-up URL string that we used for matching.

306 std::vector<size_t> offsets = OffsetsFromTermMatches(url_matches);

307 base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);

308 url_matches = ReplaceOffsetsInTermMatches(url_matches, offsets);

309 }

310

311 ScoredHistoryMatch::~ScoredHistoryMatch() {

312 }

313

314 // Comparison function for sorting ScoredMatches by their scores with

315 // intelligent tie-breaking.

316 bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1,

317 const ScoredHistoryMatch& m2) {

318 if (m1.raw_score != m2.raw_score)

319 return m1.raw_score > m2.raw_score;

320

321 // This tie-breaking logic is inspired by / largely copied from the

322 // ordering logic in history_url_provider.cc CompareHistoryMatch().

323

324 // A URL that has been typed at all is better than one that has never been

325 // typed. (Note "!"s on each side.)

326 if (!m1.url_info.typed_count() != !m2.url_info.typed_count())

327 return m1.url_info.typed_count() > m2.url_info.typed_count();

328

329 // Innermost matches (matches after any scheme or "www.") are better than

330 // non-innermost matches.

331 if (m1.innermost_match != m2.innermost_match)

332 return m1.innermost_match;

333

334 // URLs that have been typed more often are better.

335 if (m1.url_info.typed_count() != m2.url_info.typed_count())

336 return m1.url_info.typed_count() > m2.url_info.typed_count();

337

338 // For URLs that have each been typed once, a host (alone) is better

339 // than a page inside.

340 if (m1.url_info.typed_count() == 1) {

341 if (m1.IsHostOnly() != m2.IsHostOnly())

342 return m1.IsHostOnly();

343 }

344

345 // URLs that have been visited more often are better.

346 if (m1.url_info.visit_count() != m2.url_info.visit_count())

347 return m1.url_info.visit_count() > m2.url_info.visit_count();

348

349 // URLs that have been visited more recently are better.

350 return m1.url_info.last_visit() > m2.url_info.last_visit();

351 }

352

353 // static

354 TermMatches ScoredHistoryMatch::FilterTermMatchesByWordStarts(

355 const TermMatches& term_matches,

356 const WordStarts& terms_to_word_starts_offsets,

357 const WordStarts& word_starts,

358 size_t start_pos,

359 size_t end_pos) {

360 // Return early if no filtering is needed.

361 if (start_pos == std::string::npos)

362 return term_matches;

363 TermMatches filtered_matches;

364 WordStarts::const_iterator next_word_starts = word_starts.begin();

365 WordStarts::const_iterator end_word_starts = word_starts.end();

366 for (const auto& term_match : term_matches) {

367 const size_t term_offset =

368 terms_to_word_starts_offsets[term_match.term_num];

369 // Advance next_word_starts until it's >= the position of the term we're

370 // considering (adjusted for where the word begins within the term).

371 while ((next_word_starts != end_word_starts) &&

372 (*next_word_starts < (term_match.offset + term_offset)))

373 ++next_word_starts;

374 // Add the match if it's before the position we start filtering at or

375 // after the position we stop filtering at (assuming we have a position

376 // to stop filtering at) or if it's at a word boundary.

377 if ((term_match.offset < start_pos) \|\|

378 ((end_pos != std::string::npos) && (term_match.offset >= end_pos)) \|\|

379 ((next_word_starts != end_word_starts) &&

380 (*next_word_starts == term_match.offset + term_offset)))

381 filtered_matches.push_back(term_match);

382 }

383 return filtered_matches;

384 }

385

386 // static

387 void ScoredHistoryMatch::Init() {

388 static bool initialized = false;

389

390 if (initialized)

391 return;

392

393 initialized = true;

394 also_do_hup_like_scoring_ = OmniboxFieldTrial::HQPAlsoDoHUPLikeScoring();

395 bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue();

396 fix_frequency_bugs_ = OmniboxFieldTrial::HQPFixFrequencyScoringBugs();

397 allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue();

398 allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue();

399 num_title_words_to_allow_ = OmniboxFieldTrial::HQPNumTitleWordsToAllow();

400

401 InitRawTermScoreToTopicalityScoreArray();

402 InitDaysAgoToRecencyScoreArray();

403 InitHQPExperimentalParams();

404 }

405

406 float ScoredHistoryMatch::GetTopicalityScore(

407 const int num_terms,

408 const base::string16& url,

409 const WordStarts& terms_to_word_starts_offsets,

410 const RowWordStarts& word_starts) {

411 // A vector that accumulates per-term scores. The strongest match--a

412 // match in the hostname at a word boundary--is worth 10 points.

413 // Everything else is less. In general, a match that's not at a word

414 // boundary is worth about 1/4th or 1/5th of a match at the word boundary

415 // in the same part of the URL/title.

416 DCHECK_GT(num_terms, 0);

417 std::vector<int> term_scores(num_terms, 0);

418 WordStarts::const_iterator next_word_starts =

419 word_starts.url_word_starts_.begin();

420 WordStarts::const_iterator end_word_starts =

421 word_starts.url_word_starts_.end();

422 const size_t question_mark_pos = url.find('?');

423 const size_t colon_pos = url.find(':');

424 // The + 3 skips the // that probably appears in the protocol

425 // after the colon. If the protocol doesn't have two slashes after

426 // the colon, that's okay--all this ends up doing is starting our

427 // search for the next / a few characters into the hostname. The

428 // only times this can cause problems is if we have a protocol without

429 // a // after the colon and the hostname is only one or two characters.

430 // This isn't worth worrying about.

431 const size_t end_of_hostname_pos = (colon_pos != std::string::npos)

432 ? url.find('/', colon_pos + 3)

433 : url.find('/');

434 size_t last_part_of_hostname_pos = (end_of_hostname_pos != std::string::npos)

435 ? url.rfind('.', end_of_hostname_pos)

436 : url.rfind('.');

437 // Loop through all URL matches and score them appropriately.

438 // First, filter all matches not at a word boundary and in the path (or

439 // later).

440 url_matches = FilterTermMatchesByWordStarts(

441 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,

442 end_of_hostname_pos, std::string::npos);

443 if (colon_pos != std::string::npos) {

444 // Also filter matches not at a word boundary and in the scheme.

445 url_matches = FilterTermMatchesByWordStarts(

446 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,

447 0, colon_pos);

448 }

449 for (const auto& url_match : url_matches) {

450 const size_t term_offset = terms_to_word_starts_offsets[url_match.term_num];

451 // Advance next_word_starts until it's >= the position of the term we're

452 // considering (adjusted for where the word begins within the term).

453 while ((next_word_starts != end_word_starts) &&

454 (*next_word_starts < (url_match.offset + term_offset))) {

455 ++next_word_starts;

456 }

457 const bool at_word_boundary =

458 (next_word_starts != end_word_starts) &&

459 (*next_word_starts == url_match.offset + term_offset);

460 if ((question_mark_pos != std::string::npos) &&

461 (url_match.offset > question_mark_pos)) {

462 // The match is in a CGI ?... fragment.

463 DCHECK(at_word_boundary);

464 term_scores[url_match.term_num] += 5;

465 } else if ((end_of_hostname_pos != std::string::npos) &&

466 (url_match.offset > end_of_hostname_pos)) {

467 // The match is in the path.

468 DCHECK(at_word_boundary);

469 term_scores[url_match.term_num] += 8;

470 } else if ((colon_pos == std::string::npos) \|\|

471 (url_match.offset > colon_pos)) {

472 // The match is in the hostname.

473 if ((last_part_of_hostname_pos == std::string::npos) \|\|

474 (url_match.offset < last_part_of_hostname_pos)) {

475 // Either there are no dots in the hostname or this match isn't

476 // the last dotted component.

477 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;

478 } else {

479 // The match is in the last part of a dotted hostname (usually this

480 // is the top-level domain .com, .net, etc.).

481 if (allow_tld_matches_)

482 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0;

483 }

484 } else {

485 // The match is in the protocol (a.k.a. scheme).

486 // Matches not at a word boundary should have been filtered already.

487 DCHECK(at_word_boundary);

488 match_in_scheme = true;

489 if (allow_scheme_matches_)

490 term_scores[url_match.term_num] += 10;

491 }

492 }

493 // Now do the analogous loop over all matches in the title.

494 next_word_starts = word_starts.title_word_starts_.begin();

495 end_word_starts = word_starts.title_word_starts_.end();

496 size_t word_num = 0;

497 title_matches = FilterTermMatchesByWordStarts(

498 title_matches, terms_to_word_starts_offsets,

499 word_starts.title_word_starts_, 0, std::string::npos);

500 for (const auto& title_match : title_matches) {

501 const size_t term_offset =

502 terms_to_word_starts_offsets[title_match.term_num];

503 // Advance next_word_starts until it's >= the position of the term we're

504 // considering (adjusted for where the word begins within the term).

505 while ((next_word_starts != end_word_starts) &&

506 (*next_word_starts < (title_match.offset + term_offset))) {

507 ++next_word_starts;

508 ++word_num;

509 }

510 if (word_num >= num_title_words_to_allow_)

511 break; // only count the first ten words

512 DCHECK(next_word_starts != end_word_starts);

513 DCHECK_EQ(*next_word_starts, title_match.offset + term_offset)

514 << "not at word boundary";

515 term_scores[title_match.term_num] += 8;

516 }

517 // TODO(mpearson): Restore logic for penalizing out-of-order matches.

518 // (Perhaps discount them by 0.8?)

519 // TODO(mpearson): Consider: if the earliest match occurs late in the string,

520 // should we discount it?

521 // TODO(mpearson): Consider: do we want to score based on how much of the

522 // input string the input covers? (I'm leaning toward no.)

523

524 // Compute the topicality_score as the sum of transformed term_scores.

525 float topicality_score = 0;

526 for (int term_score : term_scores) {

527 // Drop this URL if it seems like a term didn't appear or, more precisely,

528 // didn't appear in a part of the URL or title that we trust enough

529 // to give it credit for. For instance, terms that appear in the middle

530 // of a CGI parameter get no credit. Almost all the matches dropped

531 // due to this test would look stupid if shown to the user.

532 if (term_score == 0)

533 return 0;

534 topicality_score += raw_term_score_to_topicality_score[std::min(

535 term_score, kMaxRawTermScore - 1)];

536 }

537 // TODO(mpearson): If there are multiple terms, consider taking the

538 // geometric mean of per-term scores rather than the arithmetic mean.

539

540 const float final_topicality_score = topicality_score / num_terms;

541

542 // Demote the URL if the topicality score is less than threshold.

543 if (hqp_experimental_scoring_enabled_ &&

544 (final_topicality_score < topicality_threshold_)) {

545 return 0.0;

546 }

547

548 return final_topicality_score;

549 }

550

551 float ScoredHistoryMatch::GetRecencyScore(int last_visit_days_ago) const {

552 // Lookup the score in days_ago_to_recency_score, treating

553 // everything older than what we've precomputed as the oldest thing

554 // we've precomputed. The std::max is to protect against corruption

555 // in the database (in case last_visit_days_ago is negative).

556 return days_ago_to_recency_score[std::max(

557 std::min(last_visit_days_ago, kDaysToPrecomputeRecencyScoresFor - 1), 0)];

558 }

559

560 float ScoredHistoryMatch::GetFrequency(const base::Time& now,

561 const bool bookmarked,

562 const VisitInfoVector& visits) const {

563 // Compute the weighted average \|value_of_transition\| over the last at

564 // most kMaxVisitsToScore visits, where each visit is weighted using

565 // GetRecencyScore() based on how many days ago it happened. Use

566 // kMaxVisitsToScore as the denominator for the average regardless of

567 // how many visits there were in order to penalize a match that has

568 // fewer visits than kMaxVisitsToScore.

569 float summed_visit_points = 0;

570 const size_t max_visit_to_score =

571 std::min(visits.size(), ScoredHistoryMatch::kMaxVisitsToScore);

572 for (size_t i = 0; i < max_visit_to_score; ++i) {

573 const ui::PageTransition page_transition = fix_frequency_bugs_ ?

574 ui::PageTransitionStripQualifier(visits[i].second) : visits[i].second;

575 int value_of_transition =

576 (page_transition == ui::PAGE_TRANSITION_TYPED) ? 20 : 1;

577 if (bookmarked)

578 value_of_transition = std::max(value_of_transition, bookmark_value_);

579 const float bucket_weight =

580 GetRecencyScore((now - visits[i].first).InDays());

581 summed_visit_points += (value_of_transition * bucket_weight);

582 }

583 if (fix_frequency_bugs_)

584 return summed_visit_points / ScoredHistoryMatch::kMaxVisitsToScore;

585 return visits.size() * summed_visit_points /

586 ScoredHistoryMatch::kMaxVisitsToScore;

587 }

588

589 // static

590 float ScoredHistoryMatch::GetFinalRelevancyScore(

591 float topicality_score,

592 float frequency_score,

593 const std::vector<ScoreMaxRelevance>& hqp_relevance_buckets) {

594 DCHECK(hqp_relevance_buckets.size() > 0);

595 DCHECK_EQ(hqp_relevance_buckets[0].first, 0.0);

596

597 if (topicality_score == 0)

598 return 0;

599 // Here's how to interpret intermediate_score: Suppose the omnibox

600 // has one input term. Suppose we have a URL for which the omnibox

601 // input term has a single URL hostname hit at a word boundary. (This

602 // implies topicality_score = 1.0.). Then the intermediate_score for

603 // this URL will depend entirely on the frequency_score with

604 // this interpretation:

605 // - a single typed visit more than three months ago, no other visits -> 0.2

606 // - a visit every three days, no typed visits -> 0.706

607 // - a visit every day, no typed visits -> 0.916

608 // - a single typed visit yesterday, no other visits -> 2.0

609 // - a typed visit once a week -> 11.77

610 // - a typed visit every three days -> 14.12

611 // - at least ten typed visits today -> 20.0 (maximum score)

612 //

613 // The below code maps intermediate_score to the range [0, 1399].

614 // For example:

615 // HQP default scoring buckets: "0.0:400,1.5:600,12.0:1300,20.0:1399"

616 // We will linearly interpolate the scores between:

617 // 0 to 1.5 --> 400 to 600

618 // 1.5 to 12.0 --> 600 to 1300

619 // 12.0 to 20.0 --> 1300 to 1399

620 // >= 20.0 --> 1399

621 //

622 // The score maxes out at 1399 (i.e., cannot beat a good inlineable result

623 // from HistoryURL provider).

624 const float intermediate_score = topicality_score * frequency_score;

625

626 // Find the threshold where intermediate score is greater than bucket.

627 size_t i = 1;

628 for (; i < hqp_relevance_buckets.size(); ++i) {

629 const ScoreMaxRelevance& hqp_bucket = hqp_relevance_buckets[i];

630 if (intermediate_score >= hqp_bucket.first) {

631 continue;

632 }

633 const ScoreMaxRelevance& previous_bucket = hqp_relevance_buckets[i - 1];

634 const float slope = ((hqp_bucket.second - previous_bucket.second) /

635 (hqp_bucket.first - previous_bucket.first));

636 return (previous_bucket.second +

637 (slope * (intermediate_score - previous_bucket.first)));

638 }

639 // It will reach this stage when the score is > highest bucket score.

640 // Return the highest bucket score.

641 return hqp_relevance_buckets[i - 1].second;

642 }

643

644 // static

645 void ScoredHistoryMatch::InitHQPExperimentalParams() {

646 // These are default HQP relevance scoring buckets.

647 // See GetFinalRelevancyScore() for details.

648 std::string hqp_relevance_buckets_str = "0.0:400,1.5:600,12.0:1300,20.0:1399";

649

650 // Fetch the experiment params if they are any.

651 hqp_experimental_scoring_enabled_ =

652 OmniboxFieldTrial::HQPExperimentalScoringEnabled();

653

654 if (hqp_experimental_scoring_enabled_) {

655 // Add the topicality threshold from experiment params.

656 float hqp_experimental_topicality_threhold =

657 OmniboxFieldTrial::HQPExperimentalTopicalityThreshold();

658 topicality_threshold_ = hqp_experimental_topicality_threhold;

659

660 // Add the HQP experimental scoring buckets.

661 std::string hqp_experimental_scoring_buckets =

662 OmniboxFieldTrial::HQPExperimentalScoringBuckets();

663 if (!hqp_experimental_scoring_buckets.empty())

664 hqp_relevance_buckets_str = hqp_experimental_scoring_buckets;

665 }

666

667 // Parse the hqp_relevance_buckets_str string once and store them in vector

668 // which is easy to access.

669 hqp_relevance_buckets_ =

670 new std::vector<ScoredHistoryMatch::ScoreMaxRelevance>();

671

672 bool is_valid_bucket_str = GetHQPBucketsFromString(hqp_relevance_buckets_str,

673 hqp_relevance_buckets_);

674 DCHECK(is_valid_bucket_str);

675 }

676

677 // static

678 bool ScoredHistoryMatch::GetHQPBucketsFromString(

679 const std::string& buckets_str,

680 std::vector<ScoreMaxRelevance>* hqp_buckets) {

681 DCHECK(hqp_buckets != NULL);

682 DCHECK(!buckets_str.empty());

683

684 base::StringPairs kv_pairs;

685 if (base::SplitStringIntoKeyValuePairs(buckets_str, ':', ',', &kv_pairs)) {

686 for (base::StringPairs::const_iterator it = kv_pairs.begin();

687 it != kv_pairs.end(); ++it) {

688 ScoreMaxRelevance bucket;

689 bool is_valid_intermediate_score =

690 base::StringToDouble(it->first, &bucket.first);

691 DCHECK(is_valid_intermediate_score);

692 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);

693 DCHECK(is_valid_hqp_score);

694 hqp_buckets->push_back(bucket);

695 }

696 return true;

697 }

698 return false;

699 }

OLD	NEW

« no previous file with comments | « chrome/browser/autocomplete/scored_history_match.h ('k') | chrome/browser/autocomplete/scored_history_match_unittest.cc » ('j') | no next file with comments »