Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(39)

Side by Side Diff: components/omnibox/browser/scored_history_match.cc

Issue 2421373003: Omnibox: Improve HQP Scoring for Terms that Start with Punctuation (Closed)
Patch Set: restore dcheck Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/omnibox/browser/scored_history_match.h" 5 #include "components/omnibox/browser/scored_history_match.h"
6 6
7 #include <math.h> 7 #include <math.h>
8 8
9 #include <algorithm> 9 #include <algorithm>
10 #include <vector> 10 #include <vector>
(...skipping 455 matching lines...) Expand 10 before | Expand all | Expand 10 after
466 url_matches = FilterTermMatchesByWordStarts( 466 url_matches = FilterTermMatchesByWordStarts(
467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, 467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
468 end_of_hostname_pos, std::string::npos); 468 end_of_hostname_pos, std::string::npos);
469 if (colon_pos != std::string::npos) { 469 if (colon_pos != std::string::npos) {
470 // Also filter matches not at a word boundary and in the scheme. 470 // Also filter matches not at a word boundary and in the scheme.
471 url_matches = FilterTermMatchesByWordStarts( 471 url_matches = FilterTermMatchesByWordStarts(
472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, 472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
473 0, colon_pos); 473 0, colon_pos);
474 } 474 }
475 for (const auto& url_match : url_matches) { 475 for (const auto& url_match : url_matches) {
476 const size_t term_offset = terms_to_word_starts_offsets[url_match.term_num]; 476 // Calculate the offset in the URL string where the meaningful (word) part
477 // of the term starts. This takes into account times when a term starts
478 // with punctuation such as "/foo".
479 const size_t term_word_offset =
480 url_match.offset + terms_to_word_starts_offsets[url_match.term_num];
477 // Advance next_word_starts until it's >= the position of the term we're 481 // Advance next_word_starts until it's >= the position of the term we're
478 // considering (adjusted for where the word begins within the term). 482 // considering (adjusted for where the word begins within the term).
479 while ((next_word_starts != end_word_starts) && 483 while ((next_word_starts != end_word_starts) &&
480 (*next_word_starts < (url_match.offset + term_offset))) { 484 (*next_word_starts < term_word_offset)) {
481 ++next_word_starts; 485 ++next_word_starts;
482 } 486 }
483 const bool at_word_boundary = 487 const bool at_word_boundary =
484 (next_word_starts != end_word_starts) && 488 (next_word_starts != end_word_starts) &&
485 (*next_word_starts == url_match.offset + term_offset); 489 (*next_word_starts == term_word_offset);
486 if ((question_mark_pos != std::string::npos) && 490 if ((question_mark_pos != std::string::npos) &&
487 (url_match.offset > question_mark_pos)) { 491 (term_word_offset >= question_mark_pos)) {
488 // The match is in a CGI ?... fragment. 492 // The match is in a CGI ?... fragment.
489 DCHECK(at_word_boundary); 493 DCHECK(at_word_boundary);
490 term_scores[url_match.term_num] += 5; 494 term_scores[url_match.term_num] += 5;
491 } else if ((end_of_hostname_pos != std::string::npos) && 495 } else if ((end_of_hostname_pos != std::string::npos) &&
492 (url_match.offset > end_of_hostname_pos)) { 496 (term_word_offset >= end_of_hostname_pos)) {
493 // The match is in the path. 497 // The match is in the path.
494 DCHECK(at_word_boundary); 498 DCHECK(at_word_boundary);
495 term_scores[url_match.term_num] += 8; 499 term_scores[url_match.term_num] += 8;
496 } else if ((colon_pos == std::string::npos) || 500 } else if ((colon_pos == std::string::npos) ||
497 (url_match.offset > colon_pos)) { 501 (term_word_offset >= colon_pos)) {
498 // The match is in the hostname. 502 // The match is in the hostname.
499 if ((last_part_of_hostname_pos == std::string::npos) || 503 if ((last_part_of_hostname_pos == std::string::npos) ||
500 (url_match.offset < last_part_of_hostname_pos)) { 504 (term_word_offset < last_part_of_hostname_pos)) {
501 // Either there are no dots in the hostname or this match isn't 505 // Either there are no dots in the hostname or this match isn't
502 // the last dotted component. 506 // the last dotted component.
503 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2; 507 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;
504 } else { 508 } else {
505 // The match is in the last part of a dotted hostname (usually this 509 // The match is in the last part of a dotted hostname (usually this
506 // is the top-level domain .com, .net, etc.). 510 // is the top-level domain .com, .net, etc.).
507 if (allow_tld_matches_) 511 if (allow_tld_matches_)
508 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0; 512 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0;
509 } 513 }
510 } else { 514 } else {
511 // The match is in the protocol (a.k.a. scheme). 515 // The match is in the protocol (a.k.a. scheme).
512 // Matches not at a word boundary should have been filtered already. 516 // Matches not at a word boundary should have been filtered already.
513 DCHECK(at_word_boundary); 517 DCHECK(at_word_boundary);
514 match_in_scheme = true; 518 match_in_scheme = true;
515 if (allow_scheme_matches_) 519 if (allow_scheme_matches_)
516 term_scores[url_match.term_num] += 10; 520 term_scores[url_match.term_num] += 10;
517 } 521 }
518 } 522 }
519 // Now do the analogous loop over all matches in the title. 523 // Now do the analogous loop over all matches in the title.
520 next_word_starts = word_starts.title_word_starts_.begin(); 524 next_word_starts = word_starts.title_word_starts_.begin();
521 end_word_starts = word_starts.title_word_starts_.end(); 525 end_word_starts = word_starts.title_word_starts_.end();
522 size_t word_num = 0; 526 size_t word_num = 0;
523 title_matches = FilterTermMatchesByWordStarts( 527 title_matches = FilterTermMatchesByWordStarts(
524 title_matches, terms_to_word_starts_offsets, 528 title_matches, terms_to_word_starts_offsets,
525 word_starts.title_word_starts_, 0, std::string::npos); 529 word_starts.title_word_starts_, 0, std::string::npos);
526 for (const auto& title_match : title_matches) { 530 for (const auto& title_match : title_matches) {
527 const size_t term_offset = 531 // Calculate the offset in the title string where the meaningful (word) part
528 terms_to_word_starts_offsets[title_match.term_num]; 532 // of the term starts. This takes into account times when a term starts
533 // with punctuation such as "/foo".
534 const size_t term_word_offset =
535 title_match.offset + terms_to_word_starts_offsets[title_match.term_num];
529 // Advance next_word_starts until it's >= the position of the term we're 536 // Advance next_word_starts until it's >= the position of the term we're
530 // considering (adjusted for where the word begins within the term). 537 // considering (adjusted for where the word begins within the term).
531 while ((next_word_starts != end_word_starts) && 538 while ((next_word_starts != end_word_starts) &&
532 (*next_word_starts < (title_match.offset + term_offset))) { 539 (*next_word_starts < term_word_offset)) {
533 ++next_word_starts; 540 ++next_word_starts;
534 ++word_num; 541 ++word_num;
535 } 542 }
536 if (word_num >= num_title_words_to_allow_) 543 if (word_num >= num_title_words_to_allow_)
537 break; // only count the first ten words 544 break; // only count the first ten words
538 DCHECK(next_word_starts != end_word_starts); 545 DCHECK(next_word_starts != end_word_starts);
539 DCHECK_EQ(*next_word_starts, title_match.offset + term_offset) 546 DCHECK_EQ(*next_word_starts, term_word_offset)
540 << "not at word boundary"; 547 << "not at word boundary";
541 term_scores[title_match.term_num] += 8; 548 term_scores[title_match.term_num] += 8;
542 } 549 }
543 // TODO(mpearson): Restore logic for penalizing out-of-order matches. 550 // TODO(mpearson): Restore logic for penalizing out-of-order matches.
544 // (Perhaps discount them by 0.8?) 551 // (Perhaps discount them by 0.8?)
545 // TODO(mpearson): Consider: if the earliest match occurs late in the string, 552 // TODO(mpearson): Consider: if the earliest match occurs late in the string,
546 // should we discount it? 553 // should we discount it?
547 // TODO(mpearson): Consider: do we want to score based on how much of the 554 // TODO(mpearson): Consider: do we want to score based on how much of the
548 // input string the input covers? (I'm leaning toward no.) 555 // input string the input covers? (I'm leaning toward no.)
549 556
(...skipping 165 matching lines...) Expand 10 before | Expand all | Expand 10 after
715 base::StringToDouble(it->first, &bucket.first); 722 base::StringToDouble(it->first, &bucket.first);
716 DCHECK(is_valid_intermediate_score); 723 DCHECK(is_valid_intermediate_score);
717 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second); 724 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);
718 DCHECK(is_valid_hqp_score); 725 DCHECK(is_valid_hqp_score);
719 hqp_buckets->push_back(bucket); 726 hqp_buckets->push_back(bucket);
720 } 727 }
721 return true; 728 return true;
722 } 729 }
723 return false; 730 return false;
724 } 731 }
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698