components/omnibox/browser/scored_history_match.cc - Issue 2421373003: Omnibox: Improve HQP Scoring for Terms that Start with Punctuation

Side by Side Diff: components/omnibox/browser/scored_history_match.cc

Issue 2421373003: Omnibox: Improve HQP Scoring for Terms that Start with Punctuation (Closed)

Patch Set: "restore dcheck" (created 4 years, 2 months ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/scored_history_match.h"	5 #include "components/omnibox/browser/scored_history_match.h"

6	6

7 #include <math.h>	7 #include <math.h>

8	8

9 #include <algorithm>	9 #include <algorithm>

10 #include <vector>	10 #include <vector>

(...skipping 455 matching lines...) Expand 10 before | Expand all | Expand 10 after
466 url_matches = FilterTermMatchesByWordStarts(	466 url_matches = FilterTermMatchesByWordStarts(

467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,	467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,

468 end_of_hostname_pos, std::string::npos);	468 end_of_hostname_pos, std::string::npos);

469 if (colon_pos != std::string::npos) {	469 if (colon_pos != std::string::npos) {

470 // Also filter matches not at a word boundary and in the scheme.	470 // Also filter matches not at a word boundary and in the scheme.

471 url_matches = FilterTermMatchesByWordStarts(	471 url_matches = FilterTermMatchesByWordStarts(

472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,	472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,

473 0, colon_pos);	473 0, colon_pos);

474 }	474 }

475 for (const auto& url_match : url_matches) {	475 for (const auto& url_match : url_matches) {

476 const size_t term_offset = terms_to_word_starts_offsets[url_match.term_num];	476 // Calculate the offset in the URL string where the meaningful (word) part

	477 // of the term starts. This takes into account times when a term starts

	478 // with punctuation such as "/foo".

	479 const size_t term_word_offset =

	480 url_match.offset + terms_to_word_starts_offsets[url_match.term_num];

477 // Advance next_word_starts until it's >= the position of the term we're	481 // Advance next_word_starts until it's >= the position of the term we're

478 // considering (adjusted for where the word begins within the term).	482 // considering (adjusted for where the word begins within the term).

479 while ((next_word_starts != end_word_starts) &&	483 while ((next_word_starts != end_word_starts) &&

480 (*next_word_starts < (url_match.offset + term_offset))) {	484 (*next_word_starts < term_word_offset)) {

481 ++next_word_starts;	485 ++next_word_starts;

482 }	486 }

483 const bool at_word_boundary =	487 const bool at_word_boundary =

484 (next_word_starts != end_word_starts) &&	488 (next_word_starts != end_word_starts) &&

485 (*next_word_starts == url_match.offset + term_offset);	489 (*next_word_starts == term_word_offset);

486 if ((question_mark_pos != std::string::npos) &&	490 if ((question_mark_pos != std::string::npos) &&

487 (url_match.offset > question_mark_pos)) {	491 (term_word_offset >= question_mark_pos)) {

488 // The match is in a CGI ?... fragment.	492 // The match is in a CGI ?... fragment.

489 DCHECK(at_word_boundary);	493 DCHECK(at_word_boundary);

490 term_scores[url_match.term_num] += 5;	494 term_scores[url_match.term_num] += 5;

491 } else if ((end_of_hostname_pos != std::string::npos) &&	495 } else if ((end_of_hostname_pos != std::string::npos) &&

492 (url_match.offset > end_of_hostname_pos)) {	496 (term_word_offset >= end_of_hostname_pos)) {

493 // The match is in the path.	497 // The match is in the path.

494 DCHECK(at_word_boundary);	498 DCHECK(at_word_boundary);

495 term_scores[url_match.term_num] += 8;	499 term_scores[url_match.term_num] += 8;

496 } else if ((colon_pos == std::string::npos) \|\|	500 } else if ((colon_pos == std::string::npos) \|\|

497 (url_match.offset > colon_pos)) {	501 (term_word_offset >= colon_pos)) {

498 // The match is in the hostname.	502 // The match is in the hostname.

499 if ((last_part_of_hostname_pos == std::string::npos) \|\|	503 if ((last_part_of_hostname_pos == std::string::npos) \|\|

500 (url_match.offset < last_part_of_hostname_pos)) {	504 (term_word_offset < last_part_of_hostname_pos)) {

501 // Either there are no dots in the hostname or this match isn't	505 // Either there are no dots in the hostname or this match isn't

502 // the last dotted component.	506 // the last dotted component.

503 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;	507 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;

504 } else {	508 } else {

505 // The match is in the last part of a dotted hostname (usually this	509 // The match is in the last part of a dotted hostname (usually this

506 // is the top-level domain .com, .net, etc.).	510 // is the top-level domain .com, .net, etc.).

507 if (allow_tld_matches_)	511 if (allow_tld_matches_)

508 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0;	512 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0;

509 }	513 }

510 } else {	514 } else {

511 // The match is in the protocol (a.k.a. scheme).	515 // The match is in the protocol (a.k.a. scheme).

512 // Matches not at a word boundary should have been filtered already.	516 // Matches not at a word boundary should have been filtered already.

513 DCHECK(at_word_boundary);	517 DCHECK(at_word_boundary);

514 match_in_scheme = true;	518 match_in_scheme = true;

515 if (allow_scheme_matches_)	519 if (allow_scheme_matches_)

516 term_scores[url_match.term_num] += 10;	520 term_scores[url_match.term_num] += 10;

517 }	521 }

518 }	522 }

519 // Now do the analogous loop over all matches in the title.	523 // Now do the analogous loop over all matches in the title.

520 next_word_starts = word_starts.title_word_starts_.begin();	524 next_word_starts = word_starts.title_word_starts_.begin();

521 end_word_starts = word_starts.title_word_starts_.end();	525 end_word_starts = word_starts.title_word_starts_.end();

522 size_t word_num = 0;	526 size_t word_num = 0;

523 title_matches = FilterTermMatchesByWordStarts(	527 title_matches = FilterTermMatchesByWordStarts(

524 title_matches, terms_to_word_starts_offsets,	528 title_matches, terms_to_word_starts_offsets,

525 word_starts.title_word_starts_, 0, std::string::npos);	529 word_starts.title_word_starts_, 0, std::string::npos);

526 for (const auto& title_match : title_matches) {	530 for (const auto& title_match : title_matches) {

527 const size_t term_offset =	531 // Calculate the offset in the title string where the meaningful (word) part

528 terms_to_word_starts_offsets[title_match.term_num];	532 // of the term starts. This takes into account times when a term starts

	533 // with punctuation such as "/foo".

	534 const size_t term_word_offset =

	535 title_match.offset + terms_to_word_starts_offsets[title_match.term_num];

529 // Advance next_word_starts until it's >= the position of the term we're	536 // Advance next_word_starts until it's >= the position of the term we're

530 // considering (adjusted for where the word begins within the term).	537 // considering (adjusted for where the word begins within the term).

531 while ((next_word_starts != end_word_starts) &&	538 while ((next_word_starts != end_word_starts) &&

532 (*next_word_starts < (title_match.offset + term_offset))) {	539 (*next_word_starts < term_word_offset)) {

533 ++next_word_starts;	540 ++next_word_starts;

534 ++word_num;	541 ++word_num;

535 }	542 }

536 if (word_num >= num_title_words_to_allow_)	543 if (word_num >= num_title_words_to_allow_)

537 break; // only count the first ten words	544 break; // only count the first ten words

538 DCHECK(next_word_starts != end_word_starts);	545 DCHECK(next_word_starts != end_word_starts);

539 DCHECK_EQ(*next_word_starts, title_match.offset + term_offset)	546 DCHECK_EQ(*next_word_starts, term_word_offset)

540 << "not at word boundary";	547 << "not at word boundary";

541 term_scores[title_match.term_num] += 8;	548 term_scores[title_match.term_num] += 8;

542 }	549 }

543 // TODO(mpearson): Restore logic for penalizing out-of-order matches.	550 // TODO(mpearson): Restore logic for penalizing out-of-order matches.

544 // (Perhaps discount them by 0.8?)	551 // (Perhaps discount them by 0.8?)

545 // TODO(mpearson): Consider: if the earliest match occurs late in the string,	552 // TODO(mpearson): Consider: if the earliest match occurs late in the string,

546 // should we discount it?	553 // should we discount it?

547 // TODO(mpearson): Consider: do we want to score based on how much of the	554 // TODO(mpearson): Consider: do we want to score based on how much of the

548 // input string the input covers? (I'm leaning toward no.)	555 // input string the input covers? (I'm leaning toward no.)

549	556

(...skipping 165 matching lines...) Expand 10 before | Expand all | Expand 10 after
715 base::StringToDouble(it->first, &bucket.first);	722 base::StringToDouble(it->first, &bucket.first);

716 DCHECK(is_valid_intermediate_score);	723 DCHECK(is_valid_intermediate_score);

717 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);	724 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);

718 DCHECK(is_valid_hqp_score);	725 DCHECK(is_valid_hqp_score);

719 hqp_buckets->push_back(bucket);	726 hqp_buckets->push_back(bucket);

720 }	727 }

721 return true;	728 return true;

722 }	729 }

723 return false;	730 return false;

724 }	731 }

OLD	NEW

« no previous file with comments | « no previous file | components/omnibox/browser/scored_history_match_unittest.cc » ('j') | components/omnibox/browser/scored_history_match_unittest.cc » ('J')