OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/omnibox/browser/scored_history_match.h" | 5 #include "components/omnibox/browser/scored_history_match.h" |
6 | 6 |
7 #include <math.h> | 7 #include <math.h> |
8 | 8 |
9 #include <algorithm> | 9 #include <algorithm> |
10 #include <vector> | 10 #include <vector> |
(...skipping 455 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
466 url_matches = FilterTermMatchesByWordStarts( | 466 url_matches = FilterTermMatchesByWordStarts( |
467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, | 467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, |
468 end_of_hostname_pos, std::string::npos); | 468 end_of_hostname_pos, std::string::npos); |
469 if (colon_pos != std::string::npos) { | 469 if (colon_pos != std::string::npos) { |
470 // Also filter matches not at a word boundary and in the scheme. | 470 // Also filter matches not at a word boundary and in the scheme. |
471 url_matches = FilterTermMatchesByWordStarts( | 471 url_matches = FilterTermMatchesByWordStarts( |
472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, | 472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, |
473 0, colon_pos); | 473 0, colon_pos); |
474 } | 474 } |
475 for (const auto& url_match : url_matches) { | 475 for (const auto& url_match : url_matches) { |
476 const size_t term_offset = terms_to_word_starts_offsets[url_match.term_num]; | 476 // Calculate the offset in the URL string where the meaningful (word) part |
| 477 // of the term starts. This takes into account times when a term starts |
| 478 // with punctuation such as "/foo". |
| 479 const size_t term_word_offset = |
| 480 url_match.offset + terms_to_word_starts_offsets[url_match.term_num]; |
477 // Advance next_word_starts until it's >= the position of the term we're | 481 // Advance next_word_starts until it's >= the position of the term we're |
478 // considering (adjusted for where the word begins within the term). | 482 // considering (adjusted for where the word begins within the term). |
479 while ((next_word_starts != end_word_starts) && | 483 while ((next_word_starts != end_word_starts) && |
480 (*next_word_starts < (url_match.offset + term_offset))) { | 484 (*next_word_starts < term_word_offset)) { |
481 ++next_word_starts; | 485 ++next_word_starts; |
482 } | 486 } |
483 const bool at_word_boundary = | 487 const bool at_word_boundary = |
484 (next_word_starts != end_word_starts) && | 488 (next_word_starts != end_word_starts) && |
485 (*next_word_starts == url_match.offset + term_offset); | 489 (*next_word_starts == term_word_offset); |
486 if ((question_mark_pos != std::string::npos) && | 490 if ((question_mark_pos != std::string::npos) && |
487 (url_match.offset > question_mark_pos)) { | 491 (term_word_offset >= question_mark_pos)) { |
488 // The match is in a CGI ?... fragment. | 492 // The match is in a CGI ?... fragment. |
489 DCHECK(at_word_boundary); | 493 DCHECK(at_word_boundary); |
490 term_scores[url_match.term_num] += 5; | 494 term_scores[url_match.term_num] += 5; |
491 } else if ((end_of_hostname_pos != std::string::npos) && | 495 } else if ((end_of_hostname_pos != std::string::npos) && |
492 (url_match.offset > end_of_hostname_pos)) { | 496 (term_word_offset >= end_of_hostname_pos)) { |
493 // The match is in the path. | 497 // The match is in the path. |
494 DCHECK(at_word_boundary); | 498 DCHECK(at_word_boundary); |
495 term_scores[url_match.term_num] += 8; | 499 term_scores[url_match.term_num] += 8; |
496 } else if ((colon_pos == std::string::npos) || | 500 } else if ((colon_pos == std::string::npos) || |
497 (url_match.offset > colon_pos)) { | 501 (term_word_offset >= colon_pos)) { |
498 // The match is in the hostname. | 502 // The match is in the hostname. |
499 if ((last_part_of_hostname_pos == std::string::npos) || | 503 if ((last_part_of_hostname_pos == std::string::npos) || |
500 (url_match.offset < last_part_of_hostname_pos)) { | 504 (term_word_offset < last_part_of_hostname_pos)) { |
501 // Either there are no dots in the hostname or this match isn't | 505 // Either there are no dots in the hostname or this match isn't |
502 // the last dotted component. | 506 // the last dotted component. |
503 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2; | 507 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2; |
504 } else { | 508 } else { |
505 // The match is in the last part of a dotted hostname (usually this | 509 // The match is in the last part of a dotted hostname (usually this |
506 // is the top-level domain .com, .net, etc.). | 510 // is the top-level domain .com, .net, etc.). |
507 if (allow_tld_matches_) | 511 if (allow_tld_matches_) |
508 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0; | 512 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0; |
509 } | 513 } |
510 } else { | 514 } else { |
511 // The match is in the protocol (a.k.a. scheme). | 515 // The match is in the protocol (a.k.a. scheme). |
512 // Matches not at a word boundary should have been filtered already. | 516 // Matches not at a word boundary should have been filtered already. |
513 DCHECK(at_word_boundary); | 517 DCHECK(at_word_boundary); |
514 match_in_scheme = true; | 518 match_in_scheme = true; |
515 if (allow_scheme_matches_) | 519 if (allow_scheme_matches_) |
516 term_scores[url_match.term_num] += 10; | 520 term_scores[url_match.term_num] += 10; |
517 } | 521 } |
518 } | 522 } |
519 // Now do the analogous loop over all matches in the title. | 523 // Now do the analogous loop over all matches in the title. |
520 next_word_starts = word_starts.title_word_starts_.begin(); | 524 next_word_starts = word_starts.title_word_starts_.begin(); |
521 end_word_starts = word_starts.title_word_starts_.end(); | 525 end_word_starts = word_starts.title_word_starts_.end(); |
522 size_t word_num = 0; | 526 size_t word_num = 0; |
523 title_matches = FilterTermMatchesByWordStarts( | 527 title_matches = FilterTermMatchesByWordStarts( |
524 title_matches, terms_to_word_starts_offsets, | 528 title_matches, terms_to_word_starts_offsets, |
525 word_starts.title_word_starts_, 0, std::string::npos); | 529 word_starts.title_word_starts_, 0, std::string::npos); |
526 for (const auto& title_match : title_matches) { | 530 for (const auto& title_match : title_matches) { |
527 const size_t term_offset = | 531 // Calculate the offset in the title string where the meaningful (word) part |
528 terms_to_word_starts_offsets[title_match.term_num]; | 532 // of the term starts. This takes into account times when a term starts |
| 533 // with punctuation such as "/foo". |
| 534 const size_t term_word_offset = |
| 535 title_match.offset + terms_to_word_starts_offsets[title_match.term_num]; |
529 // Advance next_word_starts until it's >= the position of the term we're | 536 // Advance next_word_starts until it's >= the position of the term we're |
530 // considering (adjusted for where the word begins within the term). | 537 // considering (adjusted for where the word begins within the term). |
531 while ((next_word_starts != end_word_starts) && | 538 while ((next_word_starts != end_word_starts) && |
532 (*next_word_starts < (title_match.offset + term_offset))) { | 539 (*next_word_starts < term_word_offset)) { |
533 ++next_word_starts; | 540 ++next_word_starts; |
534 ++word_num; | 541 ++word_num; |
535 } | 542 } |
536 if (word_num >= num_title_words_to_allow_) | 543 if (word_num >= num_title_words_to_allow_) |
537 break; // only count the first ten words | 544 break; // only count the first ten words |
538 DCHECK(next_word_starts != end_word_starts); | 545 DCHECK(next_word_starts != end_word_starts); |
539 DCHECK_EQ(*next_word_starts, title_match.offset + term_offset) | 546 DCHECK_EQ(*next_word_starts, term_word_offset) |
540 << "not at word boundary"; | 547 << "not at word boundary"; |
541 term_scores[title_match.term_num] += 8; | 548 term_scores[title_match.term_num] += 8; |
542 } | 549 } |
543 // TODO(mpearson): Restore logic for penalizing out-of-order matches. | 550 // TODO(mpearson): Restore logic for penalizing out-of-order matches. |
544 // (Perhaps discount them by 0.8?) | 551 // (Perhaps discount them by 0.8?) |
545 // TODO(mpearson): Consider: if the earliest match occurs late in the string, | 552 // TODO(mpearson): Consider: if the earliest match occurs late in the string, |
546 // should we discount it? | 553 // should we discount it? |
547 // TODO(mpearson): Consider: do we want to score based on how much of the | 554 // TODO(mpearson): Consider: do we want to score based on how much of the |
548 // input string the input covers? (I'm leaning toward no.) | 555 // input string the input covers? (I'm leaning toward no.) |
549 | 556 |
(...skipping 165 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
715 base::StringToDouble(it->first, &bucket.first); | 722 base::StringToDouble(it->first, &bucket.first); |
716 DCHECK(is_valid_intermediate_score); | 723 DCHECK(is_valid_intermediate_score); |
717 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second); | 724 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second); |
718 DCHECK(is_valid_hqp_score); | 725 DCHECK(is_valid_hqp_score); |
719 hqp_buckets->push_back(bucket); | 726 hqp_buckets->push_back(bucket); |
720 } | 727 } |
721 return true; | 728 return true; |
722 } | 729 } |
723 return false; | 730 return false; |
724 } | 731 } |
OLD | NEW |