Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/history/in_memory_url_index.h" | 5 #include "chrome/browser/history/in_memory_url_index.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <functional> | 8 #include <functional> |
| 9 #include <iterator> | 9 #include <iterator> |
| 10 #include <limits> | 10 #include <limits> |
| (...skipping 382 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 393 // Note that this does not remove any reference to this row from the | 393 // Note that this does not remove any reference to this row from the |
| 394 // word_id_history_map_. That map will continue to contain (and return) | 394 // word_id_history_map_. That map will continue to contain (and return) |
| 395 // hits against this row until that map is rebuilt, but since the | 395 // hits against this row until that map is rebuilt, but since the |
| 396 // history_info_map_ no longer references the row no erroneous results | 396 // history_info_map_ no longer references the row no erroneous results |
| 397 // will propagate to the user. | 397 // will propagate to the user. |
| 398 private_data_->history_info_map_.erase(row_id); | 398 private_data_->history_info_map_.erase(row_id); |
| 399 // This invalidates the word cache. | 399 // This invalidates the word cache. |
| 400 search_term_cache_.clear(); | 400 search_term_cache_.clear(); |
| 401 } | 401 } |
| 402 | 402 |
| 403 // Searching | 403 // InMemoryURLIndex::AddHistoryMatch ------------------------------------------- |
| 404 | |
| 405 InMemoryURLIndex::HistoryItemFactorGreater::HistoryItemFactorGreater( | |
| 406 const HistoryInfoMap& history_info_map) | |
| 407 : history_info_map_(history_info_map) {} | |
|
Peter Kasting
2011/11/21 20:31:02
Nit: 4 spaces before colon. I generally prefer {}
mrossetti
2011/11/21 21:38:25
Done.
| |
| 408 | |
| 409 InMemoryURLIndex::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} | |
| 410 | |
| 411 bool InMemoryURLIndex::HistoryItemFactorGreater::operator()( | |
| 412 const HistoryID h1, | |
| 413 const HistoryID h2) { | |
| 414 const URLRow& r1(history_info_map_.find(h1)->second); | |
| 415 const URLRow& r2(history_info_map_.find(h2)->second); | |
| 416 // First cut: typed count, visit count, recency. | |
| 417 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | |
| 418 // recently visited (within the last 12/24 hours) as highly important. Get | |
| 419 // input from mpearson. | |
| 420 if (r1.typed_count() != r2.typed_count()) | |
| 421 return (r1.typed_count() > r2.typed_count()); | |
| 422 if (r1.visit_count() != r2.visit_count()) | |
| 423 return (r1.visit_count() > r2.visit_count()); | |
| 424 return (r1.last_visit() > r2.last_visit()); | |
| 425 } | |
| 426 | |
| 427 // Searching ------------------------------------------------------------------- | |
| 404 | 428 |
| 405 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( | 429 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( |
| 406 const String16Vector& terms) { | 430 const string16& term_string) { |
| 431 pre_filter_item_count = 0; | |
| 432 post_filter_item_count = 0; | |
| 433 post_scoring_item_count = 0; | |
| 434 string16 clean_string = net::UnescapeURLComponent(term_string, | |
| 435 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
| 436 string16 lower_string(base::i18n::ToLower(clean_string)); | |
| 437 history::String16Vector words( | |
| 438 history::String16VectorFromString16(lower_string, false)); | |
| 439 | |
| 407 ScoredHistoryMatches scored_items; | 440 ScoredHistoryMatches scored_items; |
| 408 | 441 |
| 409 // Do nothing if we have indexed no words (probably because we've not been | 442 // Do nothing if we have indexed no words (probably because we've not been |
| 410 // initialized yet). | 443 // initialized yet). |
| 411 if (private_data_->word_list_.empty()) | 444 if (private_data_->word_list_.empty() || words.empty()) { |
| 445 search_term_cache_.clear(); // Invalidate the term cache. | |
|
Peter Kasting
2011/11/21 20:31:02
Nit: Why is this line necessary? Wouldn't the cac
mrossetti
2011/11/21 21:38:25
I updated the comment since it did not mention tha
| |
| 412 return scored_items; | 446 return scored_items; |
| 447 } | |
| 413 | 448 |
| 414 if (!terms.empty()) { | 449 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep |
| 415 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | 450 // approach. |
| 416 // approach. | 451 ResetSearchTermCache(); |
| 417 ResetSearchTermCache(); | |
| 418 | 452 |
| 419 // Lowercase the terms. | 453 HistoryIDSet history_id_set = HistoryIDSetFromWords(words); |
| 420 // TODO(mrossetti): Another opportunity for a transform algorithm. | |
| 421 String16Vector lower_terms; | |
| 422 for (String16Vector::const_iterator term_iter = terms.begin(); | |
| 423 term_iter != terms.end(); ++term_iter) | |
| 424 lower_terms.push_back(base::i18n::ToLower(*term_iter)); | |
| 425 | 454 |
| 426 string16 all_terms(JoinString(lower_terms, ' ')); | 455 // Trim the candidate pool if it is large. Note that we do not filter out |
| 427 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); | 456 // items that do not contain the search terms as proper substrings -- doing |
| 457 // so is the performance-costly operation we are trying to avoid in order | |
| 458 // to maintain omnibox responsiveness. | |
| 459 const size_t kItemsToScoreLimit = 500; | |
| 460 pre_filter_item_count = history_id_set.size(); | |
| 461 // If we trim the results set we do not want to cache the results for next | |
| 462 // time as the user's ultimately desired result could easily be eliminated | |
| 463 // in this early rough filter. | |
| 464 bool was_trimmed = false; | |
|
Peter Kasting
2011/11/21 20:31:02
Nit: Even shorter:
bool was_trimmed = (pre_filt
mrossetti
2011/11/21 21:38:25
Done.
| |
| 465 if (pre_filter_item_count > kItemsToScoreLimit) { | |
| 466 was_trimmed = true; | |
| 467 HistoryIDVector history_ids; | |
| 468 std::copy(history_id_set.begin(), history_id_set.end(), | |
| 469 std::back_inserter(history_ids)); | |
| 470 // Trim down the set by sorting by typed-count, visit-count, and last | |
| 471 // visit. | |
| 472 HistoryItemFactorGreater | |
| 473 item_factor_functor(private_data_->history_info_map_); | |
| 474 std::partial_sort(history_ids.begin(), | |
| 475 history_ids.begin() + kItemsToScoreLimit, | |
| 476 history_ids.end(), | |
| 477 item_factor_functor); | |
| 478 history_ids.resize(kItemsToScoreLimit); | |
|
Peter Kasting
2011/11/21 20:31:02
Nit: You can eliminate this line and simply use "h
mrossetti
2011/11/21 21:38:25
Delicious!
| |
| 479 history_id_set.clear(); | |
| 480 std::copy(history_ids.begin(), history_ids.end(), | |
| 481 std::inserter(history_id_set, history_id_set.end())); | |
| 482 post_filter_item_count = history_id_set.size(); | |
| 483 } | |
| 428 | 484 |
| 429 // Don't perform any scoring (and don't return any matches) if the | 485 // Pass over all of the candidates filtering out any without a proper |
| 430 // candidate pool is large. (See comments in header.) | 486 // substring match, inserting those which pass in order by score. |
| 431 const size_t kItemsToScoreLimit = 500; | 487 history::String16Vector terms; |
| 432 if (history_id_set.size() <= kItemsToScoreLimit) { | 488 Tokenize(lower_string, kWhitespaceUTF16, &terms); |
| 433 // Pass over all of the candidates filtering out any without a proper | 489 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), |
| 434 // substring match, inserting those which pass in order by score. | 490 AddHistoryMatch(*this, terms)).ScoredMatches(); |
| 435 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), | |
| 436 AddHistoryMatch(*this, lower_terms)).ScoredMatches(); | |
| 437 | 491 |
| 438 // Select and sort only the top kMaxMatches results. | 492 // Select and sort only the top kMaxMatches results. |
| 439 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { | 493 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { |
| 440 std::partial_sort(scored_items.begin(), | 494 std::partial_sort(scored_items.begin(), |
| 441 scored_items.begin() + | 495 scored_items.begin() + |
| 442 AutocompleteProvider::kMaxMatches, | 496 AutocompleteProvider::kMaxMatches, |
| 443 scored_items.end(), | 497 scored_items.end(), |
| 444 ScoredHistoryMatch::MatchScoreGreater); | 498 ScoredHistoryMatch::MatchScoreGreater); |
| 445 scored_items.resize(AutocompleteProvider::kMaxMatches); | 499 scored_items.resize(AutocompleteProvider::kMaxMatches); |
| 446 } else { | 500 } else { |
| 447 std::sort(scored_items.begin(), scored_items.end(), | 501 std::sort(scored_items.begin(), scored_items.end(), |
| 448 ScoredHistoryMatch::MatchScoreGreater); | 502 ScoredHistoryMatch::MatchScoreGreater); |
| 449 } | 503 } |
| 504 post_scoring_item_count = scored_items.size(); | |
| 505 | |
| 506 if (was_trimmed) { | |
| 507 search_term_cache_.clear(); // Invalidate the term cache. | |
| 508 } else { | |
| 509 // Remove any stale SearchTermCacheItems. | |
| 510 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
| 511 cache_iter != search_term_cache_.end(); ) { | |
| 512 if (!cache_iter->second.used_) | |
| 513 search_term_cache_.erase(cache_iter++); | |
| 514 else | |
| 515 ++cache_iter; | |
| 450 } | 516 } |
| 451 } | 517 } |
| 452 | 518 |
| 453 // Remove any stale SearchTermCacheItems. | |
| 454 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
| 455 cache_iter != search_term_cache_.end(); ) { | |
| 456 if (!cache_iter->second.used_) | |
| 457 search_term_cache_.erase(cache_iter++); | |
| 458 else | |
| 459 ++cache_iter; | |
| 460 } | |
| 461 | |
| 462 return scored_items; | 519 return scored_items; |
| 463 } | 520 } |
| 464 | 521 |
| 465 void InMemoryURLIndex::ResetSearchTermCache() { | 522 void InMemoryURLIndex::ResetSearchTermCache() { |
| 466 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); | 523 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); |
| 467 iter != search_term_cache_.end(); ++iter) | 524 iter != search_term_cache_.end(); ++iter) |
| 468 iter->second.used_ = false; | 525 iter->second.used_ = false; |
| 469 } | 526 } |
| 470 | 527 |
| 471 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( | 528 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( |
| 472 const string16& uni_string) { | 529 const String16Vector& unsorted_words) { |
| 473 // Break the terms down into individual terms (words), get the candidate | 530 // Break the terms down into individual terms (words), get the candidate |
| 474 // set for each term, and intersect each to get a final candidate list. | 531 // set for each term, and intersect each to get a final candidate list. |
| 475 // Note that a single 'term' from the user's perspective might be | 532 // Note that a single 'term' from the user's perspective might be |
| 476 // a string like "http://www.somewebsite.com" which, from our perspective, | 533 // a string like "http://www.somewebsite.com" which, from our perspective, |
| 477 // is four words: 'http', 'www', 'somewebsite', and 'com'. | 534 // is four words: 'http', 'www', 'somewebsite', and 'com'. |
| 478 HistoryIDSet history_id_set; | 535 HistoryIDSet history_id_set; |
| 479 String16Vector terms = String16VectorFromString16(uni_string, true); | 536 String16Vector words(unsorted_words); |
| 480 // Sort the terms into the longest first as such are likely to narrow down | 537 // Sort the terms into the longest first as such are likely to narrow down |
| 481 // the results quicker. Also, single character terms are the most expensive | 538 // the results quicker. Also, single character terms are the most expensive |
| 482 // to process so save them for last. | 539 // to process so save them for last. |
| 483 std::sort(terms.begin(), terms.end(), LengthGreater); | 540 std::sort(words.begin(), words.end(), LengthGreater); |
| 484 for (String16Vector::iterator iter = terms.begin(); iter != terms.end(); | 541 for (String16Vector::iterator iter = words.begin(); iter != words.end(); |
| 485 ++iter) { | 542 ++iter) { |
| 486 string16 uni_word = *iter; | 543 string16 uni_word = *iter; |
| 487 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); | 544 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); |
| 488 if (term_history_set.empty()) { | 545 if (term_history_set.empty()) { |
| 489 history_id_set.clear(); | 546 history_id_set.clear(); |
| 490 break; | 547 break; |
| 491 } | 548 } |
| 492 if (iter == terms.begin()) { | 549 if (iter == words.begin()) { |
| 493 history_id_set.swap(term_history_set); | 550 history_id_set.swap(term_history_set); |
| 494 } else { | 551 } else { |
| 495 HistoryIDSet new_history_id_set; | 552 HistoryIDSet new_history_id_set; |
| 496 std::set_intersection(history_id_set.begin(), history_id_set.end(), | 553 std::set_intersection(history_id_set.begin(), history_id_set.end(), |
| 497 term_history_set.begin(), term_history_set.end(), | 554 term_history_set.begin(), term_history_set.end(), |
| 498 std::inserter(new_history_id_set, | 555 std::inserter(new_history_id_set, |
| 499 new_history_id_set.begin())); | 556 new_history_id_set.begin())); |
| 500 history_id_set.swap(new_history_id_set); | 557 history_id_set.swap(new_history_id_set); |
| 501 } | 558 } |
| 502 } | 559 } |
| (...skipping 311 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 814 term_length_total * kCompleteMaxValue / max_significant_length; | 871 term_length_total * kCompleteMaxValue / max_significant_length; |
| 815 | 872 |
| 816 int raw_score = order_value + start_value + complete_value; | 873 int raw_score = order_value + start_value + complete_value; |
| 817 const int kTermScoreLevel[] = { 1000, 650, 500, 200 }; | 874 const int kTermScoreLevel[] = { 1000, 650, 500, 200 }; |
| 818 | 875 |
| 819 // Scale the sum of the three components above into a single score component | 876 // Scale the sum of the three components above into a single score component |
| 820 // on the same scale as that used in ScoredMatchForURL(). | 877 // on the same scale as that used in ScoredMatchForURL(). |
| 821 return ScoreForValue(raw_score, kTermScoreLevel); | 878 return ScoreForValue(raw_score, kTermScoreLevel); |
| 822 } | 879 } |
| 823 | 880 |
| 881 // InMemoryURLIndex::AddHistoryMatch ------------------------------------------- | |
| 882 | |
| 824 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( | 883 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( |
| 825 const InMemoryURLIndex& index, | 884 const InMemoryURLIndex& index, |
| 826 const String16Vector& lower_terms) | 885 const String16Vector& lower_terms) |
| 827 : index_(index), | 886 : index_(index), |
| 828 lower_terms_(lower_terms) {} | 887 lower_terms_(lower_terms) {} |
| 829 | 888 |
| 830 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} | 889 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} |
| 831 | 890 |
| 832 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) { | 891 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) { |
| 833 HistoryInfoMap::const_iterator hist_pos = | 892 HistoryInfoMap::const_iterator hist_pos = |
| (...skipping 13 matching lines...) Expand all Loading... | |
| 847 if (history_dir_.empty()) | 906 if (history_dir_.empty()) |
| 848 return false; | 907 return false; |
| 849 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache")); | 908 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache")); |
| 850 return true; | 909 return true; |
| 851 } | 910 } |
| 852 | 911 |
| 853 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const { | 912 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const { |
| 854 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); | 913 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); |
| 855 } | 914 } |
| 856 | 915 |
| 916 // Cache Management ------------------------------------------------------------ | |
| 917 | |
| 857 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const { | 918 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const { |
| 858 DCHECK(cache); | 919 DCHECK(cache); |
| 859 cache->set_timestamp(base::Time::Now().ToInternalValue()); | 920 cache->set_timestamp(base::Time::Now().ToInternalValue()); |
| 860 // history_item_count_ is no longer used but rather than change the protobuf | 921 // history_item_count_ is no longer used but rather than change the protobuf |
| 861 // definition use a placeholder. This will go away with the switch to SQLite. | 922 // definition use a placeholder. This will go away with the switch to SQLite. |
| 862 cache->set_history_item_count(0); | 923 cache->set_history_item_count(0); |
| 863 SaveWordList(cache); | 924 SaveWordList(cache); |
| 864 SaveWordMap(cache); | 925 SaveWordMap(cache); |
| 865 SaveCharWordMap(cache); | 926 SaveCharWordMap(cache); |
| 866 SaveWordIDHistoryMap(cache); | 927 SaveWordIDHistoryMap(cache); |
| (...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1069 if (iter->has_title()) { | 1130 if (iter->has_title()) { |
| 1070 string16 title(UTF8ToUTF16(iter->title())); | 1131 string16 title(UTF8ToUTF16(iter->title())); |
| 1071 url_row.set_title(title); | 1132 url_row.set_title(title); |
| 1072 } | 1133 } |
| 1073 private_data_->history_info_map_[history_id] = url_row; | 1134 private_data_->history_info_map_[history_id] = url_row; |
| 1074 } | 1135 } |
| 1075 return true; | 1136 return true; |
| 1076 } | 1137 } |
| 1077 | 1138 |
| 1078 } // namespace history | 1139 } // namespace history |
| OLD | NEW |