OLD | NEW |
---|---|
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/history/in_memory_url_index.h" | 5 #include "chrome/browser/history/in_memory_url_index.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 #include <functional> | 8 #include <functional> |
9 #include <iterator> | 9 #include <iterator> |
10 #include <limits> | 10 #include <limits> |
(...skipping 382 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
393 // Note that this does not remove any reference to this row from the | 393 // Note that this does not remove any reference to this row from the |
394 // word_id_history_map_. That map will continue to contain (and return) | 394 // word_id_history_map_. That map will continue to contain (and return) |
395 // hits against this row until that map is rebuilt, but since the | 395 // hits against this row until that map is rebuilt, but since the |
396 // history_info_map_ no longer references the row no erroneous results | 396 // history_info_map_ no longer references the row no erroneous results |
397 // will propagate to the user. | 397 // will propagate to the user. |
398 private_data_->history_info_map_.erase(row_id); | 398 private_data_->history_info_map_.erase(row_id); |
399 // This invalidates the word cache. | 399 // This invalidates the word cache. |
400 search_term_cache_.clear(); | 400 search_term_cache_.clear(); |
401 } | 401 } |
402 | 402 |
403 // Searching | 403 // InMemoryURLIndex::AddHistoryMatch ------------------------------------------- |
404 | |
405 InMemoryURLIndex::HistoryItemFactorGreater::HistoryItemFactorGreater( | |
406 const HistoryInfoMap& history_info_map) | |
407 : history_info_map_(history_info_map) { | |
408 } | |
409 | |
410 InMemoryURLIndex::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} | |
411 | |
412 bool InMemoryURLIndex::HistoryItemFactorGreater::operator()( | |
413 const HistoryID h1, | |
414 const HistoryID h2) { | |
415 const URLRow& r1(history_info_map_.find(h1)->second); | |
416 const URLRow& r2(history_info_map_.find(h2)->second); | |
417 // First cut: typed count, visit count, recency. | |
418 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | |
419 // recently visited (within the last 12/24 hours) as highly important. Get | |
420 // input from mpearson. | |
421 if (r1.typed_count() != r2.typed_count()) | |
422 return (r1.typed_count() > r2.typed_count()); | |
423 if (r1.visit_count() != r2.visit_count()) | |
424 return (r1.visit_count() > r2.visit_count()); | |
425 return (r1.last_visit() > r2.last_visit()); | |
426 } | |
427 | |
428 // Searching ------------------------------------------------------------------- | |
404 | 429 |
405 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( | 430 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( |
406 const String16Vector& terms) { | 431 const string16& term_string) { |
432 pre_filter_item_count = 0; | |
433 post_filter_item_count = 0; | |
434 post_scoring_item_count = 0; | |
435 string16 clean_string = net::UnescapeURLComponent(term_string, | |
436 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
437 string16 lower_string(base::i18n::ToLower(clean_string)); | |
438 history::String16Vector words( | |
439 history::String16VectorFromString16(lower_string, false)); | |
440 | |
407 ScoredHistoryMatches scored_items; | 441 ScoredHistoryMatches scored_items; |
408 | 442 |
409 // Do nothing if we have indexed no words (probably because we've not been | 443 // Do nothing if we have indexed no words (probably because we've not been |
410 // initialized yet). | 444 // initialized yet) or the search string has no words. |
411 if (private_data_->word_list_.empty()) | 445 if (private_data_->word_list_.empty() || words.empty()) { |
446 search_term_cache_.clear(); // Invalidate the term cache. | |
412 return scored_items; | 447 return scored_items; |
448 } | |
413 | 449 |
414 if (!terms.empty()) { | 450 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep |
415 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | 451 // approach. |
416 // approach. | 452 ResetSearchTermCache(); |
417 ResetSearchTermCache(); | |
418 | 453 |
419 // Lowercase the terms. | 454 HistoryIDSet history_id_set = HistoryIDSetFromWords(words); |
GeorgeY
2011/11/21 22:52:03
I wonder how fast it would work on lowest common d
mrossetti
2011/11/22 23:29:12
This should be very fast as the dataset underlying
| |
420 // TODO(mrossetti): Another opportunity for a transform algorithm. | |
421 String16Vector lower_terms; | |
422 for (String16Vector::const_iterator term_iter = terms.begin(); | |
423 term_iter != terms.end(); ++term_iter) | |
424 lower_terms.push_back(base::i18n::ToLower(*term_iter)); | |
425 | 455 |
426 string16 all_terms(JoinString(lower_terms, ' ')); | 456 // Trim the candidate pool if it is large. Note that we do not filter out |
427 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); | 457 // items that do not contain the search terms as proper substrings -- doing |
458 // so is the performance-costly operation we are trying to avoid in order | |
459 // to maintain omnibox responsiveness. | |
460 const size_t kItemsToScoreLimit = 500; | |
461 pre_filter_item_count = history_id_set.size(); | |
462 // If we trim the results set we do not want to cache the results for next | |
463 // time as the user's ultimately desired result could easily be eliminated | |
464 // in this early rough filter. | |
465 bool was_trimmed = (pre_filter_item_count > kItemsToScoreLimit); | |
466 if (was_trimmed) { | |
467 HistoryIDVector history_ids; | |
468 std::copy(history_id_set.begin(), history_id_set.end(), | |
469 std::back_inserter(history_ids)); | |
470 // Trim down the set by sorting by typed-count, visit-count, and last | |
471 // visit. | |
472 HistoryItemFactorGreater | |
473 item_factor_functor(private_data_->history_info_map_); | |
474 std::partial_sort(history_ids.begin(), | |
475 history_ids.begin() + kItemsToScoreLimit, | |
476 history_ids.end(), | |
477 item_factor_functor); | |
478 history_id_set.clear(); | |
479 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
480 std::inserter(history_id_set, history_id_set.end())); | |
481 post_filter_item_count = history_id_set.size(); | |
482 } | |
428 | 483 |
429 // Don't perform any scoring (and don't return any matches) if the | 484 // Pass over all of the candidates filtering out any without a proper |
430 // candidate pool is large. (See comments in header.) | 485 // substring match, inserting those which pass in order by score. |
431 const size_t kItemsToScoreLimit = 500; | 486 history::String16Vector terms; |
432 if (history_id_set.size() <= kItemsToScoreLimit) { | 487 Tokenize(lower_string, kWhitespaceUTF16, &terms); |
433 // Pass over all of the candidates filtering out any without a proper | 488 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), |
434 // substring match, inserting those which pass in order by score. | 489 AddHistoryMatch(*this, terms)).ScoredMatches(); |
435 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), | |
436 AddHistoryMatch(*this, lower_terms)).ScoredMatches(); | |
437 | 490 |
438 // Select and sort only the top kMaxMatches results. | 491 // Select and sort only the top kMaxMatches results. |
439 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { | 492 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { |
440 std::partial_sort(scored_items.begin(), | 493 std::partial_sort(scored_items.begin(), |
441 scored_items.begin() + | 494 scored_items.begin() + |
442 AutocompleteProvider::kMaxMatches, | 495 AutocompleteProvider::kMaxMatches, |
443 scored_items.end(), | 496 scored_items.end(), |
444 ScoredHistoryMatch::MatchScoreGreater); | 497 ScoredHistoryMatch::MatchScoreGreater); |
445 scored_items.resize(AutocompleteProvider::kMaxMatches); | 498 scored_items.resize(AutocompleteProvider::kMaxMatches); |
446 } else { | 499 } else { |
447 std::sort(scored_items.begin(), scored_items.end(), | 500 std::sort(scored_items.begin(), scored_items.end(), |
448 ScoredHistoryMatch::MatchScoreGreater); | 501 ScoredHistoryMatch::MatchScoreGreater); |
449 } | 502 } |
503 post_scoring_item_count = scored_items.size(); | |
504 | |
505 if (was_trimmed) { | |
506 search_term_cache_.clear(); // Invalidate the term cache. | |
507 } else { | |
508 // Remove any stale SearchTermCacheItems. | |
509 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
510 cache_iter != search_term_cache_.end(); ) { | |
511 if (!cache_iter->second.used_) | |
512 search_term_cache_.erase(cache_iter++); | |
513 else | |
514 ++cache_iter; | |
450 } | 515 } |
451 } | 516 } |
452 | 517 |
453 // Remove any stale SearchTermCacheItems. | |
454 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
455 cache_iter != search_term_cache_.end(); ) { | |
456 if (!cache_iter->second.used_) | |
457 search_term_cache_.erase(cache_iter++); | |
458 else | |
459 ++cache_iter; | |
460 } | |
461 | |
462 return scored_items; | 518 return scored_items; |
463 } | 519 } |
464 | 520 |
465 void InMemoryURLIndex::ResetSearchTermCache() { | 521 void InMemoryURLIndex::ResetSearchTermCache() { |
466 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); | 522 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); |
467 iter != search_term_cache_.end(); ++iter) | 523 iter != search_term_cache_.end(); ++iter) |
468 iter->second.used_ = false; | 524 iter->second.used_ = false; |
469 } | 525 } |
470 | 526 |
471 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( | 527 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( |
472 const string16& uni_string) { | 528 const String16Vector& unsorted_words) { |
473 // Break the terms down into individual terms (words), get the candidate | 529 // Break the terms down into individual terms (words), get the candidate |
474 // set for each term, and intersect each to get a final candidate list. | 530 // set for each term, and intersect each to get a final candidate list. |
475 // Note that a single 'term' from the user's perspective might be | 531 // Note that a single 'term' from the user's perspective might be |
476 // a string like "http://www.somewebsite.com" which, from our perspective, | 532 // a string like "http://www.somewebsite.com" which, from our perspective, |
477 // is four words: 'http', 'www', 'somewebsite', and 'com'. | 533 // is four words: 'http', 'www', 'somewebsite', and 'com'. |
478 HistoryIDSet history_id_set; | 534 HistoryIDSet history_id_set; |
479 String16Vector terms = String16VectorFromString16(uni_string, true); | 535 String16Vector words(unsorted_words); |
480 // Sort the terms into the longest first as such are likely to narrow down | 536 // Sort the terms into the longest first as such are likely to narrow down |
481 // the results quicker. Also, single character terms are the most expensive | 537 // the results quicker. Also, single character terms are the most expensive |
482 // to process so save them for last. | 538 // to process so save them for last. |
483 std::sort(terms.begin(), terms.end(), LengthGreater); | 539 std::sort(words.begin(), words.end(), LengthGreater); |
484 for (String16Vector::iterator iter = terms.begin(); iter != terms.end(); | 540 for (String16Vector::iterator iter = words.begin(); iter != words.end(); |
485 ++iter) { | 541 ++iter) { |
486 string16 uni_word = *iter; | 542 string16 uni_word = *iter; |
487 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); | 543 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); |
488 if (term_history_set.empty()) { | 544 if (term_history_set.empty()) { |
489 history_id_set.clear(); | 545 history_id_set.clear(); |
490 break; | 546 break; |
491 } | 547 } |
492 if (iter == terms.begin()) { | 548 if (iter == words.begin()) { |
493 history_id_set.swap(term_history_set); | 549 history_id_set.swap(term_history_set); |
494 } else { | 550 } else { |
495 HistoryIDSet new_history_id_set; | 551 HistoryIDSet new_history_id_set; |
496 std::set_intersection(history_id_set.begin(), history_id_set.end(), | 552 std::set_intersection(history_id_set.begin(), history_id_set.end(), |
497 term_history_set.begin(), term_history_set.end(), | 553 term_history_set.begin(), term_history_set.end(), |
498 std::inserter(new_history_id_set, | 554 std::inserter(new_history_id_set, |
499 new_history_id_set.begin())); | 555 new_history_id_set.begin())); |
500 history_id_set.swap(new_history_id_set); | 556 history_id_set.swap(new_history_id_set); |
501 } | 557 } |
502 } | 558 } |
(...skipping 311 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
814 term_length_total * kCompleteMaxValue / max_significant_length; | 870 term_length_total * kCompleteMaxValue / max_significant_length; |
815 | 871 |
816 int raw_score = order_value + start_value + complete_value; | 872 int raw_score = order_value + start_value + complete_value; |
817 const int kTermScoreLevel[] = { 1000, 650, 500, 200 }; | 873 const int kTermScoreLevel[] = { 1000, 650, 500, 200 }; |
818 | 874 |
819 // Scale the sum of the three components above into a single score component | 875 // Scale the sum of the three components above into a single score component |
820 // on the same scale as that used in ScoredMatchForURL(). | 876 // on the same scale as that used in ScoredMatchForURL(). |
821 return ScoreForValue(raw_score, kTermScoreLevel); | 877 return ScoreForValue(raw_score, kTermScoreLevel); |
822 } | 878 } |
823 | 879 |
880 // InMemoryURLIndex::AddHistoryMatch ------------------------------------------- | |
881 | |
824 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( | 882 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( |
825 const InMemoryURLIndex& index, | 883 const InMemoryURLIndex& index, |
826 const String16Vector& lower_terms) | 884 const String16Vector& lower_terms) |
827 : index_(index), | 885 : index_(index), |
828 lower_terms_(lower_terms) {} | 886 lower_terms_(lower_terms) {} |
829 | 887 |
830 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} | 888 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} |
831 | 889 |
832 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) { | 890 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) { |
833 HistoryInfoMap::const_iterator hist_pos = | 891 HistoryInfoMap::const_iterator hist_pos = |
(...skipping 13 matching lines...) Expand all Loading... | |
847 if (history_dir_.empty()) | 905 if (history_dir_.empty()) |
848 return false; | 906 return false; |
849 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache")); | 907 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache")); |
850 return true; | 908 return true; |
851 } | 909 } |
852 | 910 |
853 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const { | 911 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const { |
854 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); | 912 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); |
855 } | 913 } |
856 | 914 |
915 // Cache Management ------------------------------------------------------------ | |
916 | |
857 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const { | 917 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const { |
858 DCHECK(cache); | 918 DCHECK(cache); |
859 cache->set_timestamp(base::Time::Now().ToInternalValue()); | 919 cache->set_timestamp(base::Time::Now().ToInternalValue()); |
860 // history_item_count_ is no longer used but rather than change the protobuf | 920 // history_item_count_ is no longer used but rather than change the protobuf |
861 // definition use a placeholder. This will go away with the switch to SQLite. | 921 // definition use a placeholder. This will go away with the switch to SQLite. |
862 cache->set_history_item_count(0); | 922 cache->set_history_item_count(0); |
863 SaveWordList(cache); | 923 SaveWordList(cache); |
864 SaveWordMap(cache); | 924 SaveWordMap(cache); |
865 SaveCharWordMap(cache); | 925 SaveCharWordMap(cache); |
866 SaveWordIDHistoryMap(cache); | 926 SaveWordIDHistoryMap(cache); |
(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1069 if (iter->has_title()) { | 1129 if (iter->has_title()) { |
1070 string16 title(UTF8ToUTF16(iter->title())); | 1130 string16 title(UTF8ToUTF16(iter->title())); |
1071 url_row.set_title(title); | 1131 url_row.set_title(title); |
1072 } | 1132 } |
1073 private_data_->history_info_map_[history_id] = url_row; | 1133 private_data_->history_info_map_[history_id] = url_row; |
1074 } | 1134 } |
1075 return true; | 1135 return true; |
1076 } | 1136 } |
1077 | 1137 |
1078 } // namespace history | 1138 } // namespace history |
OLD | NEW |