Chromium Code Reviews| Index: chrome/browser/history/in_memory_url_index.cc |
| =================================================================== |
| --- chrome/browser/history/in_memory_url_index.cc (revision 110946) |
| +++ chrome/browser/history/in_memory_url_index.cc (working copy) |
| @@ -400,63 +400,119 @@ |
| search_term_cache_.clear(); |
| } |
| -// Searching |
| +// InMemoryURLIndex::AddHistoryMatch ------------------------------------------- |
| +InMemoryURLIndex::HistoryItemFactorGreater::HistoryItemFactorGreater( |
| + const HistoryInfoMap& history_info_map) |
| + : history_info_map_(history_info_map) { |
| +} |
| + |
| +InMemoryURLIndex::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} |
| + |
| +bool InMemoryURLIndex::HistoryItemFactorGreater::operator()( |
| + const HistoryID h1, |
| + const HistoryID h2) { |
| + const URLRow& r1(history_info_map_.find(h1)->second); |
| + const URLRow& r2(history_info_map_.find(h2)->second); |
| + // First cut: typed count, visit count, recency. |
| + // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
| + // recently visited (within the last 12/24 hours) as highly important. Get |
| + // input from mpearson. |
| + if (r1.typed_count() != r2.typed_count()) |
| + return (r1.typed_count() > r2.typed_count()); |
| + if (r1.visit_count() != r2.visit_count()) |
| + return (r1.visit_count() > r2.visit_count()); |
| + return (r1.last_visit() > r2.last_visit()); |
| +} |
| + |
| +// Searching ------------------------------------------------------------------- |
| + |
| ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( |
| - const String16Vector& terms) { |
| + const string16& term_string) { |
| + pre_filter_item_count = 0; |
| + post_filter_item_count = 0; |
| + post_scoring_item_count = 0; |
| + string16 clean_string = net::UnescapeURLComponent(term_string, |
| + net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); |
| + string16 lower_string(base::i18n::ToLower(clean_string)); |
| + history::String16Vector words( |
| + history::String16VectorFromString16(lower_string, false)); |
| + |
| ScoredHistoryMatches scored_items; |
| // Do nothing if we have indexed no words (probably because we've not been |
| - // initialized yet). |
| - if (private_data_->word_list_.empty()) |
| + // initialized yet) or the search string has no words. |
| + if (private_data_->word_list_.empty() || words.empty()) { |
| + search_term_cache_.clear(); // Invalidate the term cache. |
| return scored_items; |
| + } |
| - if (!terms.empty()) { |
| - // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep |
| - // approach. |
| - ResetSearchTermCache(); |
| + // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep |
| + // approach. |
| + ResetSearchTermCache(); |
| - // Lowercase the terms. |
| - // TODO(mrossetti): Another opportunity for a transform algorithm. |
| - String16Vector lower_terms; |
| - for (String16Vector::const_iterator term_iter = terms.begin(); |
| - term_iter != terms.end(); ++term_iter) |
| - lower_terms.push_back(base::i18n::ToLower(*term_iter)); |
| + HistoryIDSet history_id_set = HistoryIDSetFromWords(words); |
|
GeorgeY
2011/11/21 22:52:03
I wonder how fast it would work on lowest common d
mrossetti
2011/11/22 23:29:12
This should be very fast as the dataset underlying
|
| - string16 all_terms(JoinString(lower_terms, ' ')); |
| - HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); |
| + // Trim the candidate pool if it is large. Note that we do not filter out |
| + // items that do not contain the search terms as proper substrings -- doing |
| + // so is the performance-costly operation we are trying to avoid in order |
| + // to maintain omnibox responsiveness. |
| + const size_t kItemsToScoreLimit = 500; |
| + pre_filter_item_count = history_id_set.size(); |
| + // If we trim the results set we do not want to cache the results for next |
| + // time as the user's ultimately desired result could easily be eliminated |
| + // in this early rough filter. |
| + bool was_trimmed = (pre_filter_item_count > kItemsToScoreLimit); |
| + if (was_trimmed) { |
| + HistoryIDVector history_ids; |
| + std::copy(history_id_set.begin(), history_id_set.end(), |
| + std::back_inserter(history_ids)); |
| + // Trim down the set by sorting by typed-count, visit-count, and last |
| + // visit. |
| + HistoryItemFactorGreater |
| + item_factor_functor(private_data_->history_info_map_); |
| + std::partial_sort(history_ids.begin(), |
| + history_ids.begin() + kItemsToScoreLimit, |
| + history_ids.end(), |
| + item_factor_functor); |
| + history_id_set.clear(); |
| + std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, |
| + std::inserter(history_id_set, history_id_set.end())); |
| + post_filter_item_count = history_id_set.size(); |
| + } |
| - // Don't perform any scoring (and don't return any matches) if the |
| - // candidate pool is large. (See comments in header.) |
| - const size_t kItemsToScoreLimit = 500; |
| - if (history_id_set.size() <= kItemsToScoreLimit) { |
| - // Pass over all of the candidates filtering out any without a proper |
| - // substring match, inserting those which pass in order by score. |
| - scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), |
| - AddHistoryMatch(*this, lower_terms)).ScoredMatches(); |
| + // Pass over all of the candidates filtering out any without a proper |
| + // substring match, inserting those which pass in order by score. |
| + history::String16Vector terms; |
| + Tokenize(lower_string, kWhitespaceUTF16, &terms); |
| + scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), |
| + AddHistoryMatch(*this, terms)).ScoredMatches(); |
| - // Select and sort only the top kMaxMatches results. |
| - if (scored_items.size() > AutocompleteProvider::kMaxMatches) { |
| - std::partial_sort(scored_items.begin(), |
| - scored_items.begin() + |
| - AutocompleteProvider::kMaxMatches, |
| - scored_items.end(), |
| - ScoredHistoryMatch::MatchScoreGreater); |
| - scored_items.resize(AutocompleteProvider::kMaxMatches); |
| - } else { |
| - std::sort(scored_items.begin(), scored_items.end(), |
| - ScoredHistoryMatch::MatchScoreGreater); |
| - } |
| - } |
| + // Select and sort only the top kMaxMatches results. |
| + if (scored_items.size() > AutocompleteProvider::kMaxMatches) { |
| + std::partial_sort(scored_items.begin(), |
| + scored_items.begin() + |
| + AutocompleteProvider::kMaxMatches, |
| + scored_items.end(), |
| + ScoredHistoryMatch::MatchScoreGreater); |
| + scored_items.resize(AutocompleteProvider::kMaxMatches); |
| + } else { |
| + std::sort(scored_items.begin(), scored_items.end(), |
| + ScoredHistoryMatch::MatchScoreGreater); |
| } |
| + post_scoring_item_count = scored_items.size(); |
| - // Remove any stale SearchTermCacheItems. |
| - for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); |
| - cache_iter != search_term_cache_.end(); ) { |
| - if (!cache_iter->second.used_) |
| - search_term_cache_.erase(cache_iter++); |
| - else |
| - ++cache_iter; |
| + if (was_trimmed) { |
| + search_term_cache_.clear(); // Invalidate the term cache. |
| + } else { |
| + // Remove any stale SearchTermCacheItems. |
| + for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); |
| + cache_iter != search_term_cache_.end(); ) { |
| + if (!cache_iter->second.used_) |
| + search_term_cache_.erase(cache_iter++); |
| + else |
| + ++cache_iter; |
| + } |
| } |
| return scored_items; |
| @@ -469,19 +525,19 @@ |
| } |
| HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( |
| - const string16& uni_string) { |
| + const String16Vector& unsorted_words) { |
| // Break the terms down into individual terms (words), get the candidate |
| // set for each term, and intersect each to get a final candidate list. |
| // Note that a single 'term' from the user's perspective might be |
| // a string like "http://www.somewebsite.com" which, from our perspective, |
| // is four words: 'http', 'www', 'somewebsite', and 'com'. |
| HistoryIDSet history_id_set; |
| - String16Vector terms = String16VectorFromString16(uni_string, true); |
| + String16Vector words(unsorted_words); |
| // Sort the terms into the longest first as such are likely to narrow down |
| // the results quicker. Also, single character terms are the most expensive |
| // to process so save them for last. |
| - std::sort(terms.begin(), terms.end(), LengthGreater); |
| - for (String16Vector::iterator iter = terms.begin(); iter != terms.end(); |
| + std::sort(words.begin(), words.end(), LengthGreater); |
| + for (String16Vector::iterator iter = words.begin(); iter != words.end(); |
| ++iter) { |
| string16 uni_word = *iter; |
| HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); |
| @@ -489,7 +545,7 @@ |
| history_id_set.clear(); |
| break; |
| } |
| - if (iter == terms.begin()) { |
| + if (iter == words.begin()) { |
| history_id_set.swap(term_history_set); |
| } else { |
| HistoryIDSet new_history_id_set; |
| @@ -821,6 +877,8 @@ |
| return ScoreForValue(raw_score, kTermScoreLevel); |
| } |
| +// InMemoryURLIndex::AddHistoryMatch ------------------------------------------- |
| + |
| InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( |
| const InMemoryURLIndex& index, |
| const String16Vector& lower_terms) |
| @@ -854,6 +912,8 @@ |
| return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); |
| } |
| +// Cache Management ------------------------------------------------------------ |
| + |
| void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const { |
| DCHECK(cache); |
| cache->set_timestamp(base::Time::Now().ToInternalValue()); |