Index: chrome/browser/history/in_memory_url_index.cc
===================================================================
--- chrome/browser/history/in_memory_url_index.cc	(revision 110116)
+++ chrome/browser/history/in_memory_url_index.cc	(working copy)
@@ -400,63 +400,120 @@
   search_term_cache_.clear();
 }
-// Searching
+// InMemoryURLIndex::HistoryItemFactorGreater ---------------------------------
+InMemoryURLIndex::HistoryItemFactorGreater::HistoryItemFactorGreater(
+    const HistoryInfoMap& history_info_map)
+  : history_info_map_(history_info_map) {}
Peter Kasting
2011/11/21 20:31:02
Nit: 4 spaces before colon. I generally prefer {}
mrossetti
2011/11/21 21:38:25
Done.
+
+InMemoryURLIndex::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
+
+bool InMemoryURLIndex::HistoryItemFactorGreater::operator()(
+    const HistoryID h1,
+    const HistoryID h2) {
+  const URLRow& r1(history_info_map_.find(h1)->second);
+  const URLRow& r2(history_info_map_.find(h2)->second);
+  // First cut: typed count, visit count, recency.
+  // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
+  // recently visited (within the last 12/24 hours) as highly important. Get
+  // input from mpearson.
+  if (r1.typed_count() != r2.typed_count())
+    return (r1.typed_count() > r2.typed_count());
+  if (r1.visit_count() != r2.visit_count())
+    return (r1.visit_count() > r2.visit_count());
+  return (r1.last_visit() > r2.last_visit());
+}
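[Editorial note] The TODO above gestures at weighting very recent visits more heavily. Purely as an illustration of that idea (not part of this CL; the 24-hour window and the function name are invented here), such a comparison might first bucket rows by whether they were visited within the last day before falling back to the existing factors:

  // Hypothetical sketch only: prefer rows visited within the last 24 hours,
  // then fall back to typed count / visit count / last visit as the patch does.
  bool RecencyAwareFactorGreater(const history::URLRow& r1,
                                 const history::URLRow& r2) {
    const base::Time cutoff =
        base::Time::Now() - base::TimeDelta::FromHours(24);
    const bool r1_recent = r1.last_visit() > cutoff;
    const bool r2_recent = r2.last_visit() > cutoff;
    if (r1_recent != r2_recent)
      return r1_recent;  // Recently visited rows sort first.
    if (r1.typed_count() != r2.typed_count())
      return r1.typed_count() > r2.typed_count();
    if (r1.visit_count() != r2.visit_count())
      return r1.visit_count() > r2.visit_count();
    return r1.last_visit() > r2.last_visit();
  }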
+
+// Searching -------------------------------------------------------------------
+
 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(
-    const String16Vector& terms) {
+    const string16& term_string) {
+  pre_filter_item_count = 0;
+  post_filter_item_count = 0;
+  post_scoring_item_count = 0;
+  string16 clean_string = net::UnescapeURLComponent(term_string,
+      net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
+  string16 lower_string(base::i18n::ToLower(clean_string));
+  history::String16Vector words(
+      history::String16VectorFromString16(lower_string, false));
+
   ScoredHistoryMatches scored_items;
   // Do nothing if we have indexed no words (probably because we've not been
   // initialized yet).
-  if (private_data_->word_list_.empty())
+  if (private_data_->word_list_.empty() || words.empty()) {
+    search_term_cache_.clear();  // Invalidate the term cache.
Peter Kasting
2011/11/21 20:31:02
Nit: Why is this line necessary? Wouldn't the cac
mrossetti
2011/11/21 21:38:25
I updated the comment since it did not mention tha
     return scored_items;
+  }
-  if (!terms.empty()) {
-    // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
-    // approach.
-    ResetSearchTermCache();
+  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
+  // approach.
+  ResetSearchTermCache();
-    // Lowercase the terms.
-    // TODO(mrossetti): Another opportunity for a transform algorithm.
-    String16Vector lower_terms;
-    for (String16Vector::const_iterator term_iter = terms.begin();
-         term_iter != terms.end(); ++term_iter)
-      lower_terms.push_back(base::i18n::ToLower(*term_iter));
+  HistoryIDSet history_id_set = HistoryIDSetFromWords(words);
-    string16 all_terms(JoinString(lower_terms, ' '));
-    HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
+  // Trim the candidate pool if it is large. Note that we do not filter out
+  // items that do not contain the search terms as proper substrings -- doing
+  // so is the performance-costly operation we are trying to avoid in order
+  // to maintain omnibox responsiveness.
+  const size_t kItemsToScoreLimit = 500;
+  pre_filter_item_count = history_id_set.size();
+  // If we trim the results set we do not want to cache the results for next
+  // time as the user's ultimately desired result could easily be eliminated
+  // in this early rough filter.
+  bool was_trimmed = false;
Peter Kasting
2011/11/21 20:31:02
Nit: Even shorter:
bool was_trimmed = (pre_filt
mrossetti
2011/11/21 21:38:25
Done.
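[Editorial note] The nit above is cut off by the review tool; from the surrounding code it presumably proposes folding the flag into its initializer, roughly as follows (an inference, not recovered review text):

  bool was_trimmed = (pre_filter_item_count > kItemsToScoreLimit);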
+  if (pre_filter_item_count > kItemsToScoreLimit) {
+    was_trimmed = true;
+    HistoryIDVector history_ids;
+    std::copy(history_id_set.begin(), history_id_set.end(),
+              std::back_inserter(history_ids));
+    // Trim down the set by sorting by typed-count, visit-count, and last
+    // visit.
+    HistoryItemFactorGreater
+        item_factor_functor(private_data_->history_info_map_);
+    std::partial_sort(history_ids.begin(),
+                      history_ids.begin() + kItemsToScoreLimit,
+                      history_ids.end(),
+                      item_factor_functor);
+    history_ids.resize(kItemsToScoreLimit);
Peter Kasting
2011/11/21 20:31:02
Nit: You can eliminate this line and simply use "h
mrossetti
2011/11/21 21:38:25
Delicious!
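[Editorial note] This nit is also truncated; it most likely suggests dropping the resize() and copying only the first kItemsToScoreLimit elements by using "history_ids.begin() + kItemsToScoreLimit" as the copy's end iterator, along the lines of (again an inference, not recovered text):

  history_id_set.clear();
  std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
            std::inserter(history_id_set, history_id_set.end()));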
+    history_id_set.clear();
+    std::copy(history_ids.begin(), history_ids.end(),
+              std::inserter(history_id_set, history_id_set.end()));
+    post_filter_item_count = history_id_set.size();
+  }
-    // Don't perform any scoring (and don't return any matches) if the
-    // candidate pool is large. (See comments in header.)
-    const size_t kItemsToScoreLimit = 500;
-    if (history_id_set.size() <= kItemsToScoreLimit) {
-      // Pass over all of the candidates filtering out any without a proper
-      // substring match, inserting those which pass in order by score.
-      scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
-          AddHistoryMatch(*this, lower_terms)).ScoredMatches();
+  // Pass over all of the candidates filtering out any without a proper
+  // substring match, inserting those which pass in order by score.
+  history::String16Vector terms;
+  Tokenize(lower_string, kWhitespaceUTF16, &terms);
+  scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
+      AddHistoryMatch(*this, terms)).ScoredMatches();
-      // Select and sort only the top kMaxMatches results.
-      if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
-        std::partial_sort(scored_items.begin(),
-                          scored_items.begin() +
-                              AutocompleteProvider::kMaxMatches,
-                          scored_items.end(),
-                          ScoredHistoryMatch::MatchScoreGreater);
-        scored_items.resize(AutocompleteProvider::kMaxMatches);
-      } else {
-        std::sort(scored_items.begin(), scored_items.end(),
-                  ScoredHistoryMatch::MatchScoreGreater);
-      }
-    }
+  // Select and sort only the top kMaxMatches results.
+  if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
+    std::partial_sort(scored_items.begin(),
+                      scored_items.begin() +
+                          AutocompleteProvider::kMaxMatches,
+                      scored_items.end(),
+                      ScoredHistoryMatch::MatchScoreGreater);
+    scored_items.resize(AutocompleteProvider::kMaxMatches);
+  } else {
+    std::sort(scored_items.begin(), scored_items.end(),
+              ScoredHistoryMatch::MatchScoreGreater);
   }
+  post_scoring_item_count = scored_items.size();
-  // Remove any stale SearchTermCacheItems.
-  for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
-       cache_iter != search_term_cache_.end(); ) {
-    if (!cache_iter->second.used_)
-      search_term_cache_.erase(cache_iter++);
-    else
-      ++cache_iter;
+  if (was_trimmed) {
+    search_term_cache_.clear();  // Invalidate the term cache.
+  } else {
+    // Remove any stale SearchTermCacheItems.
+    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
+         cache_iter != search_term_cache_.end(); ) {
+      if (!cache_iter->second.used_)
+        search_term_cache_.erase(cache_iter++);
+      else
+        ++cache_iter;
+    }
   }
   return scored_items;
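[Editorial note] With the new signature, callers hand over the raw omnibox text instead of a pre-tokenized term vector; unescaping, lowercasing, and word-breaking now happen inside HistoryItemsForTerms(). A hypothetical call site (the object and variable names are illustrative, not taken from this CL) would look roughly like:

  // Illustrative only: the caller forwards the user's text untouched and the
  // index normalizes and tokenizes it internally.
  ScoredHistoryMatches matches =
      index->HistoryItemsForTerms(autocomplete_input.text());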
@@ -469,19 +526,19 @@
 }
 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(
-    const string16& uni_string) {
+    const String16Vector& unsorted_words) {
   // Break the terms down into individual terms (words), get the candidate
   // set for each term, and intersect each to get a final candidate list.
   // Note that a single 'term' from the user's perspective might be
   // a string like "http://www.somewebsite.com" which, from our perspective,
   // is four words: 'http', 'www', 'somewebsite', and 'com'.
   HistoryIDSet history_id_set;
-  String16Vector terms = String16VectorFromString16(uni_string, true);
+  String16Vector words(unsorted_words);
   // Sort the terms into the longest first as such are likely to narrow down
   // the results quicker. Also, single character terms are the most expensive
   // to process so save them for last.
-  std::sort(terms.begin(), terms.end(), LengthGreater);
-  for (String16Vector::iterator iter = terms.begin(); iter != terms.end();
+  std::sort(words.begin(), words.end(), LengthGreater);
+  for (String16Vector::iterator iter = words.begin(); iter != words.end();
        ++iter) {
     string16 uni_word = *iter;
     HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
@@ -489,7 +546,7 @@
       history_id_set.clear();
       break;
     }
-    if (iter == terms.begin()) {
+    if (iter == words.begin()) {
       history_id_set.swap(term_history_set);
     } else {
       HistoryIDSet new_history_id_set;
@@ -821,6 +878,8 @@
   return ScoreForValue(raw_score, kTermScoreLevel);
 }
+// InMemoryURLIndex::AddHistoryMatch ------------------------------------------
+
 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
     const InMemoryURLIndex& index,
     const String16Vector& lower_terms)
@@ -854,6 +913,8 @@
   return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();
 }
+// Cache Management ------------------------------------------------------------
+
 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {
   DCHECK(cache);
   cache->set_timestamp(base::Time::Now().ToInternalValue());