chrome/browser/history/in_memory_url_index.cc - Issue 8526010: Improve Autocomplete Matches and Handling of Large Results Sets

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 8526010: Improve Autocomplete Matches and Handling of Large Results Sets (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: '' Created 9 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« chrome/browser/history/in_memory_url_index.h ('K') | « chrome/browser/history/in_memory_url_index.h ('k') | chrome/browser/history/in_memory_url_index_types.h » ('j') | chrome/browser/history/in_memory_url_index_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/history/in_memory_url_index.h"	5 #include "chrome/browser/history/in_memory_url_index.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <functional>	8 #include <functional>

9 #include <iterator>	9 #include <iterator>

10 #include <limits>	10 #include <limits>

(...skipping 382 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
393 // Note that this does not remove any reference to this row from the	393 // Note that this does not remove any reference to this row from the

394 // word_id_history_map_. That map will continue to contain (and return)	394 // word_id_history_map_. That map will continue to contain (and return)

395 // hits against this row until that map is rebuilt, but since the	395 // hits against this row until that map is rebuilt, but since the

396 // history_info_map_ no longer references the row no erroneous results	396 // history_info_map_ no longer references the row no erroneous results

397 // will propagate to the user.	397 // will propagate to the user.

398 private_data_->history_info_map_.erase(row_id);	398 private_data_->history_info_map_.erase(row_id);

399 // This invalidates the word cache.	399 // This invalidates the word cache.

400 search_term_cache_.clear();	400 search_term_cache_.clear();

401 }	401 }

402	402

403 // Searching	403 // InMemoryURLIndex::AddHistoryMatch -------------------------------------------

	404

	405 InMemoryURLIndex::HistoryItemFactorGreater::HistoryItemFactorGreater(

	406 const HistoryInfoMap& history_info_map)

	407 : history_info_map_(history_info_map) {}
	Peter Kasting 2011/11/21 20:31:02 Nit: 4 spaces before colon. I generally prefer {} Nit: 4 spaces before colon. I generally prefer {} on separate lines unless the entire definition is all on one line. mrossetti 2011/11/21 21:38:25 Done. Show quoted text On 2011/11/21 20:31:02, Peter Kasting wrote: > Nit: 4 spaces before colon. I generally prefer {} on separate lines unless the > entire definition is all on one line. Done.
	408

	409 InMemoryURLIndex::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}

	410

	411 bool InMemoryURLIndex::HistoryItemFactorGreater::operator()(

	412 const HistoryID h1,

	413 const HistoryID h2) {

	414 const URLRow& r1(history_info_map_.find(h1)->second);

	415 const URLRow& r2(history_info_map_.find(h2)->second);

	416 // First cut: typed count, visit count, recency.

	417 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

	418 // recently visited (within the last 12/24 hours) as highly important. Get

	419 // input from mpearson.

	420 if (r1.typed_count() != r2.typed_count())

	421 return (r1.typed_count() > r2.typed_count());

	422 if (r1.visit_count() != r2.visit_count())

	423 return (r1.visit_count() > r2.visit_count());

	424 return (r1.last_visit() > r2.last_visit());

	425 }

	426

	427 // Searching -------------------------------------------------------------------

404	428

405 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(	429 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(

406 const String16Vector& terms) {	430 const string16& term_string) {

	431 pre_filter_item_count = 0;

	432 post_filter_item_count = 0;

	433 post_scoring_item_count = 0;

	434 string16 clean_string = net::UnescapeURLComponent(term_string,

	435 net::UnescapeRule::SPACES \| net::UnescapeRule::URL_SPECIAL_CHARS);

	436 string16 lower_string(base::i18n::ToLower(clean_string));

	437 history::String16Vector words(

	438 history::String16VectorFromString16(lower_string, false));

	439

407 ScoredHistoryMatches scored_items;	440 ScoredHistoryMatches scored_items;

408	441

409 // Do nothing if we have indexed no words (probably because we've not been	442 // Do nothing if we have indexed no words (probably because we've not been

410 // initialized yet).	443 // initialized yet).

411 if (private_data_->word_list_.empty())	444 if (private_data_->word_list_.empty() \|\| words.empty()) {

	445 search_term_cache_.clear(); // Invalidate the term cache.
	Peter Kasting 2011/11/21 20:31:02 Nit: Why is this line necessary? Wouldn't the cac Nit: Why is this line necessary? Wouldn't the cache already be empty? mrossetti 2011/11/21 21:38:25 I updated the comment since it did not mention tha Show quoted text On 2011/11/21 20:31:02, Peter Kasting wrote: > Nit: Why is this line necessary? Wouldn't the cache already be empty? I updated the comment since it did not mention that the input search string might not have any words.
412 return scored_items;	446 return scored_items;

	447 }

413	448

414 if (!terms.empty()) {	449 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

415 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep	450 // approach.

416 // approach.	451 ResetSearchTermCache();

417 ResetSearchTermCache();

418	452

419 // Lowercase the terms.	453 HistoryIDSet history_id_set = HistoryIDSetFromWords(words);

420 // TODO(mrossetti): Another opportunity for a transform algorithm.

421 String16Vector lower_terms;

422 for (String16Vector::const_iterator term_iter = terms.begin();

423 term_iter != terms.end(); ++term_iter)

424 lower_terms.push_back(base::i18n::ToLower(*term_iter));

425	454

426 string16 all_terms(JoinString(lower_terms, ' '));	455 // Trim the candidate pool if it is large. Note that we do not filter out

427 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);	456 // items that do not contain the search terms as proper substrings -- doing

	457 // so is the performance-costly operation we are trying to avoid in order

	458 // to maintain omnibox responsiveness.

	459 const size_t kItemsToScoreLimit = 500;

	460 pre_filter_item_count = history_id_set.size();

	461 // If we trim the results set we do not want to cache the results for next

	462 // time as the user's ultimately desired result could easily be eliminated

	463 // in this early rough filter.

	464 bool was_trimmed = false;
	Peter Kasting 2011/11/21 20:31:02 Nit: Even shorter: bool was_trimmed = (pre_filt Nit: Even shorter: bool was_trimmed = (pre_filter_item_count > kItemsToScoreLimit); if (was_trimmed) { ... mrossetti 2011/11/21 21:38:25 Done. Show quoted text On 2011/11/21 20:31:02, Peter Kasting wrote: > Nit: Even shorter: > > bool was_trimmed = (pre_filter_item_count > kItemsToScoreLimit); > if (was_trimmed) { ... Done.
	465 if (pre_filter_item_count > kItemsToScoreLimit) {

	466 was_trimmed = true;

	467 HistoryIDVector history_ids;

	468 std::copy(history_id_set.begin(), history_id_set.end(),

	469 std::back_inserter(history_ids));

	470 // Trim down the set by sorting by typed-count, visit-count, and last

	471 // visit.

	472 HistoryItemFactorGreater

	473 item_factor_functor(private_data_->history_info_map_);

	474 std::partial_sort(history_ids.begin(),

	475 history_ids.begin() + kItemsToScoreLimit,

	476 history_ids.end(),

	477 item_factor_functor);

	478 history_ids.resize(kItemsToScoreLimit);
	Peter Kasting 2011/11/21 20:31:02 Nit: You can eliminate this line and simply use "h Nit: You can eliminate this line and simply use "history_ids.begin() + kItemsToScoreLimit" in place of history_ids.end() in the copy() call below. mrossetti 2011/11/21 21:38:25 Delicious! Show quoted text On 2011/11/21 20:31:02, Peter Kasting wrote: > Nit: You can eliminate this line and simply use "history_ids.begin() + > kItemsToScoreLimit" in place of history_ids.end() in the copy() call below. Delicious!
	479 history_id_set.clear();

	480 std::copy(history_ids.begin(), history_ids.end(),

	481 std::inserter(history_id_set, history_id_set.end()));

	482 post_filter_item_count = history_id_set.size();

	483 }

428	484

429 // Don't perform any scoring (and don't return any matches) if the	485 // Pass over all of the candidates filtering out any without a proper

430 // candidate pool is large. (See comments in header.)	486 // substring match, inserting those which pass in order by score.

431 const size_t kItemsToScoreLimit = 500;	487 history::String16Vector terms;

432 if (history_id_set.size() <= kItemsToScoreLimit) {	488 Tokenize(lower_string, kWhitespaceUTF16, &terms);

433 // Pass over all of the candidates filtering out any without a proper	489 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),

434 // substring match, inserting those which pass in order by score.	490 AddHistoryMatch(*this, terms)).ScoredMatches();

435 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),

436 AddHistoryMatch(*this, lower_terms)).ScoredMatches();

437	491

438 // Select and sort only the top kMaxMatches results.	492 // Select and sort only the top kMaxMatches results.

439 if (scored_items.size() > AutocompleteProvider::kMaxMatches) {	493 if (scored_items.size() > AutocompleteProvider::kMaxMatches) {

440 std::partial_sort(scored_items.begin(),	494 std::partial_sort(scored_items.begin(),

441 scored_items.begin() +	495 scored_items.begin() +

442 AutocompleteProvider::kMaxMatches,	496 AutocompleteProvider::kMaxMatches,

443 scored_items.end(),	497 scored_items.end(),

444 ScoredHistoryMatch::MatchScoreGreater);	498 ScoredHistoryMatch::MatchScoreGreater);

445 scored_items.resize(AutocompleteProvider::kMaxMatches);	499 scored_items.resize(AutocompleteProvider::kMaxMatches);

446 } else {	500 } else {

447 std::sort(scored_items.begin(), scored_items.end(),	501 std::sort(scored_items.begin(), scored_items.end(),

448 ScoredHistoryMatch::MatchScoreGreater);	502 ScoredHistoryMatch::MatchScoreGreater);

449 }	503 }

	504 post_scoring_item_count = scored_items.size();

	505

	506 if (was_trimmed) {

	507 search_term_cache_.clear(); // Invalidate the term cache.

	508 } else {

	509 // Remove any stale SearchTermCacheItems.

	510 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

	511 cache_iter != search_term_cache_.end(); ) {

	512 if (!cache_iter->second.used_)

	513 search_term_cache_.erase(cache_iter++);

	514 else

	515 ++cache_iter;

450 }	516 }

451 }	517 }

452	518

453 // Remove any stale SearchTermCacheItems.

454 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

455 cache_iter != search_term_cache_.end(); ) {

456 if (!cache_iter->second.used_)

457 search_term_cache_.erase(cache_iter++);

458 else

459 ++cache_iter;

460 }

461

462 return scored_items;	519 return scored_items;

463 }	520 }

464	521

465 void InMemoryURLIndex::ResetSearchTermCache() {	522 void InMemoryURLIndex::ResetSearchTermCache() {

466 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();	523 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();

467 iter != search_term_cache_.end(); ++iter)	524 iter != search_term_cache_.end(); ++iter)

468 iter->second.used_ = false;	525 iter->second.used_ = false;

469 }	526 }

470	527

471 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(	528 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(

472 const string16& uni_string) {	529 const String16Vector& unsorted_words) {

473 // Break the terms down into individual terms (words), get the candidate	530 // Break the terms down into individual terms (words), get the candidate

474 // set for each term, and intersect each to get a final candidate list.	531 // set for each term, and intersect each to get a final candidate list.

475 // Note that a single 'term' from the user's perspective might be	532 // Note that a single 'term' from the user's perspective might be

476 // a string like "http://www.somewebsite.com" which, from our perspective,	533 // a string like "http://www.somewebsite.com" which, from our perspective,

477 // is four words: 'http', 'www', 'somewebsite', and 'com'.	534 // is four words: 'http', 'www', 'somewebsite', and 'com'.

478 HistoryIDSet history_id_set;	535 HistoryIDSet history_id_set;

479 String16Vector terms = String16VectorFromString16(uni_string, true);	536 String16Vector words(unsorted_words);

480 // Sort the terms into the longest first as such are likely to narrow down	537 // Sort the terms into the longest first as such are likely to narrow down

481 // the results quicker. Also, single character terms are the most expensive	538 // the results quicker. Also, single character terms are the most expensive

482 // to process so save them for last.	539 // to process so save them for last.

483 std::sort(terms.begin(), terms.end(), LengthGreater);	540 std::sort(words.begin(), words.end(), LengthGreater);

484 for (String16Vector::iterator iter = terms.begin(); iter != terms.end();	541 for (String16Vector::iterator iter = words.begin(); iter != words.end();

485 ++iter) {	542 ++iter) {

486 string16 uni_word = *iter;	543 string16 uni_word = *iter;

487 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);	544 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);

488 if (term_history_set.empty()) {	545 if (term_history_set.empty()) {

489 history_id_set.clear();	546 history_id_set.clear();

490 break;	547 break;

491 }	548 }

492 if (iter == terms.begin()) {	549 if (iter == words.begin()) {

493 history_id_set.swap(term_history_set);	550 history_id_set.swap(term_history_set);

494 } else {	551 } else {

495 HistoryIDSet new_history_id_set;	552 HistoryIDSet new_history_id_set;

496 std::set_intersection(history_id_set.begin(), history_id_set.end(),	553 std::set_intersection(history_id_set.begin(), history_id_set.end(),

497 term_history_set.begin(), term_history_set.end(),	554 term_history_set.begin(), term_history_set.end(),

498 std::inserter(new_history_id_set,	555 std::inserter(new_history_id_set,

499 new_history_id_set.begin()));	556 new_history_id_set.begin()));

500 history_id_set.swap(new_history_id_set);	557 history_id_set.swap(new_history_id_set);

501 }	558 }

502 }	559 }

(...skipping 311 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
814 term_length_total * kCompleteMaxValue / max_significant_length;	871 term_length_total * kCompleteMaxValue / max_significant_length;

815	872

816 int raw_score = order_value + start_value + complete_value;	873 int raw_score = order_value + start_value + complete_value;

817 const int kTermScoreLevel[] = { 1000, 650, 500, 200 };	874 const int kTermScoreLevel[] = { 1000, 650, 500, 200 };

818	875

819 // Scale the sum of the three components above into a single score component	876 // Scale the sum of the three components above into a single score component

820 // on the same scale as that used in ScoredMatchForURL().	877 // on the same scale as that used in ScoredMatchForURL().

821 return ScoreForValue(raw_score, kTermScoreLevel);	878 return ScoreForValue(raw_score, kTermScoreLevel);

822 }	879 }

823	880

	881 // InMemoryURLIndex::AddHistoryMatch -------------------------------------------

	882

824 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(	883 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(

825 const InMemoryURLIndex& index,	884 const InMemoryURLIndex& index,

826 const String16Vector& lower_terms)	885 const String16Vector& lower_terms)

827 : index_(index),	886 : index_(index),

828 lower_terms_(lower_terms) {}	887 lower_terms_(lower_terms) {}

829	888

830 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}	889 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}

831	890

832 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) {	891 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) {

833 HistoryInfoMap::const_iterator hist_pos =	892 HistoryInfoMap::const_iterator hist_pos =

(...skipping 13 matching lines...) Expand all Loading...
847 if (history_dir_.empty())	906 if (history_dir_.empty())

848 return false;	907 return false;

849 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));	908 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));

850 return true;	909 return true;

851 }	910 }

852	911

853 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {	912 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {

854 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();	913 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();

855 }	914 }

856	915

	916 // Cache Management ------------------------------------------------------------

	917

857 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {	918 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {

858 DCHECK(cache);	919 DCHECK(cache);

859 cache->set_timestamp(base::Time::Now().ToInternalValue());	920 cache->set_timestamp(base::Time::Now().ToInternalValue());

860 // history_item_count_ is no longer used but rather than change the protobuf	921 // history_item_count_ is no longer used but rather than change the protobuf

861 // definition use a placeholder. This will go away with the switch to SQLite.	922 // definition use a placeholder. This will go away with the switch to SQLite.

862 cache->set_history_item_count(0);	923 cache->set_history_item_count(0);

863 SaveWordList(cache);	924 SaveWordList(cache);

864 SaveWordMap(cache);	925 SaveWordMap(cache);

865 SaveCharWordMap(cache);	926 SaveCharWordMap(cache);

866 SaveWordIDHistoryMap(cache);	927 SaveWordIDHistoryMap(cache);

(...skipping 202 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1069 if (iter->has_title()) {	1130 if (iter->has_title()) {

1070 string16 title(UTF8ToUTF16(iter->title()));	1131 string16 title(UTF8ToUTF16(iter->title()));

1071 url_row.set_title(title);	1132 url_row.set_title(title);

1072 }	1133 }

1073 private_data_->history_info_map_[history_id] = url_row;	1134 private_data_->history_info_map_[history_id] = url_row;

1074 }	1135 }

1075 return true;	1136 return true;

1076 }	1137 }

1077	1138

1078 } // namespace history	1139 } // namespace history

OLD	NEW