chrome/browser/history/in_memory_url_index.cc - Issue 8526010: Improve Autocomplete Matches and Handling of Large Results Sets

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 8526010: Improve Autocomplete Matches and Handling of Large Results Sets (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: '' Created 9 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « chrome/browser/history/in_memory_url_index.h ('k') | chrome/browser/history/in_memory_url_index_types.h » ('j') | chrome/browser/history/in_memory_url_index_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/history/in_memory_url_index.h"	5 #include "chrome/browser/history/in_memory_url_index.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <functional>	8 #include <functional>

9 #include <iterator>	9 #include <iterator>

10 #include <limits>	10 #include <limits>

(...skipping 382 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
393 // Note that this does not remove any reference to this row from the	393 // Note that this does not remove any reference to this row from the

394 // word_id_history_map_. That map will continue to contain (and return)	394 // word_id_history_map_. That map will continue to contain (and return)

395 // hits against this row until that map is rebuilt, but since the	395 // hits against this row until that map is rebuilt, but since the

396 // history_info_map_ no longer references the row no erroneous results	396 // history_info_map_ no longer references the row no erroneous results

397 // will propagate to the user.	397 // will propagate to the user.

398 private_data_->history_info_map_.erase(row_id);	398 private_data_->history_info_map_.erase(row_id);

399 // This invalidates the word cache.	399 // This invalidates the word cache.

400 search_term_cache_.clear();	400 search_term_cache_.clear();

401 }	401 }

402	402

403 // Searching	403 // InMemoryURLIndex::HistoryItemFactorGreater ----------------------------------

	404

	405 InMemoryURLIndex::HistoryItemFactorGreater::HistoryItemFactorGreater(

	406 const HistoryInfoMap& history_info_map)

	407 : history_info_map_(history_info_map) {

	408 }

	409

	410 InMemoryURLIndex::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}

	411

	412 bool InMemoryURLIndex::HistoryItemFactorGreater::operator()(

	413 const HistoryID h1,

	414 const HistoryID h2) {

	415 const URLRow& r1(history_info_map_.find(h1)->second);

	416 const URLRow& r2(history_info_map_.find(h2)->second);

	417 // First cut: typed count, visit count, recency.

	418 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

	419 // recently visited (within the last 12/24 hours) as highly important. Get

	420 // input from mpearson.

	421 if (r1.typed_count() != r2.typed_count())

	422 return (r1.typed_count() > r2.typed_count());

	423 if (r1.visit_count() != r2.visit_count())

	424 return (r1.visit_count() > r2.visit_count());

	425 return (r1.last_visit() > r2.last_visit());

	426 }

	427

	428 // Searching -------------------------------------------------------------------

404	429

405 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(	430 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(

406 const String16Vector& terms) {	431 const string16& term_string) {

	432 pre_filter_item_count = 0;

	433 post_filter_item_count = 0;

	434 post_scoring_item_count = 0;

	435 string16 clean_string = net::UnescapeURLComponent(term_string,

	436 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);

	437 string16 lower_string(base::i18n::ToLower(clean_string));

	438 String16Vector words(

	439 history::String16VectorFromString16(lower_string, false));

407 ScoredHistoryMatches scored_items;	440 ScoredHistoryMatches scored_items;

408	441

409 // Do nothing if we have indexed no words (probably because we've not been	442 // Do nothing if we have indexed no words (probably because we've not been

410 // initialized yet).	443 // initialized yet) or the search string has no words.

411 if (private_data_->word_list_.empty())	444 if (private_data_->word_list_.empty() || words.empty()) {

	445 search_term_cache_.clear(); // Invalidate the term cache.

412 return scored_items;	446 return scored_items;

	447 }

413	448

414 if (!terms.empty()) {	449 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

415 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep	450 // approach.

416 // approach.	451 ResetSearchTermCache();

417 ResetSearchTermCache();

418	452

419 // Lowercase the terms.	453 HistoryIDSet history_id_set = HistoryIDSetFromWords(words);

420 // TODO(mrossetti): Another opportunity for a transform algorithm.

421 String16Vector lower_terms;

422 for (String16Vector::const_iterator term_iter = terms.begin();

423 term_iter != terms.end(); ++term_iter)

424 lower_terms.push_back(base::i18n::ToLower(*term_iter));

425	454

426 string16 all_terms(JoinString(lower_terms, ' '));	455 // Trim the candidate pool if it is large. Note that we do not filter out

427 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);	456 // items that do not contain the search terms as proper substrings -- doing

	457 // so is the performance-costly operation we are trying to avoid in order

	458 // to maintain omnibox responsiveness.

	459 const size_t kItemsToScoreLimit = 500;

	460 pre_filter_item_count = history_id_set.size();

	461 // If we trim the results set we do not want to cache the results for next

	462 // time as the user's ultimately desired result could easily be eliminated

	463 // in this early rough filter.

	464 bool was_trimmed = (pre_filter_item_count > kItemsToScoreLimit);

	465 if (was_trimmed) {

	466 HistoryIDVector history_ids;

	467 std::copy(history_id_set.begin(), history_id_set.end(),

	468 std::back_inserter(history_ids));

	469 // Trim down the set by sorting by typed-count, visit-count, and last

	470 // visit.

	471 HistoryItemFactorGreater

	472 item_factor_functor(private_data_->history_info_map_);

	473 std::partial_sort(history_ids.begin(),

	474 history_ids.begin() + kItemsToScoreLimit,

	475 history_ids.end(),

	476 item_factor_functor);

	477 history_id_set.clear();

	478 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

	479 std::inserter(history_id_set, history_id_set.end()));

	480 post_filter_item_count = history_id_set.size();

	481 }

428	482

429 // Don't perform any scoring (and don't return any matches) if the	483 // Pass over all of the candidates filtering out any without a proper

430 // candidate pool is large. (See comments in header.)	484 // substring match, inserting those which pass in order by score.

431 const size_t kItemsToScoreLimit = 500;	485 history::String16Vector terms;

432 if (history_id_set.size() <= kItemsToScoreLimit) {	486 Tokenize(lower_string, kWhitespaceUTF16, &terms);

433 // Pass over all of the candidates filtering out any without a proper	487 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),

434 // substring match, inserting those which pass in order by score.	488 AddHistoryMatch(*this, terms)).ScoredMatches();

435 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),

436 AddHistoryMatch(*this, lower_terms)).ScoredMatches();

437	489

438 // Select and sort only the top kMaxMatches results.	490 // Select and sort only the top kMaxMatches results.

439 if (scored_items.size() > AutocompleteProvider::kMaxMatches) {	491 if (scored_items.size() > AutocompleteProvider::kMaxMatches) {

440 std::partial_sort(scored_items.begin(),	492 std::partial_sort(scored_items.begin(),

441 scored_items.begin() +	493 scored_items.begin() +

442 AutocompleteProvider::kMaxMatches,	494 AutocompleteProvider::kMaxMatches,

443 scored_items.end(),	495 scored_items.end(),

444 ScoredHistoryMatch::MatchScoreGreater);	496 ScoredHistoryMatch::MatchScoreGreater);

445 scored_items.resize(AutocompleteProvider::kMaxMatches);	497 scored_items.resize(AutocompleteProvider::kMaxMatches);

446 } else {	498 } else {

447 std::sort(scored_items.begin(), scored_items.end(),	499 std::sort(scored_items.begin(), scored_items.end(),

448 ScoredHistoryMatch::MatchScoreGreater);	500 ScoredHistoryMatch::MatchScoreGreater);

449 }	501 }

	502 post_scoring_item_count = scored_items.size();

	503

	504 if (was_trimmed) {

	505 search_term_cache_.clear(); // Invalidate the term cache.

	506 } else {

	507 // Remove any stale SearchTermCacheItems.

	508 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

	509 cache_iter != search_term_cache_.end(); ) {

	510 if (!cache_iter->second.used_)

	511 search_term_cache_.erase(cache_iter++);

	512 else

	513 ++cache_iter;

450 }	514 }

451 }	515 }

452	516

453 // Remove any stale SearchTermCacheItems.

454 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

455 cache_iter != search_term_cache_.end(); ) {

456 if (!cache_iter->second.used_)

457 search_term_cache_.erase(cache_iter++);

458 else

459 ++cache_iter;

460 }

461

462 return scored_items;	517 return scored_items;

463 }	518 }

464	519

465 void InMemoryURLIndex::ResetSearchTermCache() {	520 void InMemoryURLIndex::ResetSearchTermCache() {

466 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();	521 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();

467 iter != search_term_cache_.end(); ++iter)	522 iter != search_term_cache_.end(); ++iter)

468 iter->second.used_ = false;	523 iter->second.used_ = false;

469 }	524 }

470	525

471 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(	526 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(

472 const string16& uni_string) {	527 const String16Vector& unsorted_words) {

473 // Break the terms down into individual terms (words), get the candidate	528 // Break the terms down into individual terms (words), get the candidate

474 // set for each term, and intersect each to get a final candidate list.	529 // set for each term, and intersect each to get a final candidate list.

475 // Note that a single 'term' from the user's perspective might be	530 // Note that a single 'term' from the user's perspective might be

476 // a string like "http://www.somewebsite.com" which, from our perspective,	531 // a string like "http://www.somewebsite.com" which, from our perspective,

477 // is four words: 'http', 'www', 'somewebsite', and 'com'.	532 // is four words: 'http', 'www', 'somewebsite', and 'com'.

478 HistoryIDSet history_id_set;	533 HistoryIDSet history_id_set;

479 String16Vector terms = String16VectorFromString16(uni_string, true);	534 String16Vector words(unsorted_words);

480 // Sort the terms into the longest first as such are likely to narrow down	535 // Sort the terms into the longest first as such are likely to narrow down

481 // the results quicker. Also, single character terms are the most expensive	536 // the results quicker. Also, single character terms are the most expensive

482 // to process so save them for last.	537 // to process so save them for last.

483 std::sort(terms.begin(), terms.end(), LengthGreater);	538 std::sort(words.begin(), words.end(), LengthGreater);

484 for (String16Vector::iterator iter = terms.begin(); iter != terms.end();	539 for (String16Vector::iterator iter = words.begin(); iter != words.end();

485 ++iter) {	540 ++iter) {

486 string16 uni_word = *iter;	541 string16 uni_word = *iter;

487 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);	542 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);

488 if (term_history_set.empty()) {	543 if (term_history_set.empty()) {

489 history_id_set.clear();	544 history_id_set.clear();

490 break;	545 break;

491 }	546 }

492 if (iter == terms.begin()) {	547 if (iter == words.begin()) {

493 history_id_set.swap(term_history_set);	548 history_id_set.swap(term_history_set);

494 } else {	549 } else {

495 HistoryIDSet new_history_id_set;	550 HistoryIDSet new_history_id_set;

496 std::set_intersection(history_id_set.begin(), history_id_set.end(),	551 std::set_intersection(history_id_set.begin(), history_id_set.end(),

497 term_history_set.begin(), term_history_set.end(),	552 term_history_set.begin(), term_history_set.end(),

498 std::inserter(new_history_id_set,	553 std::inserter(new_history_id_set,

499 new_history_id_set.begin()));	554 new_history_id_set.begin()));

500 history_id_set.swap(new_history_id_set);	555 history_id_set.swap(new_history_id_set);

501 }	556 }

502 }	557 }

(...skipping 311 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
814 term_length_total * kCompleteMaxValue / max_significant_length;	869 term_length_total * kCompleteMaxValue / max_significant_length;

815	870

816 int raw_score = order_value + start_value + complete_value;	871 int raw_score = order_value + start_value + complete_value;

817 const int kTermScoreLevel[] = { 1000, 650, 500, 200 };	872 const int kTermScoreLevel[] = { 1000, 650, 500, 200 };

818	873

819 // Scale the sum of the three components above into a single score component	874 // Scale the sum of the three components above into a single score component

820 // on the same scale as that used in ScoredMatchForURL().	875 // on the same scale as that used in ScoredMatchForURL().

821 return ScoreForValue(raw_score, kTermScoreLevel);	876 return ScoreForValue(raw_score, kTermScoreLevel);

822 }	877 }

823	878

	879 // InMemoryURLIndex::AddHistoryMatch -------------------------------------------

	880

824 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(	881 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(

825 const InMemoryURLIndex& index,	882 const InMemoryURLIndex& index,

826 const String16Vector& lower_terms)	883 const String16Vector& lower_terms)

827 : index_(index),	884 : index_(index),

828 lower_terms_(lower_terms) {}	885 lower_terms_(lower_terms) {}

829	886

830 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}	887 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}

831	888

832 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) {	889 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) {

833 HistoryInfoMap::const_iterator hist_pos =	890 HistoryInfoMap::const_iterator hist_pos =

(...skipping 13 matching lines...) Expand all Loading...
847 if (history_dir_.empty())	904 if (history_dir_.empty())

848 return false;	905 return false;

849 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));	906 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));

850 return true;	907 return true;

851 }	908 }

852	909

853 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {	910 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {

854 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();	911 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();

855 }	912 }

856	913

	914 // Cache Management ------------------------------------------------------------

	915

857 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {	916 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {

858 DCHECK(cache);	917 DCHECK(cache);

859 cache->set_timestamp(base::Time::Now().ToInternalValue());	918 cache->set_timestamp(base::Time::Now().ToInternalValue());

860 // history_item_count_ is no longer used but rather than change the protobuf	919 // history_item_count_ is no longer used but rather than change the protobuf

861 // definition use a placeholder. This will go away with the switch to SQLite.	920 // definition use a placeholder. This will go away with the switch to SQLite.

862 cache->set_history_item_count(0);	921 cache->set_history_item_count(0);

863 SaveWordList(cache);	922 SaveWordList(cache);

864 SaveWordMap(cache);	923 SaveWordMap(cache);

865 SaveCharWordMap(cache);	924 SaveCharWordMap(cache);

866 SaveWordIDHistoryMap(cache);	925 SaveWordIDHistoryMap(cache);

(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1069 if (iter->has_title()) {	1128 if (iter->has_title()) {

1070 string16 title(UTF8ToUTF16(iter->title()));	1129 string16 title(UTF8ToUTF16(iter->title()));

1071 url_row.set_title(title);	1130 url_row.set_title(title);

1072 }	1131 }

1073 private_data_->history_info_map_[history_id] = url_row;	1132 private_data_->history_info_map_[history_id] = url_row;

1074 }	1133 }

1075 return true;	1134 return true;

1076 }	1135 }

1077	1136

1078 } // namespace history	1137 } // namespace history

OLD	NEW