chrome/browser/history/in_memory_url_index.cc - Issue 8526010: Improve Autocomplete Matches and Handling of Large Results Sets

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 8526010: Improve Autocomplete Matches and Handling of Large Results Sets (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: '' Created 9 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/history/in_memory_url_index.h"	5 #include "chrome/browser/history/in_memory_url_index.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <functional>	8 #include <functional>

9 #include <iterator>	9 #include <iterator>

10 #include <limits>	10 #include <limits>

(...skipping 382 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
393 // Note that this does not remove any reference to this row from the	393 // Note that this does not remove any reference to this row from the

394 // word_id_history_map_. That map will continue to contain (and return)	394 // word_id_history_map_. That map will continue to contain (and return)

395 // hits against this row until that map is rebuilt, but since the	395 // hits against this row until that map is rebuilt, but since the

396 // history_info_map_ no longer references the row no erroneous results	396 // history_info_map_ no longer references the row no erroneous results

397 // will propagate to the user.	397 // will propagate to the user.

398 private_data_->history_info_map_.erase(row_id);	398 private_data_->history_info_map_.erase(row_id);

399 // This invalidates the word cache.	399 // This invalidates the word cache.

400 search_term_cache_.clear();	400 search_term_cache_.clear();

401 }	401 }

402	402

403 // Searching	403 // InMemoryURLIndex::AddHistoryMatch -------------------------------------------

	404

	405 InMemoryURLIndex::HistoryItemFactorGreater::HistoryItemFactorGreater(

	406 const HistoryInfoMap& history_info_map)

	407 : history_info_map_(history_info_map) {

	408 }

	409

	410 InMemoryURLIndex::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}

	411

	412 bool InMemoryURLIndex::HistoryItemFactorGreater::operator()(

	413 const HistoryID h1,

	414 const HistoryID h2) {

	415 const URLRow& r1(history_info_map_.find(h1)->second);

	416 const URLRow& r2(history_info_map_.find(h2)->second);

	417 // First cut: typed count, visit count, recency.

	418 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

	419 // recently visited (within the last 12/24 hours) as highly important. Get

	420 // input from mpearson.

	421 if (r1.typed_count() != r2.typed_count())

	422 return (r1.typed_count() > r2.typed_count());

	423 if (r1.visit_count() != r2.visit_count())

	424 return (r1.visit_count() > r2.visit_count());

	425 return (r1.last_visit() > r2.last_visit());

	426 }

	427

	428 // Searching -------------------------------------------------------------------

404	429

405 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(	430 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(

406 const String16Vector& terms) {	431 const string16& term_string) {

	432 pre_filter_item_count = 0;

	433 post_filter_item_count = 0;

	434 post_scoring_item_count = 0;

	435 string16 clean_string = net::UnescapeURLComponent(term_string,

	436 net::UnescapeRule::SPACES \| net::UnescapeRule::URL_SPECIAL_CHARS);

	437 string16 lower_string(base::i18n::ToLower(clean_string));

	438 history::String16Vector words(

	439 history::String16VectorFromString16(lower_string, false));

	440

407 ScoredHistoryMatches scored_items;	441 ScoredHistoryMatches scored_items;

408	442

409 // Do nothing if we have indexed no words (probably because we've not been	443 // Do nothing if we have indexed no words (probably because we've not been

410 // initialized yet).	444 // initialized yet) or the search string has no words.

411 if (private_data_->word_list_.empty())	445 if (private_data_->word_list_.empty() \|\| words.empty()) {

	446 search_term_cache_.clear(); // Invalidate the term cache.

412 return scored_items;	447 return scored_items;

	448 }

413	449

414 if (!terms.empty()) {	450 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

415 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep	451 // approach.

416 // approach.	452 ResetSearchTermCache();

417 ResetSearchTermCache();

418	453

419 // Lowercase the terms.	454 HistoryIDSet history_id_set = HistoryIDSetFromWords(words);
	GeorgeY 2011/11/21 22:52:03 I wonder how fast it would work on lowest common d I wonder how fast it would work on lowest common denominator - the Atom netbook (such as Chromebook). I haven't cleared my history in two years on my desktop - I am sure people do not do it on netbooks also :) mrossetti 2011/11/22 23:29:12 This should be very fast as the dataset underlying Show quoted text On 2011/11/21 22:52:03, GeorgeY wrote: > I wonder how fast it would work on lowest common denominator - the Atom netbook > (such as Chromebook). I haven't cleared my history in two years on my desktop - > I am sure people do not do it on netbooks also :) This should be very fast as the dataset underlying is pre-constructed from the history database once and is kept up-to-date after that. In the early days, I measured this and the set of words and characters which has to be permuted is only in the thousands, even with huge history databases.
420 // TODO(mrossetti): Another opportunity for a transform algorithm.

421 String16Vector lower_terms;

422 for (String16Vector::const_iterator term_iter = terms.begin();

423 term_iter != terms.end(); ++term_iter)

424 lower_terms.push_back(base::i18n::ToLower(*term_iter));

425	455

426 string16 all_terms(JoinString(lower_terms, ' '));	456 // Trim the candidate pool if it is large. Note that we do not filter out

427 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);	457 // items that do not contain the search terms as proper substrings -- doing

	458 // so is the performance-costly operation we are trying to avoid in order

	459 // to maintain omnibox responsiveness.

	460 const size_t kItemsToScoreLimit = 500;

	461 pre_filter_item_count = history_id_set.size();

	462 // If we trim the results set we do not want to cache the results for next

	463 // time as the user's ultimately desired result could easily be eliminated

	464 // in this early rough filter.

	465 bool was_trimmed = (pre_filter_item_count > kItemsToScoreLimit);

	466 if (was_trimmed) {

	467 HistoryIDVector history_ids;

	468 std::copy(history_id_set.begin(), history_id_set.end(),

	469 std::back_inserter(history_ids));

	470 // Trim down the set by sorting by typed-count, visit-count, and last

	471 // visit.

	472 HistoryItemFactorGreater

	473 item_factor_functor(private_data_->history_info_map_);

	474 std::partial_sort(history_ids.begin(),

	475 history_ids.begin() + kItemsToScoreLimit,

	476 history_ids.end(),

	477 item_factor_functor);

	478 history_id_set.clear();

	479 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

	480 std::inserter(history_id_set, history_id_set.end()));

	481 post_filter_item_count = history_id_set.size();

	482 }

428	483

429 // Don't perform any scoring (and don't return any matches) if the	484 // Pass over all of the candidates filtering out any without a proper

430 // candidate pool is large. (See comments in header.)	485 // substring match, inserting those which pass in order by score.

431 const size_t kItemsToScoreLimit = 500;	486 history::String16Vector terms;

432 if (history_id_set.size() <= kItemsToScoreLimit) {	487 Tokenize(lower_string, kWhitespaceUTF16, &terms);

433 // Pass over all of the candidates filtering out any without a proper	488 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),

434 // substring match, inserting those which pass in order by score.	489 AddHistoryMatch(*this, terms)).ScoredMatches();

435 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),

436 AddHistoryMatch(*this, lower_terms)).ScoredMatches();

437	490

438 // Select and sort only the top kMaxMatches results.	491 // Select and sort only the top kMaxMatches results.

439 if (scored_items.size() > AutocompleteProvider::kMaxMatches) {	492 if (scored_items.size() > AutocompleteProvider::kMaxMatches) {

440 std::partial_sort(scored_items.begin(),	493 std::partial_sort(scored_items.begin(),

441 scored_items.begin() +	494 scored_items.begin() +

442 AutocompleteProvider::kMaxMatches,	495 AutocompleteProvider::kMaxMatches,

443 scored_items.end(),	496 scored_items.end(),

444 ScoredHistoryMatch::MatchScoreGreater);	497 ScoredHistoryMatch::MatchScoreGreater);

445 scored_items.resize(AutocompleteProvider::kMaxMatches);	498 scored_items.resize(AutocompleteProvider::kMaxMatches);

446 } else {	499 } else {

447 std::sort(scored_items.begin(), scored_items.end(),	500 std::sort(scored_items.begin(), scored_items.end(),

448 ScoredHistoryMatch::MatchScoreGreater);	501 ScoredHistoryMatch::MatchScoreGreater);

449 }	502 }

	503 post_scoring_item_count = scored_items.size();

	504

	505 if (was_trimmed) {

	506 search_term_cache_.clear(); // Invalidate the term cache.

	507 } else {

	508 // Remove any stale SearchTermCacheItems.

	509 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

	510 cache_iter != search_term_cache_.end(); ) {

	511 if (!cache_iter->second.used_)

	512 search_term_cache_.erase(cache_iter++);

	513 else

	514 ++cache_iter;

450 }	515 }

451 }	516 }

452	517

453 // Remove any stale SearchTermCacheItems.

454 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

455 cache_iter != search_term_cache_.end(); ) {

456 if (!cache_iter->second.used_)

457 search_term_cache_.erase(cache_iter++);

458 else

459 ++cache_iter;

460 }

461

462 return scored_items;	518 return scored_items;

463 }	519 }

464	520

465 void InMemoryURLIndex::ResetSearchTermCache() {	521 void InMemoryURLIndex::ResetSearchTermCache() {

466 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();	522 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();

467 iter != search_term_cache_.end(); ++iter)	523 iter != search_term_cache_.end(); ++iter)

468 iter->second.used_ = false;	524 iter->second.used_ = false;

469 }	525 }

470	526

471 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(	527 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(

472 const string16& uni_string) {	528 const String16Vector& unsorted_words) {

473 // Break the terms down into individual terms (words), get the candidate	529 // Break the terms down into individual terms (words), get the candidate

474 // set for each term, and intersect each to get a final candidate list.	530 // set for each term, and intersect each to get a final candidate list.

475 // Note that a single 'term' from the user's perspective might be	531 // Note that a single 'term' from the user's perspective might be

476 // a string like "http://www.somewebsite.com" which, from our perspective,	532 // a string like "http://www.somewebsite.com" which, from our perspective,

477 // is four words: 'http', 'www', 'somewebsite', and 'com'.	533 // is four words: 'http', 'www', 'somewebsite', and 'com'.

478 HistoryIDSet history_id_set;	534 HistoryIDSet history_id_set;

479 String16Vector terms = String16VectorFromString16(uni_string, true);	535 String16Vector words(unsorted_words);

480 // Sort the terms into the longest first as such are likely to narrow down	536 // Sort the terms into the longest first as such are likely to narrow down

481 // the results quicker. Also, single character terms are the most expensive	537 // the results quicker. Also, single character terms are the most expensive

482 // to process so save them for last.	538 // to process so save them for last.

483 std::sort(terms.begin(), terms.end(), LengthGreater);	539 std::sort(words.begin(), words.end(), LengthGreater);

484 for (String16Vector::iterator iter = terms.begin(); iter != terms.end();	540 for (String16Vector::iterator iter = words.begin(); iter != words.end();

485 ++iter) {	541 ++iter) {

486 string16 uni_word = *iter;	542 string16 uni_word = *iter;

487 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);	543 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);

488 if (term_history_set.empty()) {	544 if (term_history_set.empty()) {

489 history_id_set.clear();	545 history_id_set.clear();

490 break;	546 break;

491 }	547 }

492 if (iter == terms.begin()) {	548 if (iter == words.begin()) {

493 history_id_set.swap(term_history_set);	549 history_id_set.swap(term_history_set);

494 } else {	550 } else {

495 HistoryIDSet new_history_id_set;	551 HistoryIDSet new_history_id_set;

496 std::set_intersection(history_id_set.begin(), history_id_set.end(),	552 std::set_intersection(history_id_set.begin(), history_id_set.end(),

497 term_history_set.begin(), term_history_set.end(),	553 term_history_set.begin(), term_history_set.end(),

498 std::inserter(new_history_id_set,	554 std::inserter(new_history_id_set,

499 new_history_id_set.begin()));	555 new_history_id_set.begin()));

500 history_id_set.swap(new_history_id_set);	556 history_id_set.swap(new_history_id_set);

501 }	557 }

502 }	558 }

(...skipping 311 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
814 term_length_total * kCompleteMaxValue / max_significant_length;	870 term_length_total * kCompleteMaxValue / max_significant_length;

815	871

816 int raw_score = order_value + start_value + complete_value;	872 int raw_score = order_value + start_value + complete_value;

817 const int kTermScoreLevel[] = { 1000, 650, 500, 200 };	873 const int kTermScoreLevel[] = { 1000, 650, 500, 200 };

818	874

819 // Scale the sum of the three components above into a single score component	875 // Scale the sum of the three components above into a single score component

820 // on the same scale as that used in ScoredMatchForURL().	876 // on the same scale as that used in ScoredMatchForURL().

821 return ScoreForValue(raw_score, kTermScoreLevel);	877 return ScoreForValue(raw_score, kTermScoreLevel);

822 }	878 }

823	879

	880 // InMemoryURLIndex::AddHistoryMatch -------------------------------------------

	881

824 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(	882 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(

825 const InMemoryURLIndex& index,	883 const InMemoryURLIndex& index,

826 const String16Vector& lower_terms)	884 const String16Vector& lower_terms)

827 : index_(index),	885 : index_(index),

828 lower_terms_(lower_terms) {}	886 lower_terms_(lower_terms) {}

829	887

830 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}	888 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}

831	889

832 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) {	890 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) {

833 HistoryInfoMap::const_iterator hist_pos =	891 HistoryInfoMap::const_iterator hist_pos =

(...skipping 13 matching lines...) Expand all Loading...
847 if (history_dir_.empty())	905 if (history_dir_.empty())

848 return false;	906 return false;

849 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));	907 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));

850 return true;	908 return true;

851 }	909 }

852	910

853 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {	911 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {

854 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();	912 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();

855 }	913 }

856	914

	915 // Cache Management ------------------------------------------------------------

	916

857 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {	917 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {

858 DCHECK(cache);	918 DCHECK(cache);

859 cache->set_timestamp(base::Time::Now().ToInternalValue());	919 cache->set_timestamp(base::Time::Now().ToInternalValue());

860 // history_item_count_ is no longer used but rather than change the protobuf	920 // history_item_count_ is no longer used but rather than change the protobuf

861 // definition use a placeholder. This will go away with the switch to SQLite.	921 // definition use a placeholder. This will go away with the switch to SQLite.

862 cache->set_history_item_count(0);	922 cache->set_history_item_count(0);

863 SaveWordList(cache);	923 SaveWordList(cache);

864 SaveWordMap(cache);	924 SaveWordMap(cache);

865 SaveCharWordMap(cache);	925 SaveCharWordMap(cache);

866 SaveWordIDHistoryMap(cache);	926 SaveWordIDHistoryMap(cache);

(...skipping 202 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1069 if (iter->has_title()) {	1129 if (iter->has_title()) {

1070 string16 title(UTF8ToUTF16(iter->title()));	1130 string16 title(UTF8ToUTF16(iter->title()));

1071 url_row.set_title(title);	1131 url_row.set_title(title);

1072 }	1132 }

1073 private_data_->history_info_map_[history_id] = url_row;	1133 private_data_->history_info_map_[history_id] = url_row;

1074 }	1134 }

1075 return true;	1135 return true;

1076 }	1136 }

1077	1137

1078 } // namespace history	1138 } // namespace history

OLD	NEW

« no previous file with comments | « chrome/browser/history/in_memory_url_index.h ('k') | chrome/browser/history/in_memory_url_index_types.h » ('j') | no next file with comments »