Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(443)

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 8526010: Improve Autocomplete Matches and Handling of Large Results Sets (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: '' Created 9 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/history/in_memory_url_index.h" 5 #include "chrome/browser/history/in_memory_url_index.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <functional> 8 #include <functional>
9 #include <iterator> 9 #include <iterator>
10 #include <limits> 10 #include <limits>
(...skipping 382 matching lines...) Expand 10 before | Expand all | Expand 10 after
393 // Note that this does not remove any reference to this row from the 393 // Note that this does not remove any reference to this row from the
394 // word_id_history_map_. That map will continue to contain (and return) 394 // word_id_history_map_. That map will continue to contain (and return)
395 // hits against this row until that map is rebuilt, but since the 395 // hits against this row until that map is rebuilt, but since the
396 // history_info_map_ no longer references the row no erroneous results 396 // history_info_map_ no longer references the row no erroneous results
397 // will propagate to the user. 397 // will propagate to the user.
398 private_data_->history_info_map_.erase(row_id); 398 private_data_->history_info_map_.erase(row_id);
399 // This invalidates the word cache. 399 // This invalidates the word cache.
400 search_term_cache_.clear(); 400 search_term_cache_.clear();
401 } 401 }
402 402
403 // Searching 403 // InMemoryURLIndex::AddHistoryMatch -------------------------------------------
404
405 InMemoryURLIndex::HistoryItemFactorGreater::HistoryItemFactorGreater(
406 const HistoryInfoMap& history_info_map)
407 : history_info_map_(history_info_map) {}
Peter Kasting 2011/11/21 20:31:02 Nit: 4 spaces before colon. I generally prefer {}
mrossetti 2011/11/21 21:38:25 Done.
408
409 InMemoryURLIndex::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
410
411 bool InMemoryURLIndex::HistoryItemFactorGreater::operator()(
412 const HistoryID h1,
413 const HistoryID h2) {
414 const URLRow& r1(history_info_map_.find(h1)->second);
415 const URLRow& r2(history_info_map_.find(h2)->second);
416 // First cut: typed count, visit count, recency.
417 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
418 // recently visited (within the last 12/24 hours) as highly important. Get
419 // input from mpearson.
420 if (r1.typed_count() != r2.typed_count())
421 return (r1.typed_count() > r2.typed_count());
422 if (r1.visit_count() != r2.visit_count())
423 return (r1.visit_count() > r2.visit_count());
424 return (r1.last_visit() > r2.last_visit());
425 }
426
427 // Searching -------------------------------------------------------------------
404 428
405 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( 429 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(
406 const String16Vector& terms) { 430 const string16& term_string) {
431 pre_filter_item_count = 0;
432 post_filter_item_count = 0;
433 post_scoring_item_count = 0;
434 string16 clean_string = net::UnescapeURLComponent(term_string,
435 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
436 string16 lower_string(base::i18n::ToLower(clean_string));
437 history::String16Vector words(
438 history::String16VectorFromString16(lower_string, false));
439
407 ScoredHistoryMatches scored_items; 440 ScoredHistoryMatches scored_items;
408 441
409 // Do nothing if we have indexed no words (probably because we've not been 442 // Do nothing if we have indexed no words (probably because we've not been
410 // initialized yet). 443 // initialized yet).
411 if (private_data_->word_list_.empty()) 444 if (private_data_->word_list_.empty() || words.empty()) {
445 search_term_cache_.clear(); // Invalidate the term cache.
Peter Kasting 2011/11/21 20:31:02 Nit: Why is this line necessary? Wouldn't the cac
mrossetti 2011/11/21 21:38:25 I updated the comment since it did not mention tha
412 return scored_items; 446 return scored_items;
447 }
413 448
414 if (!terms.empty()) { 449 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
415 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep 450 // approach.
416 // approach. 451 ResetSearchTermCache();
417 ResetSearchTermCache();
418 452
419 // Lowercase the terms. 453 HistoryIDSet history_id_set = HistoryIDSetFromWords(words);
420 // TODO(mrossetti): Another opportunity for a transform algorithm.
421 String16Vector lower_terms;
422 for (String16Vector::const_iterator term_iter = terms.begin();
423 term_iter != terms.end(); ++term_iter)
424 lower_terms.push_back(base::i18n::ToLower(*term_iter));
425 454
426 string16 all_terms(JoinString(lower_terms, ' ')); 455 // Trim the candidate pool if it is large. Note that we do not filter out
427 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); 456 // items that do not contain the search terms as proper substrings -- doing
457 // so is the performance-costly operation we are trying to avoid in order
458 // to maintain omnibox responsiveness.
459 const size_t kItemsToScoreLimit = 500;
460 pre_filter_item_count = history_id_set.size();
461 // If we trim the results set we do not want to cache the results for next
462 // time as the user's ultimately desired result could easily be eliminated
463 // in this early rough filter.
464 bool was_trimmed = false;
Peter Kasting 2011/11/21 20:31:02 Nit: Even shorter: bool was_trimmed = (pre_filt
mrossetti 2011/11/21 21:38:25 Done.
465 if (pre_filter_item_count > kItemsToScoreLimit) {
466 was_trimmed = true;
467 HistoryIDVector history_ids;
468 std::copy(history_id_set.begin(), history_id_set.end(),
469 std::back_inserter(history_ids));
470 // Trim down the set by sorting by typed-count, visit-count, and last
471 // visit.
472 HistoryItemFactorGreater
473 item_factor_functor(private_data_->history_info_map_);
474 std::partial_sort(history_ids.begin(),
475 history_ids.begin() + kItemsToScoreLimit,
476 history_ids.end(),
477 item_factor_functor);
478 history_ids.resize(kItemsToScoreLimit);
Peter Kasting 2011/11/21 20:31:02 Nit: You can eliminate this line and simply use "h
mrossetti 2011/11/21 21:38:25 Delicious!
479 history_id_set.clear();
480 std::copy(history_ids.begin(), history_ids.end(),
481 std::inserter(history_id_set, history_id_set.end()));
482 post_filter_item_count = history_id_set.size();
483 }
428 484
429 // Don't perform any scoring (and don't return any matches) if the 485 // Pass over all of the candidates filtering out any without a proper
430 // candidate pool is large. (See comments in header.) 486 // substring match, inserting those which pass in order by score.
431 const size_t kItemsToScoreLimit = 500; 487 history::String16Vector terms;
432 if (history_id_set.size() <= kItemsToScoreLimit) { 488 Tokenize(lower_string, kWhitespaceUTF16, &terms);
433 // Pass over all of the candidates filtering out any without a proper 489 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
434 // substring match, inserting those which pass in order by score. 490 AddHistoryMatch(*this, terms)).ScoredMatches();
435 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
436 AddHistoryMatch(*this, lower_terms)).ScoredMatches();
437 491
438 // Select and sort only the top kMaxMatches results. 492 // Select and sort only the top kMaxMatches results.
439 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { 493 if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
440 std::partial_sort(scored_items.begin(), 494 std::partial_sort(scored_items.begin(),
441 scored_items.begin() + 495 scored_items.begin() +
442 AutocompleteProvider::kMaxMatches, 496 AutocompleteProvider::kMaxMatches,
443 scored_items.end(), 497 scored_items.end(),
444 ScoredHistoryMatch::MatchScoreGreater); 498 ScoredHistoryMatch::MatchScoreGreater);
445 scored_items.resize(AutocompleteProvider::kMaxMatches); 499 scored_items.resize(AutocompleteProvider::kMaxMatches);
446 } else { 500 } else {
447 std::sort(scored_items.begin(), scored_items.end(), 501 std::sort(scored_items.begin(), scored_items.end(),
448 ScoredHistoryMatch::MatchScoreGreater); 502 ScoredHistoryMatch::MatchScoreGreater);
449 } 503 }
504 post_scoring_item_count = scored_items.size();
505
506 if (was_trimmed) {
507 search_term_cache_.clear(); // Invalidate the term cache.
508 } else {
509 // Remove any stale SearchTermCacheItems.
510 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
511 cache_iter != search_term_cache_.end(); ) {
512 if (!cache_iter->second.used_)
513 search_term_cache_.erase(cache_iter++);
514 else
515 ++cache_iter;
450 } 516 }
451 } 517 }
452 518
453 // Remove any stale SearchTermCacheItems.
454 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
455 cache_iter != search_term_cache_.end(); ) {
456 if (!cache_iter->second.used_)
457 search_term_cache_.erase(cache_iter++);
458 else
459 ++cache_iter;
460 }
461
462 return scored_items; 519 return scored_items;
463 } 520 }
464 521
465 void InMemoryURLIndex::ResetSearchTermCache() { 522 void InMemoryURLIndex::ResetSearchTermCache() {
466 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); 523 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
467 iter != search_term_cache_.end(); ++iter) 524 iter != search_term_cache_.end(); ++iter)
468 iter->second.used_ = false; 525 iter->second.used_ = false;
469 } 526 }
470 527
471 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( 528 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(
472 const string16& uni_string) { 529 const String16Vector& unsorted_words) {
473 // Break the terms down into individual terms (words), get the candidate 530 // Break the terms down into individual terms (words), get the candidate
474 // set for each term, and intersect each to get a final candidate list. 531 // set for each term, and intersect each to get a final candidate list.
475 // Note that a single 'term' from the user's perspective might be 532 // Note that a single 'term' from the user's perspective might be
476 // a string like "http://www.somewebsite.com" which, from our perspective, 533 // a string like "http://www.somewebsite.com" which, from our perspective,
477 // is four words: 'http', 'www', 'somewebsite', and 'com'. 534 // is four words: 'http', 'www', 'somewebsite', and 'com'.
478 HistoryIDSet history_id_set; 535 HistoryIDSet history_id_set;
479 String16Vector terms = String16VectorFromString16(uni_string, true); 536 String16Vector words(unsorted_words);
480 // Sort the terms into the longest first as such are likely to narrow down 537 // Sort the terms into the longest first as such are likely to narrow down
481 // the results quicker. Also, single character terms are the most expensive 538 // the results quicker. Also, single character terms are the most expensive
482 // to process so save them for last. 539 // to process so save them for last.
483 std::sort(terms.begin(), terms.end(), LengthGreater); 540 std::sort(words.begin(), words.end(), LengthGreater);
484 for (String16Vector::iterator iter = terms.begin(); iter != terms.end(); 541 for (String16Vector::iterator iter = words.begin(); iter != words.end();
485 ++iter) { 542 ++iter) {
486 string16 uni_word = *iter; 543 string16 uni_word = *iter;
487 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); 544 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
488 if (term_history_set.empty()) { 545 if (term_history_set.empty()) {
489 history_id_set.clear(); 546 history_id_set.clear();
490 break; 547 break;
491 } 548 }
492 if (iter == terms.begin()) { 549 if (iter == words.begin()) {
493 history_id_set.swap(term_history_set); 550 history_id_set.swap(term_history_set);
494 } else { 551 } else {
495 HistoryIDSet new_history_id_set; 552 HistoryIDSet new_history_id_set;
496 std::set_intersection(history_id_set.begin(), history_id_set.end(), 553 std::set_intersection(history_id_set.begin(), history_id_set.end(),
497 term_history_set.begin(), term_history_set.end(), 554 term_history_set.begin(), term_history_set.end(),
498 std::inserter(new_history_id_set, 555 std::inserter(new_history_id_set,
499 new_history_id_set.begin())); 556 new_history_id_set.begin()));
500 history_id_set.swap(new_history_id_set); 557 history_id_set.swap(new_history_id_set);
501 } 558 }
502 } 559 }
(...skipping 311 matching lines...) Expand 10 before | Expand all | Expand 10 after
814 term_length_total * kCompleteMaxValue / max_significant_length; 871 term_length_total * kCompleteMaxValue / max_significant_length;
815 872
816 int raw_score = order_value + start_value + complete_value; 873 int raw_score = order_value + start_value + complete_value;
817 const int kTermScoreLevel[] = { 1000, 650, 500, 200 }; 874 const int kTermScoreLevel[] = { 1000, 650, 500, 200 };
818 875
819 // Scale the sum of the three components above into a single score component 876 // Scale the sum of the three components above into a single score component
820 // on the same scale as that used in ScoredMatchForURL(). 877 // on the same scale as that used in ScoredMatchForURL().
821 return ScoreForValue(raw_score, kTermScoreLevel); 878 return ScoreForValue(raw_score, kTermScoreLevel);
822 } 879 }
823 880
881 // InMemoryURLIndex::AddHistoryMatch -------------------------------------------
882
824 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( 883 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
825 const InMemoryURLIndex& index, 884 const InMemoryURLIndex& index,
826 const String16Vector& lower_terms) 885 const String16Vector& lower_terms)
827 : index_(index), 886 : index_(index),
828 lower_terms_(lower_terms) {} 887 lower_terms_(lower_terms) {}
829 888
830 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} 889 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}
831 890
832 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) { 891 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) {
833 HistoryInfoMap::const_iterator hist_pos = 892 HistoryInfoMap::const_iterator hist_pos =
(...skipping 13 matching lines...) Expand all
847 if (history_dir_.empty()) 906 if (history_dir_.empty())
848 return false; 907 return false;
849 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache")); 908 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));
850 return true; 909 return true;
851 } 910 }
852 911
853 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const { 912 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {
854 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); 913 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();
855 } 914 }
856 915
916 // Cache Management ------------------------------------------------------------
917
857 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const { 918 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {
858 DCHECK(cache); 919 DCHECK(cache);
859 cache->set_timestamp(base::Time::Now().ToInternalValue()); 920 cache->set_timestamp(base::Time::Now().ToInternalValue());
860 // history_item_count_ is no longer used but rather than change the protobuf 921 // history_item_count_ is no longer used but rather than change the protobuf
861 // definition use a placeholder. This will go away with the switch to SQLite. 922 // definition use a placeholder. This will go away with the switch to SQLite.
862 cache->set_history_item_count(0); 923 cache->set_history_item_count(0);
863 SaveWordList(cache); 924 SaveWordList(cache);
864 SaveWordMap(cache); 925 SaveWordMap(cache);
865 SaveCharWordMap(cache); 926 SaveCharWordMap(cache);
866 SaveWordIDHistoryMap(cache); 927 SaveWordIDHistoryMap(cache);
(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after
1069 if (iter->has_title()) { 1130 if (iter->has_title()) {
1070 string16 title(UTF8ToUTF16(iter->title())); 1131 string16 title(UTF8ToUTF16(iter->title()));
1071 url_row.set_title(title); 1132 url_row.set_title(title);
1072 } 1133 }
1073 private_data_->history_info_map_[history_id] = url_row; 1134 private_data_->history_info_map_[history_id] = url_row;
1074 } 1135 }
1075 return true; 1136 return true;
1076 } 1137 }
1077 1138
1078 } // namespace history 1139 } // namespace history
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698