Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(58)

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2337953002: Optimizing HQP using vectors manually. This is just a CL to be able (Closed)
Patch Set: Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « components/omnibox/browser/url_index_private_data.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/omnibox/browser/url_index_private_data.h" 5 #include "components/omnibox/browser/url_index_private_data.h"
6 6
7 #include <stdint.h> 7 #include <stdint.h>
8 8
9 #include <functional> 9 #include <functional>
10 #include <iterator> 10 #include <iterator>
(...skipping 504 matching lines...) Expand 10 before | Expand all | Expand 10 after
515 word_id_history_map_.clear(); 515 word_id_history_map_.clear();
516 history_id_word_map_.clear(); 516 history_id_word_map_.clear();
517 history_info_map_.clear(); 517 history_info_map_.clear();
518 word_starts_map_.clear(); 518 word_starts_map_.clear();
519 } 519 }
520 520
521 URLIndexPrivateData::~URLIndexPrivateData() {} 521 URLIndexPrivateData::~URLIndexPrivateData() {}
522 522
523 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( 523 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
524 const String16Vector& unsorted_words) { 524 const String16Vector& unsorted_words) {
525 using HistoryIDMoveIterator = std::move_iterator<HistoryIDVector::iterator>;
525 // Break the terms down into individual terms (words), get the candidate 526 // Break the terms down into individual terms (words), get the candidate
526 // set for each term, and intersect each to get a final candidate list. 527 // set for each term, and intersect each to get a final candidate list.
527 // Note that a single 'term' from the user's perspective might be 528 // Note that a single 'term' from the user's perspective might be
528 // a string like "http://www.somewebsite.com" which, from our perspective, 529 // a string like "http://www.somewebsite.com" which, from our perspective,
529 // is four words: 'http', 'www', 'somewebsite', and 'com'. 530 // is four words: 'http', 'www', 'somewebsite', and 'com'.
530 HistoryIDSet history_id_set; 531 HistoryIDVector history_id_set;
531 String16Vector words(unsorted_words); 532 String16Vector words(unsorted_words);
532 // Sort the words into the longest first as such are likely to narrow down 533 // Sort the words into the longest first as such are likely to narrow down
533 // the results quicker. Also, single character words are the most expensive 534 // the results quicker. Also, single character words are the most expensive
534 // to process so save them for last. 535 // to process so save them for last.
535 std::sort(words.begin(), words.end(), LengthGreater); 536 std::sort(words.begin(), words.end(), LengthGreater);
536 for (String16Vector::iterator iter = words.begin(); iter != words.end(); 537 for (String16Vector::iterator iter = words.begin(); iter != words.end();
537 ++iter) { 538 ++iter) {
538 base::string16 uni_word = *iter; 539 base::string16 uni_word = *iter;
539 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); 540 HistoryIDVector term_history_set = HistoryIDsForTerm(uni_word);
540 if (term_history_set.empty()) { 541 if (term_history_set.empty()) {
541 history_id_set.clear(); 542 history_id_set.clear();
542 break; 543 break;
543 } 544 }
544 if (iter == words.begin()) { 545 if (iter == words.begin()) {
545 history_id_set.swap(term_history_set); 546 history_id_set = std::move(term_history_set);
546 } else { 547 } else {
547 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>( 548 history_id_set.erase(
548 history_id_set, term_history_set); 549 std::set_intersection(HistoryIDMoveIterator(term_history_set.begin()),
549 history_id_set.swap(new_history_id_set); 550 HistoryIDMoveIterator(term_history_set.end()),
551 history_id_set.begin(), history_id_set.end(),
552 history_id_set.begin()),
553 history_id_set.end());
550 } 554 }
551 } 555 }
552 return history_id_set; 556 return HistoryIDSet(history_id_set.begin(), history_id_set.end());
553 } 557 }
554 558
555 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm( 559 HistoryIDVector URLIndexPrivateData::HistoryIDsForTerm(
556 const base::string16& term) { 560 const base::string16& term) {
557 if (term.empty()) 561 if (term.empty())
558 return HistoryIDSet(); 562 return {};
559 563
560 // TODO(mrossetti): Consider optimizing for very common terms such as 564 // TODO(mrossetti): Consider optimizing for very common terms such as
561 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently 565 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
562 // occuring words in the user's searches. 566 // occuring words in the user's searches.
563 567
564 size_t term_length = term.length(); 568 size_t term_length = term.length();
565 WordIDSet word_id_set; 569 WordIDVector word_id_set;
566 if (term_length > 1) { 570 if (term_length > 1) {
567 // See if this term or a prefix thereof is present in the cache. 571 // See if this term or a prefix thereof is present in the cache.
568 base::string16 term_lower = base::i18n::ToLower(term); 572 base::string16 term_lower = base::i18n::ToLower(term);
569 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); 573 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
570 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); 574 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
571 cache_iter != search_term_cache_.end(); ++cache_iter) { 575 cache_iter != search_term_cache_.end(); ++cache_iter) {
572 if (base::StartsWith(term_lower, 576 if (base::StartsWith(term_lower,
573 base::i18n::ToLower(cache_iter->first), 577 base::i18n::ToLower(cache_iter->first),
574 base::CompareCase::SENSITIVE) && 578 base::CompareCase::SENSITIVE) &&
575 (best_prefix == search_term_cache_.end() || 579 (best_prefix == search_term_cache_.end() ||
(...skipping 12 matching lines...) Expand all
588 if (prefix_length == term_length) { 592 if (prefix_length == term_length) {
589 best_prefix->second.used_ = true; 593 best_prefix->second.used_ = true;
590 return best_prefix->second.history_id_set_; 594 return best_prefix->second.history_id_set_;
591 } 595 }
592 596
593 // Otherwise we have a handy starting point. 597 // Otherwise we have a handy starting point.
594 // If there are no history results for this prefix then we can bail early 598 // If there are no history results for this prefix then we can bail early
595 // as there will be no history results for the full term. 599 // as there will be no history results for the full term.
596 if (best_prefix->second.history_id_set_.empty()) { 600 if (best_prefix->second.history_id_set_.empty()) {
597 search_term_cache_[term] = SearchTermCacheItem(); 601 search_term_cache_[term] = SearchTermCacheItem();
598 return HistoryIDSet(); 602 return {};
599 } 603 }
600 word_id_set = best_prefix->second.word_id_set_; 604 word_id_set = best_prefix->second.word_id_set_;
601 prefix_chars = Char16SetFromString16(best_prefix->first); 605 prefix_chars = Char16SetFromString16(best_prefix->first);
602 leftovers = term.substr(prefix_length); 606 leftovers = term.substr(prefix_length);
603 } 607 }
604 608
605 // Filter for each remaining, unique character in the term. 609 // Filter for each remaining, unique character in the term.
606 Char16Set leftover_chars = Char16SetFromString16(leftovers); 610 Char16Set leftover_chars = Char16SetFromString16(leftovers);
607 Char16Set unique_chars = 611 Char16Set unique_chars =
608 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars); 612 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
609 613
610 // Reduce the word set with any leftover, unprocessed characters. 614 // Reduce the word set with any leftover, unprocessed characters.
611 if (!unique_chars.empty()) { 615 if (!unique_chars.empty()) {
612 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars)); 616 WordIDVector leftover_set(WordIDSetForTermChars(unique_chars));
613 // We might come up empty on the leftovers. 617 // We might come up empty on the leftovers.
614 if (leftover_set.empty()) { 618 if (leftover_set.empty()) {
615 search_term_cache_[term] = SearchTermCacheItem(); 619 search_term_cache_[term] = SearchTermCacheItem();
616 return HistoryIDSet(); 620 return {};
617 } 621 }
618 // Or there may not have been a prefix from which to start. 622 // Or there may not have been a prefix from which to start.
619 if (prefix_chars.empty()) { 623 if (prefix_chars.empty()) {
620 word_id_set.swap(leftover_set); 624 word_id_set.swap(leftover_set);
621 } else { 625 } else {
622 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( 626 word_id_set.erase(
623 word_id_set, leftover_set); 627 std::set_intersection(word_id_set.begin(), word_id_set.end(),
624 word_id_set.swap(new_word_id_set); 628 leftover_set.begin(), leftover_set.end(),
629 word_id_set.begin()),
630 word_id_set.end());
625 } 631 }
626 } 632 }
627 633
628 // We must filter the word list because the resulting word set surely 634 // We must filter the word list because the resulting word set surely
629 // contains words which do not have the search term as a proper subset. 635 // contains words which do not have the search term as a proper subset.
630 for (WordIDSet::iterator word_set_iter = word_id_set.begin(); 636 word_id_set.erase(std::remove_if(word_id_set.begin(), word_id_set.end(),
631 word_set_iter != word_id_set.end(); ) { 637 [this, &term](const WordID& id) {
632 if (word_list_[*word_set_iter].find(term) == base::string16::npos) 638 return word_list_[id].find(term) ==
633 word_id_set.erase(word_set_iter++); 639 base::string16::npos;
634 else 640 }),
635 ++word_set_iter; 641 word_id_set.end());
636 }
637 } else { 642 } else {
638 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); 643 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
639 } 644 }
640 645
641 // If any words resulted then we can compose a set of history IDs by unioning 646 // If any words resulted then we can compose a set of history IDs by unioning
642 // the sets from each word. 647 // the sets from each word.
643 HistoryIDSet history_id_set; 648 HistoryIDVector history_id_set;
644 if (!word_id_set.empty()) { 649
645 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 650 for (const WordID word_id : word_id_set) {
646 word_id_iter != word_id_set.end(); ++word_id_iter) { 651 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
647 WordID word_id = *word_id_iter; 652 if (word_iter != word_id_history_map_.end()) {
648 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); 653 HistoryIDSet& word_history_id_set(word_iter->second);
649 if (word_iter != word_id_history_map_.end()) { 654 history_id_set.insert(history_id_set.end(), word_history_id_set.begin(),
650 HistoryIDSet& word_history_id_set(word_iter->second); 655 word_history_id_set.end());
651 history_id_set.insert(word_history_id_set.begin(),
652 word_history_id_set.end());
653 }
654 } 656 }
655 } 657 }
656 658
659 std::sort(history_id_set.begin(), history_id_set.end());
660 history_id_set.erase(
661 std::unique(history_id_set.begin(), history_id_set.end()),
662 history_id_set.end());
663
657 // Record a new cache entry for this word if the term is longer than 664 // Record a new cache entry for this word if the term is longer than
658 // a single character. 665 // a single character.
659 if (term_length > 1) 666 if (term_length > 1)
660 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); 667 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
661 668
662 return history_id_set; 669 return history_id_set;
663 } 670 }
664 671
665 WordIDSet URLIndexPrivateData::WordIDSetForTermChars( 672 WordIDVector URLIndexPrivateData::WordIDSetForTermChars(
666 const Char16Set& term_chars) { 673 const Char16Set& term_chars) {
667 WordIDSet word_id_set; 674 WordIDVector word_id_set;
668 for (Char16Set::const_iterator c_iter = term_chars.begin(); 675 for (Char16Set::const_iterator c_iter = term_chars.begin();
669 c_iter != term_chars.end(); ++c_iter) { 676 c_iter != term_chars.end(); ++c_iter) {
670 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); 677 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
671 if (char_iter == char_word_map_.end()) { 678 if (char_iter == char_word_map_.end()) {
672 // A character was not found so there are no matching results: bail. 679 // A character was not found so there are no matching results: bail.
673 word_id_set.clear(); 680 word_id_set.clear();
674 break; 681 break;
675 } 682 }
676 WordIDSet& char_word_id_set(char_iter->second); 683 WordIDSet& char_word_id_set(char_iter->second);
677 // It is possible for there to no longer be any words associated with 684 // It is possible for there to no longer be any words associated with
678 // a particular character. Give up in that case. 685 // a particular character. Give up in that case.
679 if (char_word_id_set.empty()) { 686 if (char_word_id_set.empty()) {
680 word_id_set.clear(); 687 word_id_set.clear();
681 break; 688 break;
682 } 689 }
683 690
684 if (c_iter == term_chars.begin()) { 691 if (c_iter == term_chars.begin()) {
685 // First character results becomes base set of results. 692 // First character results becomes base set of results.
686 word_id_set = char_word_id_set; 693 word_id_set = {char_word_id_set.begin(), char_word_id_set.end()};
687 } else { 694 } else {
688 // Subsequent character results get intersected in. 695 word_id_set.erase(
689 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( 696 std::set_intersection(word_id_set.begin(), word_id_set.end(),
690 word_id_set, char_word_id_set); 697 char_word_id_set.begin(),
691 word_id_set.swap(new_word_id_set); 698 char_word_id_set.end(), word_id_set.begin()),
699 word_id_set.end());
692 } 700 }
693 } 701 }
694 return word_id_set; 702 return word_id_set;
695 } 703 }
696 704
697 bool URLIndexPrivateData::IndexRow( 705 bool URLIndexPrivateData::IndexRow(
698 history::HistoryDatabase* history_db, 706 history::HistoryDatabase* history_db,
699 history::HistoryService* history_service, 707 history::HistoryService* history_service,
700 const history::URLRow& row, 708 const history::URLRow& row,
701 const std::set<std::string>& scheme_whitelist, 709 const std::set<std::string>& scheme_whitelist,
(...skipping 546 matching lines...) Expand 10 before | Expand all | Expand 10 after
1248 bool URLIndexPrivateData::URLSchemeIsWhitelisted( 1256 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1249 const GURL& gurl, 1257 const GURL& gurl,
1250 const std::set<std::string>& whitelist) { 1258 const std::set<std::string>& whitelist) {
1251 return whitelist.find(gurl.scheme()) != whitelist.end(); 1259 return whitelist.find(gurl.scheme()) != whitelist.end();
1252 } 1260 }
1253 1261
1254 1262
1255 // SearchTermCacheItem --------------------------------------------------------- 1263 // SearchTermCacheItem ---------------------------------------------------------
1256 1264
1257 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( 1265 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1258 const WordIDSet& word_id_set, 1266 WordIDVector word_id_set,
1259 const HistoryIDSet& history_id_set) 1267 HistoryIDVector history_id_set)
1260 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) { 1268 : word_id_set_(std::move(word_id_set)),
1261 } 1269 history_id_set_(std::move(history_id_set)),
1270 used_(true) {}
1262 1271
1263 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { 1272 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {
1264 } 1273 }
1265 1274
1266 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( 1275 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1267 const SearchTermCacheItem& other) = default; 1276 const SearchTermCacheItem& other) = default;
1268 1277
1269 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() { 1278 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {
1270 } 1279 }
1271 1280
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after
1354 // First cut: typed count, visit count, recency. 1363 // First cut: typed count, visit count, recency.
1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks 1364 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1356 // recently visited (within the last 12/24 hours) as highly important. Get 1365 // recently visited (within the last 12/24 hours) as highly important. Get
1357 // input from mpearson. 1366 // input from mpearson.
1358 if (r1.typed_count() != r2.typed_count()) 1367 if (r1.typed_count() != r2.typed_count())
1359 return (r1.typed_count() > r2.typed_count()); 1368 return (r1.typed_count() > r2.typed_count());
1360 if (r1.visit_count() != r2.visit_count()) 1369 if (r1.visit_count() != r2.visit_count())
1361 return (r1.visit_count() > r2.visit_count()); 1370 return (r1.visit_count() > r2.visit_count());
1362 return (r1.last_visit() > r2.last_visit()); 1371 return (r1.last_visit() > r2.last_visit());
1363 } 1372 }
OLDNEW
« no previous file with comments | « components/omnibox/browser/url_index_private_data.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698