OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/omnibox/browser/url_index_private_data.h" | 5 #include "components/omnibox/browser/url_index_private_data.h" |
6 | 6 |
7 #include <stdint.h> | 7 #include <stdint.h> |
8 | 8 |
9 #include <functional> | 9 #include <functional> |
10 #include <iterator> | 10 #include <iterator> |
(...skipping 504 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
515 word_id_history_map_.clear(); | 515 word_id_history_map_.clear(); |
516 history_id_word_map_.clear(); | 516 history_id_word_map_.clear(); |
517 history_info_map_.clear(); | 517 history_info_map_.clear(); |
518 word_starts_map_.clear(); | 518 word_starts_map_.clear(); |
519 } | 519 } |
520 | 520 |
521 URLIndexPrivateData::~URLIndexPrivateData() {} | 521 URLIndexPrivateData::~URLIndexPrivateData() {} |
522 | 522 |
523 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( | 523 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( |
524 const String16Vector& unsorted_words) { | 524 const String16Vector& unsorted_words) { |
| 525 using HistoryIDMoveIterator = std::move_iterator<HistoryIDVector::iterator>; |
525 // Break the terms down into individual terms (words), get the candidate | 526 // Break the terms down into individual terms (words), get the candidate |
526 // set for each term, and intersect each to get a final candidate list. | 527 // set for each term, and intersect each to get a final candidate list. |
527 // Note that a single 'term' from the user's perspective might be | 528 // Note that a single 'term' from the user's perspective might be |
528 // a string like "http://www.somewebsite.com" which, from our perspective, | 529 // a string like "http://www.somewebsite.com" which, from our perspective, |
529 // is four words: 'http', 'www', 'somewebsite', and 'com'. | 530 // is four words: 'http', 'www', 'somewebsite', and 'com'. |
530 HistoryIDSet history_id_set; | 531 HistoryIDVector history_id_set; |
531 String16Vector words(unsorted_words); | 532 String16Vector words(unsorted_words); |
532 // Sort the words into the longest first as such are likely to narrow down | 533 // Sort the words into the longest first as such are likely to narrow down |
533 // the results quicker. Also, single character words are the most expensive | 534 // the results quicker. Also, single character words are the most expensive |
534 // to process so save them for last. | 535 // to process so save them for last. |
535 std::sort(words.begin(), words.end(), LengthGreater); | 536 std::sort(words.begin(), words.end(), LengthGreater); |
536 for (String16Vector::iterator iter = words.begin(); iter != words.end(); | 537 for (String16Vector::iterator iter = words.begin(); iter != words.end(); |
537 ++iter) { | 538 ++iter) { |
538 base::string16 uni_word = *iter; | 539 base::string16 uni_word = *iter; |
539 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); | 540 HistoryIDVector term_history_set = HistoryIDsForTerm(uni_word); |
540 if (term_history_set.empty()) { | 541 if (term_history_set.empty()) { |
541 history_id_set.clear(); | 542 history_id_set.clear(); |
542 break; | 543 break; |
543 } | 544 } |
544 if (iter == words.begin()) { | 545 if (iter == words.begin()) { |
545 history_id_set.swap(term_history_set); | 546 history_id_set = std::move(term_history_set); |
546 } else { | 547 } else { |
547 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>( | 548 history_id_set.erase( |
548 history_id_set, term_history_set); | 549 std::set_intersection(HistoryIDMoveIterator(term_history_set.begin()), |
549 history_id_set.swap(new_history_id_set); | 550 HistoryIDMoveIterator(term_history_set.end()), |
| 551 history_id_set.begin(), history_id_set.end(), |
| 552 history_id_set.begin()), |
| 553 history_id_set.end()); |
550 } | 554 } |
551 } | 555 } |
552 return history_id_set; | 556 return HistoryIDSet(history_id_set.begin(), history_id_set.end()); |
553 } | 557 } |
554 | 558 |
555 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm( | 559 HistoryIDVector URLIndexPrivateData::HistoryIDsForTerm( |
556 const base::string16& term) { | 560 const base::string16& term) { |
557 if (term.empty()) | 561 if (term.empty()) |
558 return HistoryIDSet(); | 562 return {}; |
559 | 563 |
560 // TODO(mrossetti): Consider optimizing for very common terms such as | 564 // TODO(mrossetti): Consider optimizing for very common terms such as |
561 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently | 565 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently |
562 // occuring words in the user's searches. | 566 // occuring words in the user's searches. |
563 | 567 |
564 size_t term_length = term.length(); | 568 size_t term_length = term.length(); |
565 WordIDSet word_id_set; | 569 WordIDVector word_id_set; |
566 if (term_length > 1) { | 570 if (term_length > 1) { |
567 // See if this term or a prefix thereof is present in the cache. | 571 // See if this term or a prefix thereof is present in the cache. |
568 base::string16 term_lower = base::i18n::ToLower(term); | 572 base::string16 term_lower = base::i18n::ToLower(term); |
569 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); | 573 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); |
570 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | 574 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); |
571 cache_iter != search_term_cache_.end(); ++cache_iter) { | 575 cache_iter != search_term_cache_.end(); ++cache_iter) { |
572 if (base::StartsWith(term_lower, | 576 if (base::StartsWith(term_lower, |
573 base::i18n::ToLower(cache_iter->first), | 577 base::i18n::ToLower(cache_iter->first), |
574 base::CompareCase::SENSITIVE) && | 578 base::CompareCase::SENSITIVE) && |
575 (best_prefix == search_term_cache_.end() || | 579 (best_prefix == search_term_cache_.end() || |
(...skipping 12 matching lines...) Expand all Loading... |
588 if (prefix_length == term_length) { | 592 if (prefix_length == term_length) { |
589 best_prefix->second.used_ = true; | 593 best_prefix->second.used_ = true; |
590 return best_prefix->second.history_id_set_; | 594 return best_prefix->second.history_id_set_; |
591 } | 595 } |
592 | 596 |
593 // Otherwise we have a handy starting point. | 597 // Otherwise we have a handy starting point. |
594 // If there are no history results for this prefix then we can bail early | 598 // If there are no history results for this prefix then we can bail early |
595 // as there will be no history results for the full term. | 599 // as there will be no history results for the full term. |
596 if (best_prefix->second.history_id_set_.empty()) { | 600 if (best_prefix->second.history_id_set_.empty()) { |
597 search_term_cache_[term] = SearchTermCacheItem(); | 601 search_term_cache_[term] = SearchTermCacheItem(); |
598 return HistoryIDSet(); | 602 return {}; |
599 } | 603 } |
600 word_id_set = best_prefix->second.word_id_set_; | 604 word_id_set = best_prefix->second.word_id_set_; |
601 prefix_chars = Char16SetFromString16(best_prefix->first); | 605 prefix_chars = Char16SetFromString16(best_prefix->first); |
602 leftovers = term.substr(prefix_length); | 606 leftovers = term.substr(prefix_length); |
603 } | 607 } |
604 | 608 |
605 // Filter for each remaining, unique character in the term. | 609 // Filter for each remaining, unique character in the term. |
606 Char16Set leftover_chars = Char16SetFromString16(leftovers); | 610 Char16Set leftover_chars = Char16SetFromString16(leftovers); |
607 Char16Set unique_chars = | 611 Char16Set unique_chars = |
608 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars); | 612 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars); |
609 | 613 |
610 // Reduce the word set with any leftover, unprocessed characters. | 614 // Reduce the word set with any leftover, unprocessed characters. |
611 if (!unique_chars.empty()) { | 615 if (!unique_chars.empty()) { |
612 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars)); | 616 WordIDVector leftover_set(WordIDSetForTermChars(unique_chars)); |
613 // We might come up empty on the leftovers. | 617 // We might come up empty on the leftovers. |
614 if (leftover_set.empty()) { | 618 if (leftover_set.empty()) { |
615 search_term_cache_[term] = SearchTermCacheItem(); | 619 search_term_cache_[term] = SearchTermCacheItem(); |
616 return HistoryIDSet(); | 620 return {}; |
617 } | 621 } |
618 // Or there may not have been a prefix from which to start. | 622 // Or there may not have been a prefix from which to start. |
619 if (prefix_chars.empty()) { | 623 if (prefix_chars.empty()) { |
620 word_id_set.swap(leftover_set); | 624 word_id_set.swap(leftover_set); |
621 } else { | 625 } else { |
622 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( | 626 word_id_set.erase( |
623 word_id_set, leftover_set); | 627 std::set_intersection(word_id_set.begin(), word_id_set.end(), |
624 word_id_set.swap(new_word_id_set); | 628 leftover_set.begin(), leftover_set.end(), |
| 629 word_id_set.begin()), |
| 630 word_id_set.end()); |
625 } | 631 } |
626 } | 632 } |
627 | 633 |
628 // We must filter the word list because the resulting word set surely | 634 // We must filter the word list because the resulting word set surely |
629 // contains words which do not have the search term as a proper subset. | 635 // contains words which do not have the search term as a proper subset. |
630 for (WordIDSet::iterator word_set_iter = word_id_set.begin(); | 636 word_id_set.erase(std::remove_if(word_id_set.begin(), word_id_set.end(), |
631 word_set_iter != word_id_set.end(); ) { | 637 [this, &term](const WordID& id) { |
632 if (word_list_[*word_set_iter].find(term) == base::string16::npos) | 638 return word_list_[id].find(term) == |
633 word_id_set.erase(word_set_iter++); | 639 base::string16::npos; |
634 else | 640 }), |
635 ++word_set_iter; | 641 word_id_set.end()); |
636 } | |
637 } else { | 642 } else { |
638 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); | 643 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); |
639 } | 644 } |
640 | 645 |
641 // If any words resulted then we can compose a set of history IDs by unioning | 646 // If any words resulted then we can compose a set of history IDs by unioning |
642 // the sets from each word. | 647 // the sets from each word. |
643 HistoryIDSet history_id_set; | 648 HistoryIDVector history_id_set; |
644 if (!word_id_set.empty()) { | 649 |
645 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); | 650 for (const WordID word_id : word_id_set) { |
646 word_id_iter != word_id_set.end(); ++word_id_iter) { | 651 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); |
647 WordID word_id = *word_id_iter; | 652 if (word_iter != word_id_history_map_.end()) { |
648 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); | 653 HistoryIDSet& word_history_id_set(word_iter->second); |
649 if (word_iter != word_id_history_map_.end()) { | 654 history_id_set.insert(history_id_set.end(), word_history_id_set.begin(), |
650 HistoryIDSet& word_history_id_set(word_iter->second); | 655 word_history_id_set.end()); |
651 history_id_set.insert(word_history_id_set.begin(), | |
652 word_history_id_set.end()); | |
653 } | |
654 } | 656 } |
655 } | 657 } |
656 | 658 |
| 659 std::sort(history_id_set.begin(), history_id_set.end()); |
| 660 history_id_set.erase( |
| 661 std::unique(history_id_set.begin(), history_id_set.end()), |
| 662 history_id_set.end()); |
| 663 |
657 // Record a new cache entry for this word if the term is longer than | 664 // Record a new cache entry for this word if the term is longer than |
658 // a single character. | 665 // a single character. |
659 if (term_length > 1) | 666 if (term_length > 1) |
660 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); | 667 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); |
661 | 668 |
662 return history_id_set; | 669 return history_id_set; |
663 } | 670 } |
664 | 671 |
665 WordIDSet URLIndexPrivateData::WordIDSetForTermChars( | 672 WordIDVector URLIndexPrivateData::WordIDSetForTermChars( |
666 const Char16Set& term_chars) { | 673 const Char16Set& term_chars) { |
667 WordIDSet word_id_set; | 674 WordIDVector word_id_set; |
668 for (Char16Set::const_iterator c_iter = term_chars.begin(); | 675 for (Char16Set::const_iterator c_iter = term_chars.begin(); |
669 c_iter != term_chars.end(); ++c_iter) { | 676 c_iter != term_chars.end(); ++c_iter) { |
670 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); | 677 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); |
671 if (char_iter == char_word_map_.end()) { | 678 if (char_iter == char_word_map_.end()) { |
672 // A character was not found so there are no matching results: bail. | 679 // A character was not found so there are no matching results: bail. |
673 word_id_set.clear(); | 680 word_id_set.clear(); |
674 break; | 681 break; |
675 } | 682 } |
676 WordIDSet& char_word_id_set(char_iter->second); | 683 WordIDSet& char_word_id_set(char_iter->second); |
677 // It is possible for there to no longer be any words associated with | 684 // It is possible for there to no longer be any words associated with |
678 // a particular character. Give up in that case. | 685 // a particular character. Give up in that case. |
679 if (char_word_id_set.empty()) { | 686 if (char_word_id_set.empty()) { |
680 word_id_set.clear(); | 687 word_id_set.clear(); |
681 break; | 688 break; |
682 } | 689 } |
683 | 690 |
684 if (c_iter == term_chars.begin()) { | 691 if (c_iter == term_chars.begin()) { |
685 // First character results becomes base set of results. | 692 // First character results becomes base set of results. |
686 word_id_set = char_word_id_set; | 693 word_id_set = {char_word_id_set.begin(), char_word_id_set.end()}; |
687 } else { | 694 } else { |
688 // Subsequent character results get intersected in. | 695 word_id_set.erase( |
689 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( | 696 std::set_intersection(word_id_set.begin(), word_id_set.end(), |
690 word_id_set, char_word_id_set); | 697 char_word_id_set.begin(), |
691 word_id_set.swap(new_word_id_set); | 698 char_word_id_set.end(), word_id_set.begin()), |
| 699 word_id_set.end()); |
692 } | 700 } |
693 } | 701 } |
694 return word_id_set; | 702 return word_id_set; |
695 } | 703 } |
696 | 704 |
697 bool URLIndexPrivateData::IndexRow( | 705 bool URLIndexPrivateData::IndexRow( |
698 history::HistoryDatabase* history_db, | 706 history::HistoryDatabase* history_db, |
699 history::HistoryService* history_service, | 707 history::HistoryService* history_service, |
700 const history::URLRow& row, | 708 const history::URLRow& row, |
701 const std::set<std::string>& scheme_whitelist, | 709 const std::set<std::string>& scheme_whitelist, |
(...skipping 546 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1248 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | 1256 bool URLIndexPrivateData::URLSchemeIsWhitelisted( |
1249 const GURL& gurl, | 1257 const GURL& gurl, |
1250 const std::set<std::string>& whitelist) { | 1258 const std::set<std::string>& whitelist) { |
1251 return whitelist.find(gurl.scheme()) != whitelist.end(); | 1259 return whitelist.find(gurl.scheme()) != whitelist.end(); |
1252 } | 1260 } |
1253 | 1261 |
1254 | 1262 |
1255 // SearchTermCacheItem --------------------------------------------------------- | 1263 // SearchTermCacheItem --------------------------------------------------------- |
1256 | 1264 |
1257 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( | 1265 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( |
1258 const WordIDSet& word_id_set, | 1266 WordIDVector word_id_set, |
1259 const HistoryIDSet& history_id_set) | 1267 HistoryIDVector history_id_set) |
1260 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) { | 1268 : word_id_set_(std::move(word_id_set)), |
1261 } | 1269 history_id_set_(std::move(history_id_set)), |
| 1270 used_(true) {} |
1262 | 1271 |
1263 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { | 1272 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { |
1264 } | 1273 } |
1265 | 1274 |
1266 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( | 1275 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( |
1267 const SearchTermCacheItem& other) = default; | 1276 const SearchTermCacheItem& other) = default; |
1268 | 1277 |
1269 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() { | 1278 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() { |
1270 } | 1279 } |
1271 | 1280 |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1354 // First cut: typed count, visit count, recency. | 1363 // First cut: typed count, visit count, recency. |
1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | 1364 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
1356 // recently visited (within the last 12/24 hours) as highly important. Get | 1365 // recently visited (within the last 12/24 hours) as highly important. Get |
1357 // input from mpearson. | 1366 // input from mpearson. |
1358 if (r1.typed_count() != r2.typed_count()) | 1367 if (r1.typed_count() != r2.typed_count()) |
1359 return (r1.typed_count() > r2.typed_count()); | 1368 return (r1.typed_count() > r2.typed_count()); |
1360 if (r1.visit_count() != r2.visit_count()) | 1369 if (r1.visit_count() != r2.visit_count()) |
1361 return (r1.visit_count() > r2.visit_count()); | 1370 return (r1.visit_count() > r2.visit_count()); |
1362 return (r1.last_visit() > r2.last_visit()); | 1371 return (r1.last_visit() > r2.last_visit()); |
1363 } | 1372 } |
OLD | NEW |