| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/omnibox/browser/url_index_private_data.h" | 5 #include "components/omnibox/browser/url_index_private_data.h" |
| 6 | 6 |
| 7 #include <stdint.h> | 7 #include <stdint.h> |
| 8 | 8 |
| 9 #include <functional> | 9 #include <functional> |
| 10 #include <iterator> | 10 #include <iterator> |
| (...skipping 504 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 515 word_id_history_map_.clear(); | 515 word_id_history_map_.clear(); |
| 516 history_id_word_map_.clear(); | 516 history_id_word_map_.clear(); |
| 517 history_info_map_.clear(); | 517 history_info_map_.clear(); |
| 518 word_starts_map_.clear(); | 518 word_starts_map_.clear(); |
| 519 } | 519 } |
| 520 | 520 |
| 521 URLIndexPrivateData::~URLIndexPrivateData() {} | 521 URLIndexPrivateData::~URLIndexPrivateData() {} |
| 522 | 522 |
| 523 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( | 523 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( |
| 524 const String16Vector& unsorted_words) { | 524 const String16Vector& unsorted_words) { |
| 525 using HistoryIDMoveIterator = std::move_iterator<HistoryIDVector::iterator>; |
| 525 // Break the terms down into individual terms (words), get the candidate | 526 // Break the terms down into individual terms (words), get the candidate |
| 526 // set for each term, and intersect each to get a final candidate list. | 527 // set for each term, and intersect each to get a final candidate list. |
| 527 // Note that a single 'term' from the user's perspective might be | 528 // Note that a single 'term' from the user's perspective might be |
| 528 // a string like "http://www.somewebsite.com" which, from our perspective, | 529 // a string like "http://www.somewebsite.com" which, from our perspective, |
| 529 // is four words: 'http', 'www', 'somewebsite', and 'com'. | 530 // is four words: 'http', 'www', 'somewebsite', and 'com'. |
| 530 HistoryIDSet history_id_set; | 531 HistoryIDVector history_id_set; |
| 531 String16Vector words(unsorted_words); | 532 String16Vector words(unsorted_words); |
| 532 // Sort the words into the longest first as such are likely to narrow down | 533 // Sort the words into the longest first as such are likely to narrow down |
| 533 // the results quicker. Also, single character words are the most expensive | 534 // the results quicker. Also, single character words are the most expensive |
| 534 // to process so save them for last. | 535 // to process so save them for last. |
| 535 std::sort(words.begin(), words.end(), LengthGreater); | 536 std::sort(words.begin(), words.end(), LengthGreater); |
| 536 for (String16Vector::iterator iter = words.begin(); iter != words.end(); | 537 for (String16Vector::iterator iter = words.begin(); iter != words.end(); |
| 537 ++iter) { | 538 ++iter) { |
| 538 base::string16 uni_word = *iter; | 539 base::string16 uni_word = *iter; |
| 539 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); | 540 HistoryIDVector term_history_set = HistoryIDsForTerm(uni_word); |
| 540 if (term_history_set.empty()) { | 541 if (term_history_set.empty()) { |
| 541 history_id_set.clear(); | 542 history_id_set.clear(); |
| 542 break; | 543 break; |
| 543 } | 544 } |
| 544 if (iter == words.begin()) { | 545 if (iter == words.begin()) { |
| 545 history_id_set.swap(term_history_set); | 546 history_id_set = std::move(term_history_set); |
| 546 } else { | 547 } else { |
| 547 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>( | 548 history_id_set.erase( |
| 548 history_id_set, term_history_set); | 549 std::set_intersection(HistoryIDMoveIterator(term_history_set.begin()), |
| 549 history_id_set.swap(new_history_id_set); | 550 HistoryIDMoveIterator(term_history_set.end()), |
| 551 history_id_set.begin(), history_id_set.end(), |
| 552 history_id_set.begin()), |
| 553 history_id_set.end()); |
| 550 } | 554 } |
| 551 } | 555 } |
| 552 return history_id_set; | 556 return HistoryIDSet(history_id_set.begin(), history_id_set.end()); |
| 553 } | 557 } |
| 554 | 558 |
| 555 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm( | 559 HistoryIDVector URLIndexPrivateData::HistoryIDsForTerm( |
| 556 const base::string16& term) { | 560 const base::string16& term) { |
| 557 if (term.empty()) | 561 if (term.empty()) |
| 558 return HistoryIDSet(); | 562 return {}; |
| 559 | 563 |
| 560 // TODO(mrossetti): Consider optimizing for very common terms such as | 564 // TODO(mrossetti): Consider optimizing for very common terms such as |
| 561 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently | 565 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently |
| 562 // occuring words in the user's searches. | 566 // occuring words in the user's searches. |
| 563 | 567 |
| 564 size_t term_length = term.length(); | 568 size_t term_length = term.length(); |
| 565 WordIDSet word_id_set; | 569 WordIDVector word_id_set; |
| 566 if (term_length > 1) { | 570 if (term_length > 1) { |
| 567 // See if this term or a prefix thereof is present in the cache. | 571 // See if this term or a prefix thereof is present in the cache. |
| 568 base::string16 term_lower = base::i18n::ToLower(term); | 572 base::string16 term_lower = base::i18n::ToLower(term); |
| 569 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); | 573 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); |
| 570 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | 574 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); |
| 571 cache_iter != search_term_cache_.end(); ++cache_iter) { | 575 cache_iter != search_term_cache_.end(); ++cache_iter) { |
| 572 if (base::StartsWith(term_lower, | 576 if (base::StartsWith(term_lower, |
| 573 base::i18n::ToLower(cache_iter->first), | 577 base::i18n::ToLower(cache_iter->first), |
| 574 base::CompareCase::SENSITIVE) && | 578 base::CompareCase::SENSITIVE) && |
| 575 (best_prefix == search_term_cache_.end() || | 579 (best_prefix == search_term_cache_.end() || |
| (...skipping 12 matching lines...) Expand all Loading... |
| 588 if (prefix_length == term_length) { | 592 if (prefix_length == term_length) { |
| 589 best_prefix->second.used_ = true; | 593 best_prefix->second.used_ = true; |
| 590 return best_prefix->second.history_id_set_; | 594 return best_prefix->second.history_id_set_; |
| 591 } | 595 } |
| 592 | 596 |
| 593 // Otherwise we have a handy starting point. | 597 // Otherwise we have a handy starting point. |
| 594 // If there are no history results for this prefix then we can bail early | 598 // If there are no history results for this prefix then we can bail early |
| 595 // as there will be no history results for the full term. | 599 // as there will be no history results for the full term. |
| 596 if (best_prefix->second.history_id_set_.empty()) { | 600 if (best_prefix->second.history_id_set_.empty()) { |
| 597 search_term_cache_[term] = SearchTermCacheItem(); | 601 search_term_cache_[term] = SearchTermCacheItem(); |
| 598 return HistoryIDSet(); | 602 return {}; |
| 599 } | 603 } |
| 600 word_id_set = best_prefix->second.word_id_set_; | 604 word_id_set = best_prefix->second.word_id_set_; |
| 601 prefix_chars = Char16SetFromString16(best_prefix->first); | 605 prefix_chars = Char16SetFromString16(best_prefix->first); |
| 602 leftovers = term.substr(prefix_length); | 606 leftovers = term.substr(prefix_length); |
| 603 } | 607 } |
| 604 | 608 |
| 605 // Filter for each remaining, unique character in the term. | 609 // Filter for each remaining, unique character in the term. |
| 606 Char16Set leftover_chars = Char16SetFromString16(leftovers); | 610 Char16Set leftover_chars = Char16SetFromString16(leftovers); |
| 607 Char16Set unique_chars = | 611 Char16Set unique_chars = |
| 608 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars); | 612 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars); |
| 609 | 613 |
| 610 // Reduce the word set with any leftover, unprocessed characters. | 614 // Reduce the word set with any leftover, unprocessed characters. |
| 611 if (!unique_chars.empty()) { | 615 if (!unique_chars.empty()) { |
| 612 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars)); | 616 WordIDVector leftover_set(WordIDSetForTermChars(unique_chars)); |
| 613 // We might come up empty on the leftovers. | 617 // We might come up empty on the leftovers. |
| 614 if (leftover_set.empty()) { | 618 if (leftover_set.empty()) { |
| 615 search_term_cache_[term] = SearchTermCacheItem(); | 619 search_term_cache_[term] = SearchTermCacheItem(); |
| 616 return HistoryIDSet(); | 620 return {}; |
| 617 } | 621 } |
| 618 // Or there may not have been a prefix from which to start. | 622 // Or there may not have been a prefix from which to start. |
| 619 if (prefix_chars.empty()) { | 623 if (prefix_chars.empty()) { |
| 620 word_id_set.swap(leftover_set); | 624 word_id_set.swap(leftover_set); |
| 621 } else { | 625 } else { |
| 622 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( | 626 word_id_set.erase( |
| 623 word_id_set, leftover_set); | 627 std::set_intersection(word_id_set.begin(), word_id_set.end(), |
| 624 word_id_set.swap(new_word_id_set); | 628 leftover_set.begin(), leftover_set.end(), |
| 629 word_id_set.begin()), |
| 630 word_id_set.end()); |
| 625 } | 631 } |
| 626 } | 632 } |
| 627 | 633 |
| 628 // We must filter the word list because the resulting word set surely | 634 // We must filter the word list because the resulting word set surely |
| 629 // contains words which do not have the search term as a proper subset. | 635 // contains words which do not have the search term as a proper subset. |
| 630 for (WordIDSet::iterator word_set_iter = word_id_set.begin(); | 636 word_id_set.erase(std::remove_if(word_id_set.begin(), word_id_set.end(), |
| 631 word_set_iter != word_id_set.end(); ) { | 637 [this, &term](const WordID& id) { |
| 632 if (word_list_[*word_set_iter].find(term) == base::string16::npos) | 638 return word_list_[id].find(term) == |
| 633 word_id_set.erase(word_set_iter++); | 639 base::string16::npos; |
| 634 else | 640 }), |
| 635 ++word_set_iter; | 641 word_id_set.end()); |
| 636 } | |
| 637 } else { | 642 } else { |
| 638 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); | 643 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); |
| 639 } | 644 } |
| 640 | 645 |
| 641 // If any words resulted then we can compose a set of history IDs by unioning | 646 // If any words resulted then we can compose a set of history IDs by unioning |
| 642 // the sets from each word. | 647 // the sets from each word. |
| 643 HistoryIDSet history_id_set; | 648 HistoryIDVector history_id_set; |
| 644 if (!word_id_set.empty()) { | 649 |
| 645 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); | 650 for (const WordID word_id : word_id_set) { |
| 646 word_id_iter != word_id_set.end(); ++word_id_iter) { | 651 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); |
| 647 WordID word_id = *word_id_iter; | 652 if (word_iter != word_id_history_map_.end()) { |
| 648 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); | 653 HistoryIDSet& word_history_id_set(word_iter->second); |
| 649 if (word_iter != word_id_history_map_.end()) { | 654 history_id_set.insert(history_id_set.end(), word_history_id_set.begin(), |
| 650 HistoryIDSet& word_history_id_set(word_iter->second); | 655 word_history_id_set.end()); |
| 651 history_id_set.insert(word_history_id_set.begin(), | |
| 652 word_history_id_set.end()); | |
| 653 } | |
| 654 } | 656 } |
| 655 } | 657 } |
| 656 | 658 |
| 659 std::sort(history_id_set.begin(), history_id_set.end()); |
| 660 history_id_set.erase( |
| 661 std::unique(history_id_set.begin(), history_id_set.end()), |
| 662 history_id_set.end()); |
| 663 |
| 657 // Record a new cache entry for this word if the term is longer than | 664 // Record a new cache entry for this word if the term is longer than |
| 658 // a single character. | 665 // a single character. |
| 659 if (term_length > 1) | 666 if (term_length > 1) |
| 660 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); | 667 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); |
| 661 | 668 |
| 662 return history_id_set; | 669 return history_id_set; |
| 663 } | 670 } |
| 664 | 671 |
| 665 WordIDSet URLIndexPrivateData::WordIDSetForTermChars( | 672 WordIDVector URLIndexPrivateData::WordIDSetForTermChars( |
| 666 const Char16Set& term_chars) { | 673 const Char16Set& term_chars) { |
| 667 WordIDSet word_id_set; | 674 WordIDVector word_id_set; |
| 668 for (Char16Set::const_iterator c_iter = term_chars.begin(); | 675 for (Char16Set::const_iterator c_iter = term_chars.begin(); |
| 669 c_iter != term_chars.end(); ++c_iter) { | 676 c_iter != term_chars.end(); ++c_iter) { |
| 670 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); | 677 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); |
| 671 if (char_iter == char_word_map_.end()) { | 678 if (char_iter == char_word_map_.end()) { |
| 672 // A character was not found so there are no matching results: bail. | 679 // A character was not found so there are no matching results: bail. |
| 673 word_id_set.clear(); | 680 word_id_set.clear(); |
| 674 break; | 681 break; |
| 675 } | 682 } |
| 676 WordIDSet& char_word_id_set(char_iter->second); | 683 WordIDSet& char_word_id_set(char_iter->second); |
| 677 // It is possible for there to no longer be any words associated with | 684 // It is possible for there to no longer be any words associated with |
| 678 // a particular character. Give up in that case. | 685 // a particular character. Give up in that case. |
| 679 if (char_word_id_set.empty()) { | 686 if (char_word_id_set.empty()) { |
| 680 word_id_set.clear(); | 687 word_id_set.clear(); |
| 681 break; | 688 break; |
| 682 } | 689 } |
| 683 | 690 |
| 684 if (c_iter == term_chars.begin()) { | 691 if (c_iter == term_chars.begin()) { |
| 685 // First character results becomes base set of results. | 692 // First character results becomes base set of results. |
| 686 word_id_set = char_word_id_set; | 693 word_id_set = {char_word_id_set.begin(), char_word_id_set.end()}; |
| 687 } else { | 694 } else { |
| 688 // Subsequent character results get intersected in. | 695 word_id_set.erase( |
| 689 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( | 696 std::set_intersection(word_id_set.begin(), word_id_set.end(), |
| 690 word_id_set, char_word_id_set); | 697 char_word_id_set.begin(), |
| 691 word_id_set.swap(new_word_id_set); | 698 char_word_id_set.end(), word_id_set.begin()), |
| 699 word_id_set.end()); |
| 692 } | 700 } |
| 693 } | 701 } |
| 694 return word_id_set; | 702 return word_id_set; |
| 695 } | 703 } |
| 696 | 704 |
| 697 bool URLIndexPrivateData::IndexRow( | 705 bool URLIndexPrivateData::IndexRow( |
| 698 history::HistoryDatabase* history_db, | 706 history::HistoryDatabase* history_db, |
| 699 history::HistoryService* history_service, | 707 history::HistoryService* history_service, |
| 700 const history::URLRow& row, | 708 const history::URLRow& row, |
| 701 const std::set<std::string>& scheme_whitelist, | 709 const std::set<std::string>& scheme_whitelist, |
| (...skipping 546 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1248 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | 1256 bool URLIndexPrivateData::URLSchemeIsWhitelisted( |
| 1249 const GURL& gurl, | 1257 const GURL& gurl, |
| 1250 const std::set<std::string>& whitelist) { | 1258 const std::set<std::string>& whitelist) { |
| 1251 return whitelist.find(gurl.scheme()) != whitelist.end(); | 1259 return whitelist.find(gurl.scheme()) != whitelist.end(); |
| 1252 } | 1260 } |
| 1253 | 1261 |
| 1254 | 1262 |
| 1255 // SearchTermCacheItem --------------------------------------------------------- | 1263 // SearchTermCacheItem --------------------------------------------------------- |
| 1256 | 1264 |
| 1257 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( | 1265 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( |
| 1258 const WordIDSet& word_id_set, | 1266 WordIDVector word_id_set, |
| 1259 const HistoryIDSet& history_id_set) | 1267 HistoryIDVector history_id_set) |
| 1260 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) { | 1268 : word_id_set_(std::move(word_id_set)), |
| 1261 } | 1269 history_id_set_(std::move(history_id_set)), |
| 1270 used_(true) {} |
| 1262 | 1271 |
| 1263 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { | 1272 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { |
| 1264 } | 1273 } |
| 1265 | 1274 |
| 1266 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( | 1275 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( |
| 1267 const SearchTermCacheItem& other) = default; | 1276 const SearchTermCacheItem& other) = default; |
| 1268 | 1277 |
| 1269 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() { | 1278 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() { |
| 1270 } | 1279 } |
| 1271 | 1280 |
| (...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1354 // First cut: typed count, visit count, recency. | 1363 // First cut: typed count, visit count, recency. |
| 1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | 1364 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
| 1356 // recently visited (within the last 12/24 hours) as highly important. Get | 1365 // recently visited (within the last 12/24 hours) as highly important. Get |
| 1357 // input from mpearson. | 1366 // input from mpearson. |
| 1358 if (r1.typed_count() != r2.typed_count()) | 1367 if (r1.typed_count() != r2.typed_count()) |
| 1359 return (r1.typed_count() > r2.typed_count()); | 1368 return (r1.typed_count() > r2.typed_count()); |
| 1360 if (r1.visit_count() != r2.visit_count()) | 1369 if (r1.visit_count() != r2.visit_count()) |
| 1361 return (r1.visit_count() > r2.visit_count()); | 1370 return (r1.visit_count() > r2.visit_count()); |
| 1362 return (r1.last_visit() > r2.last_visit()); | 1371 return (r1.last_visit() > r2.last_visit()); |
| 1363 } | 1372 } |
| OLD | NEW |