components/omnibox/browser/url_index_private_data.cc - Issue 2363463002: Generating autocomplete results with and without word breaks in the Omnibox.

Unified Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2363463002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed)

Patch Set: Resubmitting patch for closed code review with iss number 2363463002. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: components/omnibox/browser/url_index_private_data.cc

diff --git a/components/omnibox/browser/url_index_private_data.cc b/components/omnibox/browser/url_index_private_data.cc

index 5e7f99c6cee99578326402f23d3e38fa1bb4d82d..38d82ea1d3a842cc4e4379c49263200e47b945f7 100644

--- a/components/omnibox/browser/url_index_private_data.cc

+++ b/components/omnibox/browser/url_index_private_data.cc

@@ -140,7 +140,6 @@ void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {

UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {

}

// URLIndexPrivateData ---------------------------------------------------------

URLIndexPrivateData::URLIndexPrivateData()

@@ -152,7 +151,7 @@ URLIndexPrivateData::URLIndexPrivateData()

}

ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

- base::string16 search_string,

+ base::string16 original_search_string,

size_t cursor_position,

size_t max_matches,

bookmarks::BookmarkModel* bookmark_model,

@@ -160,164 +159,107 @@ ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

pre_filter_item_count_ = 0;

post_filter_item_count_ = 0;

post_scoring_item_count_ = 0;

- // The search string we receive may contain escaped characters. For reducing

- // the index we need individual, lower-cased words, ignoring escapings. For

- // the final filtering we need whitespace separated substrings possibly

- // containing escaped characters.

- base::string16 lower_raw_string(base::i18n::ToLower(search_string));

- base::string16 lower_unescaped_string =

- net::UnescapeURLComponent(lower_raw_string,

- net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |

- net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

- // Extract individual 'words' (as opposed to 'terms'; see below) from the

- // search string. When the user types "colspec=ID%20Mstone Release" we get

- // four 'words': "colspec", "id", "mstone" and "release".

- String16Vector lower_words(

- String16VectorFromString16(lower_unescaped_string, false, nullptr));

- ScoredHistoryMatches scored_items;

- // Do nothing if we have indexed no words (probably because we've not been

- // initialized yet) or the search string has no words.

- if (word_list_.empty() || lower_words.empty()) {

- search_term_cache_.clear(); // Invalidate the term cache.

- return scored_items;

+ // This list will contain the original search string and any other string

+ // transformations.

+ String16Vector search_strings;

+ search_strings.push_back(original_search_string);

+ if ((cursor_position != base::string16::npos) &&

+ (cursor_position < original_search_string.length()) &&

+ (cursor_position > 0)) {

+ // The original search_string broken at cursor position. This is one type of

+ // transformation.

+ base::string16 transformed_search_string(original_search_string);

+ transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

+ search_strings.push_back(transformed_search_string);

}

+ ScoredHistoryMatches scored_items;

+ // Invalidate the term cache and return if we have indexed no words (probably

+ // because we've not been initialized yet).

+ if (word_list_.empty()) {

+ search_term_cache_.clear();

+ return scored_items;

+ }

// Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

// approach.

ResetSearchTermCache();

- HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

- // Trim the candidate pool if it is large. Note that we do not filter out

- // items that do not contain the search terms as proper substrings -- doing

- // so is the performance-costly operation we are trying to avoid in order

- // to maintain omnibox responsiveness.

- const size_t kItemsToScoreLimit = 500;

- pre_filter_item_count_ = history_id_set.size();

- // If we trim the results set we do not want to cache the results for next

- // time as the user's ultimately desired result could easily be eliminated

- // in this early rough filter.

- bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);

- if (was_trimmed) {

- HistoryIDVector history_ids;

- std::copy(history_id_set.begin(), history_id_set.end(),

- std::back_inserter(history_ids));

- // Trim down the set by sorting by typed-count, visit-count, and last

- // visit.

- HistoryItemFactorGreater

- item_factor_functor(history_info_map_);

- std::partial_sort(history_ids.begin(),

- history_ids.begin() + kItemsToScoreLimit,

- history_ids.end(),

- item_factor_functor);

- history_id_set.clear();

- std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

- std::inserter(history_id_set, history_id_set.end()));

- post_filter_item_count_ = history_id_set.size();

- }

+ for (const base::string16& search_string : search_strings) {

+ // The search string we receive may contain escaped characters. For reducing

+ // the index we need individual, lower-cased words, ignoring escapings. For

+ // the final filtering we need whitespace separated substrings possibly

+ // containing escaped characters.

+ base::string16 lower_raw_string(base::i18n::ToLower(search_string));

+ base::string16 lower_unescaped_string = net::UnescapeURLComponent(

+ lower_raw_string,

+ net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |

+ net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

+ // Extract individual 'words' (as opposed to 'terms'; see below) from the

+ // search string. When the user types "colspec=ID%20Mstone Release" we get

+ // four 'words': "colspec", "id", "mstone" and "release".

+ String16Vector lower_words(

+ String16VectorFromString16(lower_unescaped_string, false, nullptr));

+ if (lower_words.empty())

+ continue;

+ HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

+ // Trim the candidate pool if it is large. Note that we do not filter out

+ // items that do not contain the search terms as proper substrings --

+ // doing so is the performance-costly operation we are trying to avoid in

+ // order to maintain omnibox responsiveness.

+ const size_t kItemsToScoreLimit = 500;

+ pre_filter_item_count_ += history_id_set.size();

+ // If we trim the results set we do not want to cache the results for next

+ // time as the user's ultimately desired result could easily be eliminated

+ // in this early rough filter.

+ if (pre_filter_item_count_ > kItemsToScoreLimit) {

+ HistoryIDVector history_ids;

+ std::copy(history_id_set.begin(), history_id_set.end(),

+ std::back_inserter(history_ids));

+ // Trim down the set by sorting by typed-count, visit-count, and last

+ // visit.

+ HistoryItemFactorGreater item_factor_functor(history_info_map_);

+ std::partial_sort(history_ids.begin(),

+ history_ids.begin() + kItemsToScoreLimit,

+ history_ids.end(), item_factor_functor);

+ history_id_set.clear();

+ std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

+ std::inserter(history_id_set, history_id_set.end()));

+ post_filter_item_count_ = post_filter_item_count_ + history_id_set.size();

+ }

+ // Pass over all of the candidates filtering out any without a proper

+ // substring match, inserting those which pass in order by score. Note that

+ // in this step we are using the raw search string complete with escaped

+ // URL elements. When the user has specifically typed something akin to

+ // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

+ // specific substring appears in the URL or page title.

+ // We call these 'terms' (as opposed to 'words'; see above) as in this case

+ // we only want to break up the search string on 'true' whitespace rather

+ // than escaped whitespace. When the user types "colspec=ID%20Mstone

+ // Release" we get two 'terms': "colspec=id%20mstone" and "release".

+ String16Vector lower_raw_terms =

+ base::SplitString(lower_raw_string, base::kWhitespaceUTF16,

+ base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

- // Pass over all of the candidates filtering out any without a proper

- // substring match, inserting those which pass in order by score. Note that

- // in this step we are using the raw search string complete with escaped

- // URL elements. When the user has specifically typed something akin to

- // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

- // specific substring appears in the URL or page title.

- // We call these 'terms' (as opposed to 'words'; see above) as in this case

- // we only want to break up the search string on 'true' whitespace rather than

- // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we

- // get two 'terms': "colspec=id%20mstone" and "release".

- String16Vector lower_raw_terms = base::SplitString(

- lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,

- base::SPLIT_WANT_NONEMPTY);

- if (lower_raw_terms.empty()) {

// Don't score matches when there are no terms to score against. (It's

// possible that the word break iterater that extracts words to search

// for in the database allows some whitespace "words" whereas SplitString

- // excludes a long list of whitespace.) One could write a scoring

- // function that gives a reasonable order to matches when there

- // are no terms (i.e., all the words are some form of whitespace),

- // but this is such a rare edge case that it's not worth the time.

- return scored_items;

- }

- scored_items =

- std::for_each(

- history_id_set.begin(), history_id_set.end(),

- AddHistoryMatch(bookmark_model, template_url_service, *this,

- lower_raw_string, lower_raw_terms, base::Time::Now()))

- .ScoredMatches();

- // If cursor position is set and useful (not at either end of the string),

- // allow the search string to be broken at cursor position. We do this by

- // pretending there's a space where the cursor is.

- // This phenomena will be referred to as space ghosting.

- // Going forward we perform a similar process for obtaining

- // scored items as above. In fact all of the variable names are the same

- // except that they are prepended with 'transformed_' reflecting that these

- // variables are related to space ghosting.

- if ((cursor_position != base::string16::npos) &&

- (cursor_position < search_string.length()) && (cursor_position > 0)) {

- // The original search_string broken at cursor position.

- base::string16 transformed_search_string(search_string);

- transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

- // The same as lower_raw_string and defined for the same reasons.

- base::string16 transformed_lower_raw_string(

- base::i18n::ToLower(transformed_search_string));

- String16Vector transformed_lower_raw_terms =

- base::SplitString(transformed_lower_raw_string, base::kWhitespaceUTF16,

- base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

- if (!transformed_lower_raw_terms.empty()) {

- base::string16 transormed_lower_unescaped_string =

- net::UnescapeURLComponent(

- transformed_lower_raw_string,

- net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |

- net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

- String16Vector transformed_lower_words(String16VectorFromString16(

- transormed_lower_unescaped_string, false, nullptr));

- HistoryIDSet transformed_history_id_set =

- HistoryIDSetFromWords(transformed_lower_words);

- if (history_id_set != transformed_history_id_set) {

- pre_filter_item_count_ = history_id_set.size();

- // Trim the candidate pool if it is large as above.

- was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);

- if (was_trimmed) {

- HistoryIDVector transformed_history_ids;

- std::copy(transformed_history_id_set.begin(),

- transformed_history_id_set.end(),

- std::back_inserter(transformed_history_ids));

- // Trim down the set by sorting by typed-count, visit-count, and last

- // visit.

- HistoryItemFactorGreater item_factor_functor(history_info_map_);

- std::partial_sort(

- transformed_history_ids.begin(),

- transformed_history_ids.begin() + kItemsToScoreLimit,

- transformed_history_ids.end(), item_factor_functor);

- transformed_history_id_set.clear();

- std::copy(transformed_history_ids.begin(),

- transformed_history_ids.begin() + kItemsToScoreLimit,

- std::inserter(transformed_history_id_set,

- transformed_history_id_set.end()));

- }

- history_id_set.insert(transformed_history_id_set.begin(),

- transformed_history_id_set.end());

- ScoredHistoryMatches transformed_scored_items =

- std::for_each(

- history_id_set.begin(), history_id_set.end(),

- AddHistoryMatch(bookmark_model, template_url_service, *this,

- transformed_lower_raw_string,

- transformed_lower_raw_terms, base::Time::Now()))

- .ScoredMatches();

- scored_items.insert(scored_items.end(),

- transformed_scored_items.begin(),

- transformed_scored_items.end());

- }

+ // excludes a long list of whitespace.) One could write a scoring function

+ // that gives a reasonable order to matches when there are no terms (i.e.,

+ // all the words are some form of whitespace), but this is such a rare edge

+ // case that it's not worth the time.

+ if (lower_raw_terms.empty())

+ continue;

+ ScoredHistoryMatches temp_scored_items =

+ std::for_each(history_id_set.begin(), history_id_set.end(),

+ AddHistoryMatch(bookmark_model, template_url_service,

+ *this, lower_raw_string, lower_raw_terms,

+ base::Time::Now()))

+ .ScoredMatches();

+ scored_items.insert(scored_items.end(), temp_scored_items.begin(),

+ temp_scored_items.end());

}

// Select and sort only the top |max_matches| results.

if (scored_items.size() > max_matches) {

@@ -330,8 +272,7 @@ ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

ScoredHistoryMatch::MatchScoreGreater);

}

post_scoring_item_count_ = scored_items.size();

- if (was_trimmed) {

+ if (pre_filter_item_count_ > post_filter_item_count_) {

search_term_cache_.clear(); // Invalidate the term cache.

} else {

// Remove any stale SearchTermCacheItems.

« no previous file with comments | « no previous file | no next file » | no next file with comments »