Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(840)

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2364553004: Updated and refactored URLIndexPrivateData::HistoryItemsForTerms to handle searching the history fo… (Closed)
Patch Set: Fixed bug to properly increment post_filter_item_count_. Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « components/omnibox/browser/history_quick_provider_unittest.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/omnibox/browser/url_index_private_data.h" 5 #include "components/omnibox/browser/url_index_private_data.h"
6 6
7 #include <stdint.h> 7 #include <stdint.h>
8 8
9 #include <functional> 9 #include <functional>
10 #include <iterator> 10 #include <iterator>
(...skipping 135 matching lines...) Expand 10 before | Expand all | Expand 10 after
146 146
147 URLIndexPrivateData::URLIndexPrivateData() 147 URLIndexPrivateData::URLIndexPrivateData()
148 : restored_cache_version_(0), 148 : restored_cache_version_(0),
149 saved_cache_version_(kCurrentCacheFileVersion), 149 saved_cache_version_(kCurrentCacheFileVersion),
150 pre_filter_item_count_(0), 150 pre_filter_item_count_(0),
151 post_filter_item_count_(0), 151 post_filter_item_count_(0),
152 post_scoring_item_count_(0) { 152 post_scoring_item_count_(0) {
153 } 153 }
154 154
155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( 155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
156 base::string16 search_string, 156 base::string16 original_search_string,
157 size_t cursor_position, 157 size_t cursor_position,
158 size_t max_matches, 158 size_t max_matches,
159 bookmarks::BookmarkModel* bookmark_model, 159 bookmarks::BookmarkModel* bookmark_model,
160 TemplateURLService* template_url_service) { 160 TemplateURLService* template_url_service) {
161 // If cursor position is set and useful (not at either end of the
162 // string), allow the search string to be broken at cursor position.
163 // We do this by pretending there's a space where the cursor is.
164 if ((cursor_position != base::string16::npos) &&
165 (cursor_position < search_string.length()) &&
166 (cursor_position > 0)) {
167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
168 }
169 pre_filter_item_count_ = 0; 161 pre_filter_item_count_ = 0;
170 post_filter_item_count_ = 0; 162 post_filter_item_count_ = 0;
171 post_scoring_item_count_ = 0; 163 post_scoring_item_count_ = 0;
172 // The search string we receive may contain escaped characters. For reducing 164
173 // the index we need individual, lower-cased words, ignoring escapings. For 165 // This list will contain the original search string and any other string
174 // the final filtering we need whitespace separated substrings possibly 166 // transformations.
175 // containing escaped characters. 167 String16Vector search_strings;
176 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); 168 search_strings.push_back(original_search_string);
177 base::string16 lower_unescaped_string = 169 if ((cursor_position != base::string16::npos) &&
178 net::UnescapeURLComponent(lower_raw_string, 170 (cursor_position < original_search_string.length()) &&
179 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | 171 (cursor_position > 0)) {
180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); 172 // The original search_string broken at cursor position. This is one type of
181 // Extract individual 'words' (as opposed to 'terms'; see below) from the 173 // transformation.
182 // search string. When the user types "colspec=ID%20Mstone Release" we get 174 base::string16 transformed_search_string(original_search_string);
183 // four 'words': "colspec", "id", "mstone" and "release". 175 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
184 String16Vector lower_words( 176 search_strings.push_back(transformed_search_string);
185 String16VectorFromString16(lower_unescaped_string, false, nullptr)); 177 }
178
186 ScoredHistoryMatches scored_items; 179 ScoredHistoryMatches scored_items;
187 180 // Invalidate the term cache and return if we have indexed no words (probably
188 // Do nothing if we have indexed no words (probably because we've not been 181 // because we've not been initialized yet).
189 // initialized yet) or the search string has no words. 182 if (word_list_.empty()) {
190 if (word_list_.empty() || lower_words.empty()) { 183 search_term_cache_.clear();
191 search_term_cache_.clear(); // Invalidate the term cache.
192 return scored_items; 184 return scored_items;
193 } 185 }
194
195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep 186 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
196 // approach. 187 // approach.
197 ResetSearchTermCache(); 188 ResetSearchTermCache();
198 189
199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); 190 for (const base::string16& search_string : search_strings) {
191 // The search string we receive may contain escaped characters. For reducing
192 // the index we need individual, lower-cased words, ignoring escapings. For
193 // the final filtering we need whitespace separated substrings possibly
194 // containing escaped characters.
195 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
196 base::string16 lower_unescaped_string = net::UnescapeURLComponent(
197 lower_raw_string,
198 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
199 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
200 200
201 // Trim the candidate pool if it is large. Note that we do not filter out 201 // Extract individual 'words' (as opposed to 'terms'; see below) from the
202 // items that do not contain the search terms as proper substrings -- doing 202 // search string. When the user types "colspec=ID%20Mstone Release" we get
203 // so is the performance-costly operation we are trying to avoid in order 203 // four 'words': "colspec", "id", "mstone" and "release".
204 // to maintain omnibox responsiveness. 204 String16Vector lower_words(
205 const size_t kItemsToScoreLimit = 500; 205 String16VectorFromString16(lower_unescaped_string, false, nullptr));
206 pre_filter_item_count_ = history_id_set.size(); 206 if (lower_words.empty())
207 // If we trim the results set we do not want to cache the results for next 207 continue;
208 // time as the user's ultimately desired result could easily be eliminated 208 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
209 // in this early rough filter. 209 // Trim the candidate pool if it is large. Note that we do not filter out
210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); 210 // items that do not contain the search terms as proper substrings --
211 if (was_trimmed) { 211 // doing so is the performance-costly operation we are trying to avoid in
212 HistoryIDVector history_ids; 212 // order to maintain omnibox responsiveness.
213 std::copy(history_id_set.begin(), history_id_set.end(), 213 const size_t kItemsToScoreLimit = 500;
214 std::back_inserter(history_ids)); 214 pre_filter_item_count_ += history_id_set.size();
215 // Trim down the set by sorting by typed-count, visit-count, and last 215 // If we trim the results set we do not want to cache the results for next
216 // visit. 216 // time as the user's ultimately desired result could easily be eliminated
217 HistoryItemFactorGreater 217 // in this early rough filter.
218 item_factor_functor(history_info_map_); 218 if (pre_filter_item_count_ > kItemsToScoreLimit) {
219 std::partial_sort(history_ids.begin(), 219 HistoryIDVector history_ids;
220 history_ids.begin() + kItemsToScoreLimit, 220 std::copy(history_id_set.begin(), history_id_set.end(),
221 history_ids.end(), 221 std::back_inserter(history_ids));
222 item_factor_functor); 222 // Trim down the set by sorting by typed-count, visit-count, and last
223 history_id_set.clear(); 223 // visit.
224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, 224 HistoryItemFactorGreater item_factor_functor(history_info_map_);
225 std::inserter(history_id_set, history_id_set.end())); 225 std::partial_sort(history_ids.begin(),
226 post_filter_item_count_ = history_id_set.size(); 226 history_ids.begin() + kItemsToScoreLimit,
227 } 227 history_ids.end(), item_factor_functor);
228 history_id_set.clear();
Mark P 2016/09/23 18:02:13 Should we be clearing post_filter_item_count_ here
Lavar Askew 2016/09/23 19:07:48 I disagree because history_id_set is reinitialized
Mark P 2016/09/23 19:32:14 Ah, good point.
229 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
230 std::inserter(history_id_set, history_id_set.end()));
231 post_filter_item_count_ += history_id_set.size();
232 } else {
233 post_filter_item_count_ += pre_filter_item_count_;
234 }
235 // Pass over all of the candidates filtering out any without a proper
236 // substring match, inserting those which pass in order by score. Note that
237 // in this step we are using the raw search string complete with escaped
238 // URL elements. When the user has specifically typed something akin to
239 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
240 // specific substring appears in the URL or page title.
228 241
229 // Pass over all of the candidates filtering out any without a proper 242 // We call these 'terms' (as opposed to 'words'; see above) as in this case
230 // substring match, inserting those which pass in order by score. Note that 243 // we only want to break up the search string on 'true' whitespace rather
231 // in this step we are using the raw search string complete with escaped 244 // than escaped whitespace. When the user types "colspec=ID%20Mstone
232 // URL elements. When the user has specifically typed something akin to 245 // Release" we get two 'terms': "colspec=id%20mstone" and "release".
233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that 246 String16Vector lower_raw_terms =
234 // specific substring appears in the URL or page title. 247 base::SplitString(lower_raw_string, base::kWhitespaceUTF16,
248 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
235 249
236 // We call these 'terms' (as opposed to 'words'; see above) as in this case
237 // we only want to break up the search string on 'true' whitespace rather than
238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
239 // get two 'terms': "colspec=id%20mstone" and "release".
240 String16Vector lower_raw_terms = base::SplitString(
241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,
242 base::SPLIT_WANT_NONEMPTY);
243 if (lower_raw_terms.empty()) {
244 // Don't score matches when there are no terms to score against. (It's 250 // Don't score matches when there are no terms to score against. (It's
245 // possible that the word break iterater that extracts words to search 251 // possible that the word break iterater that extracts words to search
246 // for in the database allows some whitespace "words" whereas SplitString 252 // for in the database allows some whitespace "words" whereas SplitString
247 // excludes a long list of whitespace.) One could write a scoring 253 // excludes a long list of whitespace.) One could write a scoring function
248 // function that gives a reasonable order to matches when there 254 // that gives a reasonable order to matches when there are no terms (i.e.,
249 // are no terms (i.e., all the words are some form of whitespace), 255 // all the words are some form of whitespace), but this is such a rare edge
250 // but this is such a rare edge case that it's not worth the time. 256 // case that it's not worth the time.
251 return scored_items; 257 if (lower_raw_terms.empty())
258 continue;
259 ScoredHistoryMatches temp_scored_items =
260 std::for_each(history_id_set.begin(), history_id_set.end(),
261 AddHistoryMatch(bookmark_model, template_url_service,
262 *this, lower_raw_string, lower_raw_terms,
263 base::Time::Now()))
264 .ScoredMatches();
265 scored_items.insert(scored_items.end(), temp_scored_items.begin(),
266 temp_scored_items.end());
252 } 267 }
253 scored_items =
254 std::for_each(
255 history_id_set.begin(), history_id_set.end(),
256 AddHistoryMatch(bookmark_model, template_url_service, *this,
257 lower_raw_string, lower_raw_terms,
258 base::Time::Now())).ScoredMatches();
259
260 // Select and sort only the top |max_matches| results. 268 // Select and sort only the top |max_matches| results.
261 if (scored_items.size() > max_matches) { 269 if (scored_items.size() > max_matches) {
262 std::partial_sort(scored_items.begin(), 270 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches,
263 scored_items.begin() +
264 max_matches,
265 scored_items.end(), 271 scored_items.end(),
266 ScoredHistoryMatch::MatchScoreGreater); 272 ScoredHistoryMatch::MatchScoreGreater);
267 scored_items.resize(max_matches); 273 scored_items.resize(max_matches);
268 } else { 274 } else {
269 std::sort(scored_items.begin(), scored_items.end(), 275 std::sort(scored_items.begin(), scored_items.end(),
270 ScoredHistoryMatch::MatchScoreGreater); 276 ScoredHistoryMatch::MatchScoreGreater);
271 } 277 }
272 post_scoring_item_count_ = scored_items.size(); 278 post_scoring_item_count_ = scored_items.size();
273 279 if (pre_filter_item_count_ > post_filter_item_count_) {
274 if (was_trimmed) {
275 search_term_cache_.clear(); // Invalidate the term cache. 280 search_term_cache_.clear(); // Invalidate the term cache.
276 } else { 281 } else {
277 // Remove any stale SearchTermCacheItems. 282 // Remove any stale SearchTermCacheItems.
278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); 283 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
279 cache_iter != search_term_cache_.end(); ) { 284 cache_iter != search_term_cache_.end(); ) {
280 if (!cache_iter->second.used_) 285 if (!cache_iter->second.used_)
281 search_term_cache_.erase(cache_iter++); 286 search_term_cache_.erase(cache_iter++);
282 else 287 else
283 ++cache_iter; 288 ++cache_iter;
284 } 289 }
(...skipping 1069 matching lines...) Expand 10 before | Expand all | Expand 10 after
1354 // First cut: typed count, visit count, recency. 1359 // First cut: typed count, visit count, recency.
1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks 1360 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1356 // recently visited (within the last 12/24 hours) as highly important. Get 1361 // recently visited (within the last 12/24 hours) as highly important. Get
1357 // input from mpearson. 1362 // input from mpearson.
1358 if (r1.typed_count() != r2.typed_count()) 1363 if (r1.typed_count() != r2.typed_count())
1359 return (r1.typed_count() > r2.typed_count()); 1364 return (r1.typed_count() > r2.typed_count());
1360 if (r1.visit_count() != r2.visit_count()) 1365 if (r1.visit_count() != r2.visit_count())
1361 return (r1.visit_count() > r2.visit_count()); 1366 return (r1.visit_count() > r2.visit_count());
1362 return (r1.last_visit() > r2.last_visit()); 1367 return (r1.last_visit() > r2.last_visit());
1363 } 1368 }
OLDNEW
« no previous file with comments | « components/omnibox/browser/history_quick_provider_unittest.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698