Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(687)

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Code review fixes for patch entitled "Generating autocomplete results with and without word breaks … Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/omnibox/browser/url_index_private_data.h" 5 #include "components/omnibox/browser/url_index_private_data.h"
6 6
7 #include <stdint.h> 7 #include <stdint.h>
8 8
9 #include <functional> 9 #include <functional>
10 #include <iterator> 10 #include <iterator>
(...skipping 134 matching lines...) Expand 10 before | Expand all | Expand 10 after
145 // URLIndexPrivateData --------------------------------------------------------- 145 // URLIndexPrivateData ---------------------------------------------------------
146 146
147 URLIndexPrivateData::URLIndexPrivateData() 147 URLIndexPrivateData::URLIndexPrivateData()
148 : restored_cache_version_(0), 148 : restored_cache_version_(0),
149 saved_cache_version_(kCurrentCacheFileVersion), 149 saved_cache_version_(kCurrentCacheFileVersion),
150 pre_filter_item_count_(0), 150 pre_filter_item_count_(0),
151 post_filter_item_count_(0), 151 post_filter_item_count_(0),
152 post_scoring_item_count_(0) { 152 post_scoring_item_count_(0) {
153 } 153 }
154 154
155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( 155 HistoryIDSet URLIndexPrivateData::HistoryItemsForWords(
156 base::string16 search_string, 156 base::string16 search_string) {
157 size_t cursor_position, 157
158 size_t max_matches, 158 HistoryIDSet history_id_set;
Mark P 2016/08/10 17:46:19 This variable isn't needed; simply return HistoryI
Lavar Askew 2016/08/18 03:07:51 Done.
159 bookmarks::BookmarkModel* bookmark_model, 159
160 TemplateURLService* template_url_service) {
161 // If cursor position is set and useful (not at either end of the
162 // string), allow the search string to be broken at cursor position.
163 // We do this by pretending there's a space where the cursor is.
164 if ((cursor_position != base::string16::npos) &&
165 (cursor_position < search_string.length()) &&
166 (cursor_position > 0)) {
167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
168 }
169 pre_filter_item_count_ = 0;
170 post_filter_item_count_ = 0;
171 post_scoring_item_count_ = 0;
172 // The search string we receive may contain escaped characters. For reducing 160 // The search string we receive may contain escaped characters. For reducing
173 // the index we need individual, lower-cased words, ignoring escapings. For 161 // the index we need individual, lower-cased words, ignoring escapings. For
174 // the final filtering we need whitespace separated substrings possibly 162 // the final filtering we need whitespace separated substrings possibly
175 // containing escaped characters. 163 // containing escaped characters.
176 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); 164 base::string16 lower_raw_string(
165 base::i18n::ToLower(search_string));
177 base::string16 lower_unescaped_string = 166 base::string16 lower_unescaped_string =
178 net::UnescapeURLComponent(lower_raw_string, 167 net::UnescapeURLComponent(lower_raw_string,
179 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | 168 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); 169 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
181 // Extract individual 'words' (as opposed to 'terms'; see below) from the 170 // Extract individual 'words' (as opposed to 'terms'; see below) from the
182 // search string. When the user types "colspec=ID%20Mstone Release" we get 171 // search string. When the user types "colspec=ID%20Mstone Release" we get
183 // four 'words': "colspec", "id", "mstone" and "release". 172 // four 'words': "colspec", "id", "mstone" and "release".
184 String16Vector lower_words( 173 String16Vector lower_words(
185 String16VectorFromString16(lower_unescaped_string, false, nullptr)); 174 String16VectorFromString16(
186 ScoredHistoryMatches scored_items; 175 lower_unescaped_string, false, nullptr));
187 176
188 // Do nothing if we have indexed no words (probably because we've not been 177 // Do nothing if we have indexed no words (probably because we've not been
189 // initialized yet) or the search string has no words. 178 // initialized yet) or the search string has no words.
190 if (word_list_.empty() || lower_words.empty()) { 179 if (word_list_.empty() || lower_words.empty()) {
191 search_term_cache_.clear(); // Invalidate the term cache. 180 search_term_cache_.clear(); // Invalidate the term cache.
192 return scored_items; 181 return history_id_set;
193 } 182 }
194 183
195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep 184 history_id_set =
196 // approach. 185 HistoryIDSetFromWords(lower_words);
197 ResetSearchTermCache();
198 186
199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); 187 return history_id_set;
188 }
200 189
201 // Trim the candidate pool if it is large. Note that we do not filter out 190 ScoredHistoryMatches URLIndexPrivateData::GetScoredItemsForSearchString (
202 // items that do not contain the search terms as proper substrings -- doing 191 base::string16 search_string,
203 // so is the performance-costly operation we are trying to avoid in order 192 HistoryIDSet history_id_set,
204 // to maintain omnibox responsiveness. 193 size_t max_matches,
205 const size_t kItemsToScoreLimit = 500; 194 bookmarks::BookmarkModel* bookmark_model,
206 pre_filter_item_count_ = history_id_set.size(); 195 TemplateURLService* template_url_service) {
207 // If we trim the results set we do not want to cache the results for next 196 ScoredHistoryMatches scored_items;
208 // time as the user's ultimately desired result could easily be eliminated
209 // in this early rough filter.
210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
211 if (was_trimmed) {
212 HistoryIDVector history_ids;
213 std::copy(history_id_set.begin(), history_id_set.end(),
214 std::back_inserter(history_ids));
215 // Trim down the set by sorting by typed-count, visit-count, and last
216 // visit.
217 HistoryItemFactorGreater
218 item_factor_functor(history_info_map_);
219 std::partial_sort(history_ids.begin(),
220 history_ids.begin() + kItemsToScoreLimit,
221 history_ids.end(),
222 item_factor_functor);
223 history_id_set.clear();
224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
225 std::inserter(history_id_set, history_id_set.end()));
226 post_filter_item_count_ = history_id_set.size();
227 }
228 197
229 // Pass over all of the candidates filtering out any without a proper 198 // Pass over all of the candidates filtering out any without a proper
230 // substring match, inserting those which pass in order by score. Note that 199 // substring match, inserting those which pass in order by score. Note that
231 // in this step we are using the raw search string complete with escaped 200 // in this step we are using the raw search string complete with escaped
232 // URL elements. When the user has specifically typed something akin to 201 // URL elements. When the user has specifically typed something akin to
233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that 202 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
234 // specific substring appears in the URL or page title. 203 // specific substring appears in the URL or page title.
235 204
236 // We call these 'terms' (as opposed to 'words'; see above) as in this case 205 // We call these 'terms' (as opposed to 'words'; see above) as in this case
237 // we only want to break up the search string on 'true' whitespace rather than 206 // we only want to break up the search string on 'true' whitespace rather than
238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we 207 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
239 // get two 'terms': "colspec=id%20mstone" and "release". 208 // get two 'terms': "colspec=id%20mstone" and "release".
209 base::string16 lower_raw_string(
210 base::i18n::ToLower(search_string));
240 String16Vector lower_raw_terms = base::SplitString( 211 String16Vector lower_raw_terms = base::SplitString(
241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, 212 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,
242 base::SPLIT_WANT_NONEMPTY); 213 base::SPLIT_WANT_NONEMPTY);
214
243 if (lower_raw_terms.empty()) { 215 if (lower_raw_terms.empty()) {
244 // Don't score matches when there are no terms to score against. (It's 216 // Don't score matches when there are no terms to score against. (It's
245 // possible that the word break iterater that extracts words to search 217 // possible that the word break iterater that extracts words to search
246 // for in the database allows some whitespace "words" whereas SplitString 218 // for in the database allows some whitespace "words" whereas SplitString
247 // excludes a long list of whitespace.) One could write a scoring 219 // excludes a long list of whitespace.) One could write a scoring
248 // function that gives a reasonable order to matches when there 220 // function that gives a reasonable order to matches when there
249 // are no terms (i.e., all the words are some form of whitespace), 221 // are no terms (i.e., all the words are some form of whitespace),
250 // but this is such a rare edge case that it's not worth the time. 222 // but this is such a rare edge case that it's not worth the time.
251 return scored_items; 223 return scored_items;
252 } 224 }
253 scored_items = 225 scored_items =
254 std::for_each( 226 std::for_each(
255 history_id_set.begin(), history_id_set.end(), 227 history_id_set.begin(), history_id_set.end(),
256 AddHistoryMatch(bookmark_model, template_url_service, *this, 228 AddHistoryMatch(bookmark_model, template_url_service, *this,
257 lower_raw_string, lower_raw_terms, 229 lower_raw_string, lower_raw_terms,
258 base::Time::Now())).ScoredMatches(); 230 base::Time::Now())).ScoredMatches();
259 231
260 // Select and sort only the top |max_matches| results. 232 // Select and sort only the top |max_matches| results.
261 if (scored_items.size() > max_matches) { 233 if (scored_items.size() > max_matches) {
262 std::partial_sort(scored_items.begin(), 234 std::partial_sort(scored_items.begin(),
263 scored_items.begin() + 235 scored_items.begin() +
264 max_matches, 236 max_matches,
265 scored_items.end(), 237 scored_items.end(),
266 ScoredHistoryMatch::MatchScoreGreater); 238 ScoredHistoryMatch::MatchScoreGreater);
267 scored_items.resize(max_matches); 239 scored_items.resize(max_matches);
268 } else { 240 }
241 else {
269 std::sort(scored_items.begin(), scored_items.end(), 242 std::sort(scored_items.begin(), scored_items.end(),
270 ScoredHistoryMatch::MatchScoreGreater); 243 ScoredHistoryMatch::MatchScoreGreater);
271 } 244 }
272 post_scoring_item_count_ = scored_items.size();
273
274 if (was_trimmed) {
275 search_term_cache_.clear(); // Invalidate the term cache.
276 } else {
277 // Remove any stale SearchTermCacheItems.
278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
279 cache_iter != search_term_cache_.end(); ) {
280 if (!cache_iter->second.used_)
281 search_term_cache_.erase(cache_iter++);
282 else
283 ++cache_iter;
284 }
285 }
286 245
287 return scored_items; 246 return scored_items;
288 } 247 }
289 248
249
250 bool URLIndexPrivateData::TrimCandidatePool (HistoryIDSet history_id_set) {
251
252 // Trim the candidate pool if it is large. Note that we do not filter out
Mark P 2016/08/10 17:46:19 I assume this whole block was copied with no modif
Lavar Askew 2016/08/18 03:07:51 Yes, you are correct. Done.
253 // items that do not contain the search terms as proper substrings -- doing
254 // so is the performance-costly operation we are trying to avoid in order
255 // to maintain omnibox responsiveness.
256 const size_t kItemsToScoreLimit = 500;
257 pre_filter_item_count_ = history_id_set.size();
258 // If we trim the results set we do not want to cache the results for next
259 // time as the user's ultimately desired result could easily be eliminated
260 // in this early rough filter.
261 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
262 if (was_trimmed) {
263 HistoryIDVector history_ids;
264 std::copy(history_id_set.begin(), history_id_set.end(),
265 std::back_inserter(history_ids));
266 // Trim down the set by sorting by typed-count, visit-count, and last
267 // visit.
268 HistoryItemFactorGreater
269 item_factor_functor(history_info_map_);
270 std::partial_sort(history_ids.begin(),
271 history_ids.begin() + kItemsToScoreLimit,
272 history_ids.end(),
273 item_factor_functor);
274 history_id_set.clear();
275 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
276 std::inserter(history_id_set, history_id_set.end()));
277 post_filter_item_count_ = history_id_set.size();
278 }
279
280 return was_trimmed;
281 }
282
283 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
284 base::string16 search_string,
285 size_t cursor_position,
286 size_t max_matches,
287 bookmarks::BookmarkModel* bookmark_model,
288 TemplateURLService* template_url_service) {
289 pre_filter_item_count_ = 0;
290 post_filter_item_count_ = 0;
291 post_scoring_item_count_ = 0;
292
293 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
294 // approach.
295 ResetSearchTermCache();
296
297 HistoryIDSet history_id_set = HistoryItemsForWords(search_string);
298
299 bool history_id_set_without_word_break_was_trimmed =
300 TrimCandidatePool(history_id_set);
301
302 ScoredHistoryMatches scored_items_without_word_break =
303 GetScoredItemsForSearchString(
304 search_string,
305 history_id_set,
306 max_matches,
307 bookmark_model,
308 template_url_service);
309
310 ScoredHistoryMatches all_scored_items;
311 ScoredHistoryMatches scored_items_with_word_break;
312
313 bool history_id_set_with_word_break_was_trimmed = false;
314
315 // If cursor position is set and useful (not at either end of the
316 // string), allow the search string to be broken at cursor position.
317 // We do this by pretending there's a space where the cursor is.
318 if ((cursor_position != base::string16::npos) &&
319 (cursor_position < search_string.length()) &&
320 (cursor_position > 0)) {
321 base::string16 search_string_with_word_break = search_string;
322 HistoryIDSet history_id_set_with_word_break;
323
324 base::string16 word_break =
325 base::ASCIIToUTF16(" ");
326 search_string_with_word_break.insert(cursor_position, word_break);
327
328 // Add to history_id_set the ids that are related to the original
329 // search string, but with the word break.
330 history_id_set_with_word_break =
331 HistoryItemsForWords(search_string_with_word_break);
Mark P 2016/08/10 17:46:19 This might do unnecessary work if the new search s
332
333 history_id_set_with_word_break_was_trimmed =
334 TrimCandidatePool (history_id_set_with_word_break);
335
336 scored_items_with_word_break = GetScoredItemsForSearchString(
337 search_string_with_word_break,
338 history_id_set_with_word_break,
339 max_matches,
340 bookmark_model,
341 template_url_service);
342 }
343
344 if (history_id_set_without_word_break_was_trimmed ||
345 history_id_set_with_word_break_was_trimmed) {
346 search_term_cache_.clear(); // Invalidate the term cache.
347 }
348 else {
349 // Remove any stale SearchTermCacheItems.
350 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
351 cache_iter != search_term_cache_.end(); ) {
352 if (!cache_iter->second.used_) {
353 search_term_cache_.erase(cache_iter++);
354 }
355 else {
356 ++cache_iter;
357 }
358 }
359 }
360
361 // all_scored_items represents the unification the
362 //ScoredHistoryMatches for the search string with and without the word break.
Mark P 2016/08/10 17:46:19 The code below can be made more efficient and easi
Lavar Askew 2016/08/18 03:07:51 Done.
363 if (scored_items_without_word_break.size() > 0 &&
364 scored_items_with_word_break.size() > 0) {
365 post_scoring_item_count_ =
366 scored_items_without_word_break.size()
367 + scored_items_with_word_break.size();
368
369 all_scored_items.reserve(post_scoring_item_count_);
370 all_scored_items.insert(all_scored_items.end(),
371 scored_items_without_word_break.begin(),
372 scored_items_without_word_break.end());
373 all_scored_items.insert(all_scored_items.end(),
374 scored_items_with_word_break.begin(),
375 scored_items_with_word_break.end());
376 }
377 else if (scored_items_without_word_break.size() == 0 &&
378 scored_items_with_word_break.size() > 0) {
379 all_scored_items = scored_items_with_word_break;
380 }
381 else if (scored_items_without_word_break.size() > 0 &&
382 scored_items_with_word_break.size() == 0) {
383 all_scored_items = scored_items_without_word_break;
384 }
385
Mark P 2016/08/10 17:46:19 I could imagine situations where the items overlap
Lavar Askew 2016/08/18 03:07:51 Done.
386 return all_scored_items;
Mark P 2016/08/10 17:46:19 Also, the number of items we're supposed to return
Lavar Askew 2016/08/18 03:07:52 Done.
387 }
388
290 bool URLIndexPrivateData::UpdateURL( 389 bool URLIndexPrivateData::UpdateURL(
291 history::HistoryService* history_service, 390 history::HistoryService* history_service,
292 const history::URLRow& row, 391 const history::URLRow& row,
293 const std::set<std::string>& scheme_whitelist, 392 const std::set<std::string>& scheme_whitelist,
294 base::CancelableTaskTracker* tracker) { 393 base::CancelableTaskTracker* tracker) {
295 // The row may or may not already be in our index. If it is not already 394 // The row may or may not already be in our index. If it is not already
296 // indexed and it qualifies then it gets indexed. If it is already 395 // indexed and it qualifies then it gets indexed. If it is already
297 // indexed and still qualifies then it gets updated, otherwise it 396 // indexed and still qualifies then it gets updated, otherwise it
298 // is deleted from the index. 397 // is deleted from the index.
299 bool row_was_updated = false; 398 bool row_was_updated = false;
(...skipping 1054 matching lines...) Expand 10 before | Expand all | Expand 10 after
1354 // First cut: typed count, visit count, recency. 1453 // First cut: typed count, visit count, recency.
1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks 1454 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1356 // recently visited (within the last 12/24 hours) as highly important. Get 1455 // recently visited (within the last 12/24 hours) as highly important. Get
1357 // input from mpearson. 1456 // input from mpearson.
1358 if (r1.typed_count() != r2.typed_count()) 1457 if (r1.typed_count() != r2.typed_count())
1359 return (r1.typed_count() > r2.typed_count()); 1458 return (r1.typed_count() > r2.typed_count());
1360 if (r1.visit_count() != r2.visit_count()) 1459 if (r1.visit_count() != r2.visit_count())
1361 return (r1.visit_count() > r2.visit_count()); 1460 return (r1.visit_count() > r2.visit_count());
1362 return (r1.last_visit() > r2.last_visit()); 1461 return (r1.last_visit() > r2.last_visit());
1363 } 1462 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698