Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(428)

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: I tried to remove duplicates and improve effciency. I appreciated the comments. Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/omnibox/browser/url_index_private_data.h" 5 #include "components/omnibox/browser/url_index_private_data.h"
6 6
7 #include <stdint.h> 7 #include <stdint.h>
8 8
9 #include <functional> 9 #include <functional>
10 #include <iterator> 10 #include <iterator>
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
74 WordStartsMapEntry; 74 WordStartsMapEntry;
75 75
76 // Algorithm Functions --------------------------------------------------------- 76 // Algorithm Functions ---------------------------------------------------------
77 77
78 // Comparison function for sorting search terms by descending length. 78 // Comparison function for sorting search terms by descending length.
79 bool LengthGreater(const base::string16& string_a, 79 bool LengthGreater(const base::string16& string_a,
80 const base::string16& string_b) { 80 const base::string16& string_b) {
81 return string_a.length() > string_b.length(); 81 return string_a.length() > string_b.length();
82 } 82 }
83 83
84 bool Less(const base::string16& string_a, const base::string16& string_b) {
85 return (string_a.compare(string_b) <= 0);
86 }
84 87
85 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- 88 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
86 89
87 // HistoryDBTask used to update the recent visit data for a particular 90 // HistoryDBTask used to update the recent visit data for a particular
88 // row from the history database. 91 // row from the history database.
89 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask { 92 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {
90 public: 93 public:
91 explicit UpdateRecentVisitsFromHistoryDBTask( 94 explicit UpdateRecentVisitsFromHistoryDBTask(
92 URLIndexPrivateData* private_data, 95 URLIndexPrivateData* private_data,
93 history::URLID url_id); 96 history::URLID url_id);
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
145 // URLIndexPrivateData --------------------------------------------------------- 148 // URLIndexPrivateData ---------------------------------------------------------
146 149
147 URLIndexPrivateData::URLIndexPrivateData() 150 URLIndexPrivateData::URLIndexPrivateData()
148 : restored_cache_version_(0), 151 : restored_cache_version_(0),
149 saved_cache_version_(kCurrentCacheFileVersion), 152 saved_cache_version_(kCurrentCacheFileVersion),
150 pre_filter_item_count_(0), 153 pre_filter_item_count_(0),
151 post_filter_item_count_(0), 154 post_filter_item_count_(0),
152 post_scoring_item_count_(0) { 155 post_scoring_item_count_(0) {
153 } 156 }
154 157
155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( 158 String16Vector URLIndexPrivateData::ExtractIndividualWordVector(
156 base::string16 search_string, 159 const base::string16& search_string) {
157 size_t cursor_position,
158 size_t max_matches,
159 bookmarks::BookmarkModel* bookmark_model,
160 TemplateURLService* template_url_service) {
161 // If cursor position is set and useful (not at either end of the
162 // string), allow the search string to be broken at cursor position.
163 // We do this by pretending there's a space where the cursor is.
164 if ((cursor_position != base::string16::npos) &&
165 (cursor_position < search_string.length()) &&
166 (cursor_position > 0)) {
167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
168 }
169 pre_filter_item_count_ = 0;
170 post_filter_item_count_ = 0;
171 post_scoring_item_count_ = 0;
172 // The search string we receive may contain escaped characters. For reducing 160 // The search string we receive may contain escaped characters. For reducing
173 // the index we need individual, lower-cased words, ignoring escapings. For 161 // the index we need individual, lower-cased words, ignoring escapings. For
174 // the final filtering we need whitespace separated substrings possibly 162 // the final filtering we need whitespace separated substrings possibly
175 // containing escaped characters. 163 // containing escaped characters.
176 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); 164 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
177 base::string16 lower_unescaped_string = 165 base::string16 lower_unescaped_string = net::UnescapeURLComponent(
178 net::UnescapeURLComponent(lower_raw_string, 166 lower_raw_string,
179 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | 167 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); 168 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
181 // Extract individual 'words' (as opposed to 'terms'; see below) from the 169 // Extract individual 'words' (as opposed to 'terms'; see below) from the
182 // search string. When the user types "colspec=ID%20Mstone Release" we get 170 // search string. When the user types "colspec=ID%20Mstone Release" we get
183 // four 'words': "colspec", "id", "mstone" and "release". 171 // four 'words': "colspec", "id", "mstone" and "release".
184 String16Vector lower_words( 172 return String16VectorFromString16(lower_unescaped_string, false, nullptr);
185 String16VectorFromString16(lower_unescaped_string, false, nullptr)); 173 }
186 ScoredHistoryMatches scored_items;
187 174
175 HistoryIDSet URLIndexPrivateData::GetHistoryIDSet(
176 const String16Vector& lower_words) {
188 // Do nothing if we have indexed no words (probably because we've not been 177 // Do nothing if we have indexed no words (probably because we've not been
189 // initialized yet) or the search string has no words. 178 // initialized yet) or the search string has no words.
190 if (word_list_.empty() || lower_words.empty()) { 179 if (word_list_.empty() || lower_words.empty()) {
191 search_term_cache_.clear(); // Invalidate the term cache. 180 search_term_cache_.clear(); // Invalidate the term cache.
192 return scored_items; 181 return HistoryIDSet();
193 } 182 }
194 183
195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep 184 return HistoryIDSetFromWords(lower_words);
196 // approach. 185 }
197 ResetSearchTermCache();
198 186
199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); 187 ScoredHistoryMatches URLIndexPrivateData::GetScoredItemsForSearchString(
200 188 const base::string16& search_string,
201 // Trim the candidate pool if it is large. Note that we do not filter out 189 const HistoryIDSet& history_id_set,
202 // items that do not contain the search terms as proper substrings -- doing 190 const size_t& max_matches,
203 // so is the performance-costly operation we are trying to avoid in order 191 bookmarks::BookmarkModel* bookmark_model,
204 // to maintain omnibox responsiveness. 192 TemplateURLService* template_url_service) {
205 const size_t kItemsToScoreLimit = 500; 193 ScoredHistoryMatches scored_items;
206 pre_filter_item_count_ = history_id_set.size();
207 // If we trim the results set we do not want to cache the results for next
208 // time as the user's ultimately desired result could easily be eliminated
209 // in this early rough filter.
210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
211 if (was_trimmed) {
212 HistoryIDVector history_ids;
213 std::copy(history_id_set.begin(), history_id_set.end(),
214 std::back_inserter(history_ids));
215 // Trim down the set by sorting by typed-count, visit-count, and last
216 // visit.
217 HistoryItemFactorGreater
218 item_factor_functor(history_info_map_);
219 std::partial_sort(history_ids.begin(),
220 history_ids.begin() + kItemsToScoreLimit,
221 history_ids.end(),
222 item_factor_functor);
223 history_id_set.clear();
224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
225 std::inserter(history_id_set, history_id_set.end()));
226 post_filter_item_count_ = history_id_set.size();
227 }
228 194
229 // Pass over all of the candidates filtering out any without a proper 195 // Pass over all of the candidates filtering out any without a proper
230 // substring match, inserting those which pass in order by score. Note that 196 // substring match, inserting those which pass in order by score. Note that
231 // in this step we are using the raw search string complete with escaped 197 // in this step we are using the raw search string complete with escaped
232 // URL elements. When the user has specifically typed something akin to 198 // URL elements. When the user has specifically typed something akin to
233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that 199 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
234 // specific substring appears in the URL or page title. 200 // specific substring appears in the URL or page title.
235 201
236 // We call these 'terms' (as opposed to 'words'; see above) as in this case 202 // We call these 'terms' (as opposed to 'words'; see above) as in this case
237 // we only want to break up the search string on 'true' whitespace rather than 203 // we only want to break up the search string on 'true' whitespace rather than
238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we 204 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
239 // get two 'terms': "colspec=id%20mstone" and "release". 205 // get two 'terms': "colspec=id%20mstone" and "release".
206 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
240 String16Vector lower_raw_terms = base::SplitString( 207 String16Vector lower_raw_terms = base::SplitString(
241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, 208 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,
242 base::SPLIT_WANT_NONEMPTY); 209 base::SPLIT_WANT_NONEMPTY);
210
243 if (lower_raw_terms.empty()) { 211 if (lower_raw_terms.empty()) {
244 // Don't score matches when there are no terms to score against. (It's 212 // Don't score matches when there are no terms to score against. (It's
245 // possible that the word break iterater that extracts words to search 213 // possible that the word break iterater that extracts words to search
246 // for in the database allows some whitespace "words" whereas SplitString 214 // for in the database allows some whitespace "words" whereas SplitString
247 // excludes a long list of whitespace.) One could write a scoring 215 // excludes a long list of whitespace.) One could write a scoring
248 // function that gives a reasonable order to matches when there 216 // function that gives a reasonable order to matches when there
249 // are no terms (i.e., all the words are some form of whitespace), 217 // are no terms (i.e., all the words are some form of whitespace),
250 // but this is such a rare edge case that it's not worth the time. 218 // but this is such a rare edge case that it's not worth the time.
251 return scored_items; 219 return scored_items;
252 } 220 }
253 scored_items = 221 scored_items =
254 std::for_each( 222 std::for_each(
255 history_id_set.begin(), history_id_set.end(), 223 history_id_set.begin(), history_id_set.end(),
256 AddHistoryMatch(bookmark_model, template_url_service, *this, 224 AddHistoryMatch(bookmark_model, template_url_service, *this,
257 lower_raw_string, lower_raw_terms, 225 lower_raw_string, lower_raw_terms,
258 base::Time::Now())).ScoredMatches(); 226 base::Time::Now())).ScoredMatches();
259 227
260 // Select and sort only the top |max_matches| results. 228 // Select and sort only the top |max_matches| results.
229
Mark P 2016/08/18 19:36:53 nit: don't add a blank line
Lavar Askew 2016/08/19 04:46:07 Done.
261 if (scored_items.size() > max_matches) { 230 if (scored_items.size() > max_matches) {
262 std::partial_sort(scored_items.begin(), 231 std::partial_sort(scored_items.begin(),
263 scored_items.begin() + 232 scored_items.begin() +
264 max_matches, 233 max_matches,
265 scored_items.end(), 234 scored_items.end(),
266 ScoredHistoryMatch::MatchScoreGreater); 235 ScoredHistoryMatch::MatchScoreGreater);
267 scored_items.resize(max_matches); 236 scored_items.resize(max_matches);
268 } else { 237 } else {
269 std::sort(scored_items.begin(), scored_items.end(), 238 std::sort(scored_items.begin(), scored_items.end(),
270 ScoredHistoryMatch::MatchScoreGreater); 239 ScoredHistoryMatch::MatchScoreGreater);
271 } 240 }
272 post_scoring_item_count_ = scored_items.size(); 241
242 return scored_items;
243 }
244
245 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
246 base::string16 search_string,
247 size_t cursor_position,
248 size_t max_matches,
249 bookmarks::BookmarkModel* bookmark_model,
250 TemplateURLService* template_url_service) {
251 pre_filter_item_count_ = 0;
252 post_filter_item_count_ = 0;
253 post_scoring_item_count_ = 0;
254
255 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
256 // approach.
257 ResetSearchTermCache();
258
259 String16Vector lower_words = ExtractIndividualWordVector(search_string);
260 HistoryIDSet history_id_set = GetHistoryIDSet(lower_words);
261
262 // If cursor position is set and useful (not at either end of the
263 // string), allow the search string to be broken at cursor position.
264 // We do this by pretending there's a space where the cursor is.
265 if ((cursor_position != base::string16::npos) &&
266 (cursor_position < search_string.length()) && (cursor_position > 0)) {
267 base::string16 search_string_with_word_break(search_string);
268 HistoryIDSet history_id_set_with_word_break;
Mark P 2016/08/18 19:36:53 This declaration doesn't need to be at this level;
Lavar Askew 2016/08/19 04:46:07 Can't believe I missed that...Done.
269
270 base::string16 word_break = base::ASCIIToUTF16(" ");
271 search_string_with_word_break.insert(cursor_position, word_break);
272
273 // Add to history_id_set the ids that are related to the original
274 // search string, but with the word break.
275 String16Vector lower_words_with_word_break =
276 ExtractIndividualWordVector(search_string_with_word_break);
277
278 // check to see if lower_words and lower_words_with_word_break
Mark P 2016/08/18 19:36:53 nit: start sentence with capital letter ("Check"..
Lavar Askew 2016/08/19 04:46:07 Done.
279 // are the same size. If they are then this could indicate that
280 // the word vectors contain the same words
281 if (lower_words.size() != lower_words_with_word_break.size()) {
282 std::sort(lower_words.begin(), lower_words.end(), Less);
283 std::sort(lower_words_with_word_break.begin(),
284 lower_words_with_word_break.end(), Less);
285
Mark P 2016/08/18 19:36:53 nit: no need for this blank line I think
286 if (lower_words != lower_words_with_word_break) {
287 history_id_set_with_word_break =
288 GetHistoryIDSet(lower_words_with_word_break);
289
Mark P 2016/08/18 19:36:53 nit: no need for this blank line I think
290 history_id_set.insert(history_id_set_with_word_break.begin(),
291 history_id_set_with_word_break.end());
292 }
293 }
294 }
295
296 // Trim the candidate pool if it is large. Note that we do not filter out
297 // items that do not contain the search terms as proper substrings -- doing
298 // so is the performance-costly operation we are trying to avoid in order
299 // to maintain omnibox responsiveness.
300 pre_filter_item_count_ = history_id_set.size();
301 // If we trim the results set we do not want to cache the results for next
302 // time as the user's ultimately desired result could easily be eliminated
303 // in this early rough filter.
304 bool was_trimmed = (pre_filter_item_count_ > max_matches);
305 if (was_trimmed) {
306 HistoryIDVector history_ids;
307 std::copy(history_id_set.begin(), history_id_set.end(),
308 std::back_inserter(history_ids));
309 // Trim down the set by sorting by typed-count, visit-count, and last
310 // visit.
311 HistoryItemFactorGreater item_factor_functor(history_info_map_);
312 std::partial_sort(history_ids.begin(), history_ids.begin() + max_matches,
313 history_ids.end(), item_factor_functor);
314 history_id_set.clear();
315 std::copy(history_ids.begin(), history_ids.begin() + max_matches,
316 std::inserter(history_id_set, history_id_set.end()));
317 post_filter_item_count_ = history_id_set.size();
318 }
319
320 ScoredHistoryMatches scored_items =
321 GetScoredItemsForSearchString(search_string, history_id_set, max_matches,
322 bookmark_model, template_url_service);
273 323
274 if (was_trimmed) { 324 if (was_trimmed) {
275 search_term_cache_.clear(); // Invalidate the term cache. 325 search_term_cache_.clear(); // Invalidate the term cache.
276 } else { 326 } else {
277 // Remove any stale SearchTermCacheItems. 327 // Remove any stale SearchTermCacheItems.
278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); 328 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
279 cache_iter != search_term_cache_.end(); ) { 329 cache_iter != search_term_cache_.end(); ) {
280 if (!cache_iter->second.used_) 330 if (!cache_iter->second.used_) {
281 search_term_cache_.erase(cache_iter++); 331 search_term_cache_.erase(cache_iter++);
282 else 332 } else {
283 ++cache_iter; 333 ++cache_iter;
334 }
284 } 335 }
285 } 336 }
286 337
287 return scored_items; 338 return scored_items;
288 } 339 }
289 340
290 bool URLIndexPrivateData::UpdateURL( 341 bool URLIndexPrivateData::UpdateURL(
291 history::HistoryService* history_service, 342 history::HistoryService* history_service,
292 const history::URLRow& row, 343 const history::URLRow& row,
293 const std::set<std::string>& scheme_whitelist, 344 const std::set<std::string>& scheme_whitelist,
(...skipping 1060 matching lines...) Expand 10 before | Expand all | Expand 10 after
1354 // First cut: typed count, visit count, recency. 1405 // First cut: typed count, visit count, recency.
1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks 1406 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1356 // recently visited (within the last 12/24 hours) as highly important. Get 1407 // recently visited (within the last 12/24 hours) as highly important. Get
1357 // input from mpearson. 1408 // input from mpearson.
1358 if (r1.typed_count() != r2.typed_count()) 1409 if (r1.typed_count() != r2.typed_count())
1359 return (r1.typed_count() > r2.typed_count()); 1410 return (r1.typed_count() > r2.typed_count());
1360 if (r1.visit_count() != r2.visit_count()) 1411 if (r1.visit_count() != r2.visit_count())
1361 return (r1.visit_count() > r2.visit_count()); 1412 return (r1.visit_count() > r2.visit_count());
1362 return (r1.last_visit() > r2.last_visit()); 1413 return (r1.last_visit() > r2.last_visit());
1363 } 1414 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698