Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(508)

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2363463002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed)
Patch Set: Resubmitting patch for closed code review with iss number 2363463002. Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/omnibox/browser/url_index_private_data.h" 5 #include "components/omnibox/browser/url_index_private_data.h"
6 6
7 #include <stdint.h> 7 #include <stdint.h>
8 8
9 #include <functional> 9 #include <functional>
10 #include <iterator> 10 #include <iterator>
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
133 } 133 }
134 134
135 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() { 135 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
136 if (succeeded_) 136 if (succeeded_)
137 private_data_->UpdateRecentVisits(url_id_, recent_visits_); 137 private_data_->UpdateRecentVisits(url_id_, recent_visits_);
138 } 138 }
139 139
140 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() { 140 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
141 } 141 }
142 142
143
144 // URLIndexPrivateData --------------------------------------------------------- 143 // URLIndexPrivateData ---------------------------------------------------------
145 144
146 URLIndexPrivateData::URLIndexPrivateData() 145 URLIndexPrivateData::URLIndexPrivateData()
147 : restored_cache_version_(0), 146 : restored_cache_version_(0),
148 saved_cache_version_(kCurrentCacheFileVersion), 147 saved_cache_version_(kCurrentCacheFileVersion),
149 pre_filter_item_count_(0), 148 pre_filter_item_count_(0),
150 post_filter_item_count_(0), 149 post_filter_item_count_(0),
151 post_scoring_item_count_(0) { 150 post_scoring_item_count_(0) {
152 } 151 }
153 152
154 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( 153 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
155 base::string16 search_string, 154 base::string16 original_search_string,
156 size_t cursor_position, 155 size_t cursor_position,
157 size_t max_matches, 156 size_t max_matches,
158 bookmarks::BookmarkModel* bookmark_model, 157 bookmarks::BookmarkModel* bookmark_model,
159 TemplateURLService* template_url_service) { 158 TemplateURLService* template_url_service) {
160 pre_filter_item_count_ = 0; 159 pre_filter_item_count_ = 0;
161 post_filter_item_count_ = 0; 160 post_filter_item_count_ = 0;
162 post_scoring_item_count_ = 0; 161 post_scoring_item_count_ = 0;
163 // The search string we receive may contain escaped characters. For reducing
164 // the index we need individual, lower-cased words, ignoring escapings. For
165 // the final filtering we need whitespace separated substrings possibly
166 // containing escaped characters.
167 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
168 base::string16 lower_unescaped_string =
169 net::UnescapeURLComponent(lower_raw_string,
170 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
171 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
172 162
173 // Extract individual 'words' (as opposed to 'terms'; see below) from the 163 // This list will contain the original search string and any other string
174 // search string. When the user types "colspec=ID%20Mstone Release" we get 164 // transformations.
175 // four 'words': "colspec", "id", "mstone" and "release". 165 String16Vector search_strings;
176 String16Vector lower_words( 166 search_strings.push_back(original_search_string);
177 String16VectorFromString16(lower_unescaped_string, false, nullptr)); 167 if ((cursor_position != base::string16::npos) &&
168 (cursor_position < original_search_string.length()) &&
169 (cursor_position > 0)) {
170 // The original search_string broken at cursor position. This is one type of
171 // transformation.
172 base::string16 transformed_search_string(original_search_string);
173 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
174 search_strings.push_back(transformed_search_string);
175 }
176
178 ScoredHistoryMatches scored_items; 177 ScoredHistoryMatches scored_items;
179 178 // Invalidate the term cache and return if we have indexed no words (probably
180 // Do nothing if we have indexed no words (probably because we've not been 179 // because we've not been initialized yet).
181 // initialized yet) or the search string has no words. 180 if (word_list_.empty()) {
182 if (word_list_.empty() || lower_words.empty()) { 181 search_term_cache_.clear();
183 search_term_cache_.clear(); // Invalidate the term cache.
184 return scored_items; 182 return scored_items;
185 } 183 }
186
187 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep 184 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
188 // approach. 185 // approach.
189 ResetSearchTermCache(); 186 ResetSearchTermCache();
190 187
191 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); 188 for (const base::string16& search_string : search_strings) {
189 // The search string we receive may contain escaped characters. For reducing
190 // the index we need individual, lower-cased words, ignoring escapings. For
191 // the final filtering we need whitespace separated substrings possibly
192 // containing escaped characters.
193 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
194 base::string16 lower_unescaped_string = net::UnescapeURLComponent(
195 lower_raw_string,
196 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
197 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
192 198
193 // Trim the candidate pool if it is large. Note that we do not filter out 199 // Extract individual 'words' (as opposed to 'terms'; see below) from the
194 // items that do not contain the search terms as proper substrings -- doing 200 // search string. When the user types "colspec=ID%20Mstone Release" we get
195 // so is the performance-costly operation we are trying to avoid in order 201 // four 'words': "colspec", "id", "mstone" and "release".
196 // to maintain omnibox responsiveness. 202 String16Vector lower_words(
197 const size_t kItemsToScoreLimit = 500; 203 String16VectorFromString16(lower_unescaped_string, false, nullptr));
198 pre_filter_item_count_ = history_id_set.size(); 204 if (lower_words.empty())
199 // If we trim the results set we do not want to cache the results for next 205 continue;
200 // time as the user's ultimately desired result could easily be eliminated 206 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
201 // in this early rough filter. 207 // Trim the candidate pool if it is large. Note that we do not filter out
202 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); 208 // items that do not contain the search terms as proper substrings --
203 if (was_trimmed) { 209 // doing so is the performance-costly operation we are trying to avoid in
204 HistoryIDVector history_ids; 210 // order to maintain omnibox responsiveness.
205 std::copy(history_id_set.begin(), history_id_set.end(), 211 const size_t kItemsToScoreLimit = 500;
206 std::back_inserter(history_ids)); 212 pre_filter_item_count_ += history_id_set.size();
207 // Trim down the set by sorting by typed-count, visit-count, and last 213 // If we trim the results set we do not want to cache the results for next
208 // visit. 214 // time as the user's ultimately desired result could easily be eliminated
209 HistoryItemFactorGreater 215 // in this early rough filter.
210 item_factor_functor(history_info_map_); 216 if (pre_filter_item_count_ > kItemsToScoreLimit) {
211 std::partial_sort(history_ids.begin(), 217 HistoryIDVector history_ids;
212 history_ids.begin() + kItemsToScoreLimit, 218 std::copy(history_id_set.begin(), history_id_set.end(),
213 history_ids.end(), 219 std::back_inserter(history_ids));
214 item_factor_functor); 220 // Trim down the set by sorting by typed-count, visit-count, and last
215 history_id_set.clear(); 221 // visit.
216 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, 222 HistoryItemFactorGreater item_factor_functor(history_info_map_);
217 std::inserter(history_id_set, history_id_set.end())); 223 std::partial_sort(history_ids.begin(),
218 post_filter_item_count_ = history_id_set.size(); 224 history_ids.begin() + kItemsToScoreLimit,
219 } 225 history_ids.end(), item_factor_functor);
226 history_id_set.clear();
227 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
228 std::inserter(history_id_set, history_id_set.end()));
229 post_filter_item_count_ = post_filter_item_count_ + history_id_set.size();
230 }
231 // Pass over all of the candidates filtering out any without a proper
232 // substring match, inserting those which pass in order by score. Note that
233 // in this step we are using the raw search string complete with escaped
234 // URL elements. When the user has specifically typed something akin to
235 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
236 // specific substring appears in the URL or page title.
220 237
221 // Pass over all of the candidates filtering out any without a proper 238 // We call these 'terms' (as opposed to 'words'; see above) as in this case
222 // substring match, inserting those which pass in order by score. Note that 239 // we only want to break up the search string on 'true' whitespace rather
223 // in this step we are using the raw search string complete with escaped 240 // than escaped whitespace. When the user types "colspec=ID%20Mstone
224 // URL elements. When the user has specifically typed something akin to 241 // Release" we get two 'terms': "colspec=id%20mstone" and "release".
225 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that 242 String16Vector lower_raw_terms =
226 // specific substring appears in the URL or page title. 243 base::SplitString(lower_raw_string, base::kWhitespaceUTF16,
244 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
227 245
228 // We call these 'terms' (as opposed to 'words'; see above) as in this case
229 // we only want to break up the search string on 'true' whitespace rather than
230 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
231 // get two 'terms': "colspec=id%20mstone" and "release".
232 String16Vector lower_raw_terms = base::SplitString(
233 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,
234 base::SPLIT_WANT_NONEMPTY);
235
236 if (lower_raw_terms.empty()) {
237 // Don't score matches when there are no terms to score against. (It's 246 // Don't score matches when there are no terms to score against. (It's
238 // possible that the word break iterater that extracts words to search 247 // possible that the word break iterater that extracts words to search
239 // for in the database allows some whitespace "words" whereas SplitString 248 // for in the database allows some whitespace "words" whereas SplitString
240 // excludes a long list of whitespace.) One could write a scoring 249 // excludes a long list of whitespace.) One could write a scoring function
241 // function that gives a reasonable order to matches when there 250 // that gives a reasonable order to matches when there are no terms (i.e.,
242 // are no terms (i.e., all the words are some form of whitespace), 251 // all the words are some form of whitespace), but this is such a rare edge
243 // but this is such a rare edge case that it's not worth the time. 252 // case that it's not worth the time.
244 return scored_items; 253 if (lower_raw_terms.empty())
245 } 254 continue;
246 255 ScoredHistoryMatches temp_scored_items =
247 scored_items = 256 std::for_each(history_id_set.begin(), history_id_set.end(),
248 std::for_each( 257 AddHistoryMatch(bookmark_model, template_url_service,
249 history_id_set.begin(), history_id_set.end(), 258 *this, lower_raw_string, lower_raw_terms,
250 AddHistoryMatch(bookmark_model, template_url_service, *this, 259 base::Time::Now()))
251 lower_raw_string, lower_raw_terms, base::Time::Now())) 260 .ScoredMatches();
252 .ScoredMatches(); 261 scored_items.insert(scored_items.end(), temp_scored_items.begin(),
253 262 temp_scored_items.end());
254 // If cursor position is set and useful (not at either end of the string),
255 // allow the search string to be broken at cursor position. We do this by
256 // pretending there's a space where the cursor is.
257 // This phenomena will be referred to as space ghosting.
258 // Going forward we perform a similar process for obtaining
259 // scored items as above. In fact all of the variable names are the same
260 // except that they are prepended with 'transformed_' reflecting that these
261 // variables are related to space ghosting.
262 if ((cursor_position != base::string16::npos) &&
263 (cursor_position < search_string.length()) && (cursor_position > 0)) {
264 // The original search_string broken at cursor position.
265 base::string16 transformed_search_string(search_string);
266 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
267
268 // The same as lower_raw_string and defined for the same reasons.
269 base::string16 transformed_lower_raw_string(
270 base::i18n::ToLower(transformed_search_string));
271 String16Vector transformed_lower_raw_terms =
272 base::SplitString(transformed_lower_raw_string, base::kWhitespaceUTF16,
273 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
274 if (!transformed_lower_raw_terms.empty()) {
275 base::string16 transormed_lower_unescaped_string =
276 net::UnescapeURLComponent(
277 transformed_lower_raw_string,
278 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
279 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
280 String16Vector transformed_lower_words(String16VectorFromString16(
281 transormed_lower_unescaped_string, false, nullptr));
282 HistoryIDSet transformed_history_id_set =
283 HistoryIDSetFromWords(transformed_lower_words);
284
285 if (history_id_set != transformed_history_id_set) {
286 pre_filter_item_count_ = history_id_set.size();
287 // Trim the candidate pool if it is large as above.
288 was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
289 if (was_trimmed) {
290 HistoryIDVector transformed_history_ids;
291 std::copy(transformed_history_id_set.begin(),
292 transformed_history_id_set.end(),
293 std::back_inserter(transformed_history_ids));
294 // Trim down the set by sorting by typed-count, visit-count, and last
295 // visit.
296 HistoryItemFactorGreater item_factor_functor(history_info_map_);
297 std::partial_sort(
298 transformed_history_ids.begin(),
299 transformed_history_ids.begin() + kItemsToScoreLimit,
300 transformed_history_ids.end(), item_factor_functor);
301 transformed_history_id_set.clear();
302 std::copy(transformed_history_ids.begin(),
303 transformed_history_ids.begin() + kItemsToScoreLimit,
304 std::inserter(transformed_history_id_set,
305 transformed_history_id_set.end()));
306 }
307 history_id_set.insert(transformed_history_id_set.begin(),
308 transformed_history_id_set.end());
309 ScoredHistoryMatches transformed_scored_items =
310 std::for_each(
311 history_id_set.begin(), history_id_set.end(),
312 AddHistoryMatch(bookmark_model, template_url_service, *this,
313 transformed_lower_raw_string,
314 transformed_lower_raw_terms, base::Time::Now()))
315 .ScoredMatches();
316 scored_items.insert(scored_items.end(),
317 transformed_scored_items.begin(),
318 transformed_scored_items.end());
319 }
320 }
321 } 263 }
322 // Select and sort only the top |max_matches| results. 264 // Select and sort only the top |max_matches| results.
323 if (scored_items.size() > max_matches) { 265 if (scored_items.size() > max_matches) {
324 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches, 266 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches,
325 scored_items.end(), 267 scored_items.end(),
326 ScoredHistoryMatch::MatchScoreGreater); 268 ScoredHistoryMatch::MatchScoreGreater);
327 scored_items.resize(max_matches); 269 scored_items.resize(max_matches);
328 } else { 270 } else {
329 std::sort(scored_items.begin(), scored_items.end(), 271 std::sort(scored_items.begin(), scored_items.end(),
330 ScoredHistoryMatch::MatchScoreGreater); 272 ScoredHistoryMatch::MatchScoreGreater);
331 } 273 }
332 post_scoring_item_count_ = scored_items.size(); 274 post_scoring_item_count_ = scored_items.size();
333 275 if (pre_filter_item_count_ > post_filter_item_count_) {
334 if (was_trimmed) {
335 search_term_cache_.clear(); // Invalidate the term cache. 276 search_term_cache_.clear(); // Invalidate the term cache.
336 } else { 277 } else {
337 // Remove any stale SearchTermCacheItems. 278 // Remove any stale SearchTermCacheItems.
338 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); 279 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
339 cache_iter != search_term_cache_.end(); ) { 280 cache_iter != search_term_cache_.end(); ) {
340 if (!cache_iter->second.used_) 281 if (!cache_iter->second.used_)
341 search_term_cache_.erase(cache_iter++); 282 search_term_cache_.erase(cache_iter++);
342 else 283 else
343 ++cache_iter; 284 ++cache_iter;
344 } 285 }
(...skipping 1068 matching lines...) Expand 10 before | Expand all | Expand 10 after
1413 // First cut: typed count, visit count, recency. 1354 // First cut: typed count, visit count, recency.
1414 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks 1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1415 // recently visited (within the last 12/24 hours) as highly important. Get 1356 // recently visited (within the last 12/24 hours) as highly important. Get
1416 // input from mpearson. 1357 // input from mpearson.
1417 if (r1.typed_count() != r2.typed_count()) 1358 if (r1.typed_count() != r2.typed_count())
1418 return (r1.typed_count() > r2.typed_count()); 1359 return (r1.typed_count() > r2.typed_count());
1419 if (r1.visit_count() != r2.visit_count()) 1360 if (r1.visit_count() != r2.visit_count())
1420 return (r1.visit_count() > r2.visit_count()); 1361 return (r1.visit_count() > r2.visit_count());
1421 return (r1.last_visit() > r2.last_visit()); 1362 return (r1.last_visit() > r2.last_visit());
1422 } 1363 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698