Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(169)

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Correct code causing tests in HistoryQuickProviderTest to fail. Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « components/omnibox/browser/url_index_private_data.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/omnibox/browser/url_index_private_data.h" 5 #include "components/omnibox/browser/url_index_private_data.h"
6 6
7 #include <stdint.h> 7 #include <stdint.h>
8 8
9 #include <functional> 9 #include <functional>
10 #include <iterator> 10 #include <iterator>
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
74 WordStartsMapEntry; 74 WordStartsMapEntry;
75 75
76 // Algorithm Functions --------------------------------------------------------- 76 // Algorithm Functions ---------------------------------------------------------
77 77
78 // Comparison function for sorting search terms by descending length. 78 // Comparison function for sorting search terms by descending length.
79 bool LengthGreater(const base::string16& string_a, 79 bool LengthGreater(const base::string16& string_a,
80 const base::string16& string_b) { 80 const base::string16& string_b) {
81 return string_a.length() > string_b.length(); 81 return string_a.length() > string_b.length();
82 } 82 }
83 83
84
Mark P 2016/09/13 03:58:19 style comment: if you're trying to change the styl
Lavar Askew 2016/09/13 18:14:30 Done.
85 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- 84 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
86 85
87 // HistoryDBTask used to update the recent visit data for a particular 86 // HistoryDBTask used to update the recent visit data for a particular
88 // row from the history database. 87 // row from the history database.
89 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask { 88 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {
90 public: 89 public:
91 explicit UpdateRecentVisitsFromHistoryDBTask( 90 explicit UpdateRecentVisitsFromHistoryDBTask(
92 URLIndexPrivateData* private_data, 91 URLIndexPrivateData* private_data,
93 history::URLID url_id); 92 history::URLID url_id);
94 93
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
151 post_filter_item_count_(0), 150 post_filter_item_count_(0),
152 post_scoring_item_count_(0) { 151 post_scoring_item_count_(0) {
153 } 152 }
154 153
155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( 154 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
156 base::string16 search_string, 155 base::string16 search_string,
157 size_t cursor_position, 156 size_t cursor_position,
158 size_t max_matches, 157 size_t max_matches,
159 bookmarks::BookmarkModel* bookmark_model, 158 bookmarks::BookmarkModel* bookmark_model,
160 TemplateURLService* template_url_service) { 159 TemplateURLService* template_url_service) {
161 // If cursor position is set and useful (not at either end of the
162 // string), allow the search string to be broken at cursor position.
163 // We do this by pretending there's a space where the cursor is.
164 if ((cursor_position != base::string16::npos) &&
165 (cursor_position < search_string.length()) &&
166 (cursor_position > 0)) {
167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
168 }
169 pre_filter_item_count_ = 0; 160 pre_filter_item_count_ = 0;
170 post_filter_item_count_ = 0; 161 post_filter_item_count_ = 0;
171 post_scoring_item_count_ = 0; 162 post_scoring_item_count_ = 0;
172 // The search string we receive may contain escaped characters. For reducing 163 // The search string we receive may contain escaped characters. For reducing
173 // the index we need individual, lower-cased words, ignoring escapings. For 164 // the index we need individual, lower-cased words, ignoring escapings. For
174 // the final filtering we need whitespace separated substrings possibly 165 // the final filtering we need whitespace separated substrings possibly
175 // containing escaped characters. 166 // containing escaped characters.
176 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); 167 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
177 base::string16 lower_unescaped_string = 168 base::string16 lower_unescaped_string =
178 net::UnescapeURLComponent(lower_raw_string, 169 net::UnescapeURLComponent(lower_raw_string,
179 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | 170 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); 171 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
172
Mark P 2016/09/13 03:58:19 The lines above naturally go with the lines below.
Lavar Askew 2016/09/13 18:14:30 Done.
181 // Extract individual 'words' (as opposed to 'terms'; see below) from the 173 // Extract individual 'words' (as opposed to 'terms'; see below) from the
182 // search string. When the user types "colspec=ID%20Mstone Release" we get 174 // search string. When the user types "colspec=ID%20Mstone Release" we get
183 // four 'words': "colspec", "id", "mstone" and "release". 175 // four 'words': "colspec", "id", "mstone" and "release".
184 String16Vector lower_words( 176 String16Vector lower_words(
185 String16VectorFromString16(lower_unescaped_string, false, nullptr)); 177 String16VectorFromString16(lower_unescaped_string, false, nullptr));
186 ScoredHistoryMatches scored_items; 178 ScoredHistoryMatches scored_items;
187 179
188 // Do nothing if we have indexed no words (probably because we've not been 180 // Do nothing if we have indexed no words (probably because we've not been
189 // initialized yet) or the search string has no words. 181 // initialized yet) or the search string has no words.
190 if (word_list_.empty() || lower_words.empty()) { 182 if (word_list_.empty() || lower_words.empty()) {
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that 225 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
234 // specific substring appears in the URL or page title. 226 // specific substring appears in the URL or page title.
235 227
236 // We call these 'terms' (as opposed to 'words'; see above) as in this case 228 // We call these 'terms' (as opposed to 'words'; see above) as in this case
237 // we only want to break up the search string on 'true' whitespace rather than 229 // we only want to break up the search string on 'true' whitespace rather than
238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we 230 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
239 // get two 'terms': "colspec=id%20mstone" and "release". 231 // get two 'terms': "colspec=id%20mstone" and "release".
240 String16Vector lower_raw_terms = base::SplitString( 232 String16Vector lower_raw_terms = base::SplitString(
241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, 233 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,
242 base::SPLIT_WANT_NONEMPTY); 234 base::SPLIT_WANT_NONEMPTY);
235
Mark P 2016/09/13 03:58:19 ditto
Lavar Askew 2016/09/13 18:14:30 Done.
243 if (lower_raw_terms.empty()) { 236 if (lower_raw_terms.empty()) {
244 // Don't score matches when there are no terms to score against. (It's 237 // Don't score matches when there are no terms to score against. (It's
245 // possible that the word break iterater that extracts words to search 238 // possible that the word break iterater that extracts words to search
246 // for in the database allows some whitespace "words" whereas SplitString 239 // for in the database allows some whitespace "words" whereas SplitString
247 // excludes a long list of whitespace.) One could write a scoring 240 // excludes a long list of whitespace.) One could write a scoring
248 // function that gives a reasonable order to matches when there 241 // function that gives a reasonable order to matches when there
249 // are no terms (i.e., all the words are some form of whitespace), 242 // are no terms (i.e., all the words are some form of whitespace),
250 // but this is such a rare edge case that it's not worth the time. 243 // but this is such a rare edge case that it's not worth the time.
251 return scored_items; 244 return scored_items;
252 } 245 }
246
253 scored_items = 247 scored_items =
254 std::for_each( 248 std::for_each(
255 history_id_set.begin(), history_id_set.end(), 249 history_id_set.begin(), history_id_set.end(),
256 AddHistoryMatch(bookmark_model, template_url_service, *this, 250 AddHistoryMatch(bookmark_model, template_url_service, *this,
257 lower_raw_string, lower_raw_terms, 251 lower_raw_string, lower_raw_terms, base::Time::Now()))
258 base::Time::Now())).ScoredMatches(); 252 .ScoredMatches();
259 253
254 // If cursor position is set and useful (not at either end of the string),
255 // allow the search string to be broken at cursor position. We do this by
256 // pretending there's a space where the cursor is.
Mark P 2016/09/13 03:58:19 Please make this comment normal paragraph style, i
Lavar Askew 2016/09/13 18:14:30 Done.
257 // This phenomena will be referred to as space ghosting.
Mark P 2016/09/13 03:58:19 Don't introduce a term ("space ghosting") only to
Lavar Askew 2016/09/13 18:14:30 Done.
258 // Going forward we perform a similar process for obtaining
259 // scored items as above. In fact all of the variable names are the same
260 // except that they are prepended with 'transformed_' reflecting that these
261 // variables are related to space ghosting.
262 if ((cursor_position != base::string16::npos) &&
263 (cursor_position < search_string.length()) && (cursor_position > 0)) {
264 // The original search_string broken at cursor position.
265 base::string16 transformed_search_string(search_string);
266 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
267
268 // The same as lower_raw_string and defined for the same reasons.
269 base::string16 transformed_lower_raw_string(
270 base::i18n::ToLower(transformed_search_string));
271 String16Vector transformed_lower_raw_terms =
272 base::SplitString(transformed_lower_raw_string, base::kWhitespaceUTF16,
273 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
274 if (!transformed_lower_raw_terms.empty()) {
275 base::string16 transormed_lower_unescaped_string =
276 net::UnescapeURLComponent(
277 transformed_lower_raw_string,
278 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
279 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
280 String16Vector transformed_lower_words(String16VectorFromString16(
281 transormed_lower_unescaped_string, false, nullptr));
282 HistoryIDSet transformed_history_id_set =
283 HistoryIDSetFromWords(transformed_lower_words);
284
285 if (history_id_set != transformed_history_id_set) {
286 pre_filter_item_count_ = history_id_set.size();
287 // Trim the candidate pool if it is large as above.
288 was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
289 if (was_trimmed) {
290 HistoryIDVector transformed_history_ids;
291 std::copy(transformed_history_id_set.begin(),
292 transformed_history_id_set.end(),
293 std::back_inserter(transformed_history_ids));
294 // Trim down the set by sorting by typed-count, visit-count, and last
295 // visit.
296 HistoryItemFactorGreater item_factor_functor(history_info_map_);
297 std::partial_sort(
298 transformed_history_ids.begin(),
299 transformed_history_ids.begin() + kItemsToScoreLimit,
300 transformed_history_ids.end(), item_factor_functor);
301 transformed_history_id_set.clear();
302 std::copy(transformed_history_ids.begin(),
303 transformed_history_ids.begin() + kItemsToScoreLimit,
304 std::inserter(transformed_history_id_set,
305 transformed_history_id_set.end()));
306 }
307 history_id_set.insert(transformed_history_id_set.begin(),
308 transformed_history_id_set.end());
309 ScoredHistoryMatches transformed_scored_items =
310 std::for_each(
311 history_id_set.begin(), history_id_set.end(),
312 AddHistoryMatch(bookmark_model, template_url_service, *this,
313 transformed_lower_raw_string,
314 transformed_lower_raw_terms, base::Time::Now()))
315 .ScoredMatches();
316 scored_items.insert(scored_items.end(),
317 transformed_scored_items.begin(),
318 transformed_scored_items.end());
319 }
320 }
321 }
260 // Select and sort only the top |max_matches| results. 322 // Select and sort only the top |max_matches| results.
261 if (scored_items.size() > max_matches) { 323 if (scored_items.size() > max_matches) {
262 std::partial_sort(scored_items.begin(), 324 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches,
263 scored_items.begin() +
264 max_matches,
265 scored_items.end(), 325 scored_items.end(),
266 ScoredHistoryMatch::MatchScoreGreater); 326 ScoredHistoryMatch::MatchScoreGreater);
267 scored_items.resize(max_matches); 327 scored_items.resize(max_matches);
268 } else { 328 } else {
269 std::sort(scored_items.begin(), scored_items.end(), 329 std::sort(scored_items.begin(), scored_items.end(),
270 ScoredHistoryMatch::MatchScoreGreater); 330 ScoredHistoryMatch::MatchScoreGreater);
271 } 331 }
272 post_scoring_item_count_ = scored_items.size(); 332 post_scoring_item_count_ = scored_items.size();
273 333
274 if (was_trimmed) { 334 if (was_trimmed) {
275 search_term_cache_.clear(); // Invalidate the term cache. 335 search_term_cache_.clear(); // Invalidate the term cache.
276 } else { 336 } else {
277 // Remove any stale SearchTermCacheItems. 337 // Remove any stale SearchTermCacheItems.
(...skipping 966 matching lines...) Expand 10 before | Expand all | Expand 10 after
1244 return true; 1304 return true;
1245 } 1305 }
1246 1306
1247 // static 1307 // static
1248 bool URLIndexPrivateData::URLSchemeIsWhitelisted( 1308 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1249 const GURL& gurl, 1309 const GURL& gurl,
1250 const std::set<std::string>& whitelist) { 1310 const std::set<std::string>& whitelist) {
1251 return whitelist.find(gurl.scheme()) != whitelist.end(); 1311 return whitelist.find(gurl.scheme()) != whitelist.end();
1252 } 1312 }
1253 1313
1254
Mark P 2016/09/13 03:58:19 ditto style comment about single versus double bla
1255 // SearchTermCacheItem --------------------------------------------------------- 1314 // SearchTermCacheItem ---------------------------------------------------------
1256 1315
1257 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( 1316 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1258 const WordIDSet& word_id_set, 1317 const WordIDSet& word_id_set,
1259 const HistoryIDSet& history_id_set) 1318 const HistoryIDSet& history_id_set)
1260 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) { 1319 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {
1261 } 1320 }
1262 1321
1263 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { 1322 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {
1264 } 1323 }
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after
1354 // First cut: typed count, visit count, recency. 1413 // First cut: typed count, visit count, recency.
1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks 1414 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1356 // recently visited (within the last 12/24 hours) as highly important. Get 1415 // recently visited (within the last 12/24 hours) as highly important. Get
1357 // input from mpearson. 1416 // input from mpearson.
1358 if (r1.typed_count() != r2.typed_count()) 1417 if (r1.typed_count() != r2.typed_count())
1359 return (r1.typed_count() > r2.typed_count()); 1418 return (r1.typed_count() > r2.typed_count());
1360 if (r1.visit_count() != r2.visit_count()) 1419 if (r1.visit_count() != r2.visit_count())
1361 return (r1.visit_count() > r2.visit_count()); 1420 return (r1.visit_count() > r2.visit_count());
1362 return (r1.last_visit() > r2.last_visit()); 1421 return (r1.last_visit() > r2.last_visit());
1363 } 1422 }
OLDNEW
« no previous file with comments | « components/omnibox/browser/url_index_private_data.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698