components/omnibox/browser/url_index_private_data.cc - Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox.

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Correct code causing tests in HistoryQuickProviderTest to fail. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/url_index_private_data.h"	5 #include "components/omnibox/browser/url_index_private_data.h"

6	6

7 #include <stdint.h>	7 #include <stdint.h>

8	8

9 #include <functional>	9 #include <functional>

10 #include <iterator>	10 #include <iterator>

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
74 WordStartsMapEntry;	74 WordStartsMapEntry;

75	75

76 // Algorithm Functions ---------------------------------------------------------	76 // Algorithm Functions ---------------------------------------------------------

77	77

78 // Comparison function for sorting search terms by descending length.	78 // Comparison function for sorting search terms by descending length.

79 bool LengthGreater(const base::string16& string_a,	79 bool LengthGreater(const base::string16& string_a,

80 const base::string16& string_b) {	80 const base::string16& string_b) {

81 return string_a.length() > string_b.length();	81 return string_a.length() > string_b.length();

82 }	82 }

83	83

84
Mark P 2016/09/13 03:58:19 style comment: if you're trying to change the styl style comment: if you're trying to change the style by using only one blank line before section breaks in this file, you should do so consistently. Instead, used one here but two in the section (// URLIndexPrivateData) immediately below. Lavar Askew 2016/09/13 18:14:30 Done. Show quoted text On 2016/09/13 03:58:19, Mark P wrote: > style comment: if you're trying to change the style by using only one blank line > before section breaks in this file, you should do so consistently. Instead, > used one here but two in the section (// URLIndexPrivateData) immediately below. Done.
85 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------	84 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------

86	85

87 // HistoryDBTask used to update the recent visit data for a particular	86 // HistoryDBTask used to update the recent visit data for a particular

88 // row from the history database.	87 // row from the history database.

89 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {	88 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {

90 public:	89 public:

91 explicit UpdateRecentVisitsFromHistoryDBTask(	90 explicit UpdateRecentVisitsFromHistoryDBTask(

92 URLIndexPrivateData* private_data,	91 URLIndexPrivateData* private_data,

93 history::URLID url_id);	92 history::URLID url_id);

94	93

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
151 post_filter_item_count_(0),	150 post_filter_item_count_(0),

152 post_scoring_item_count_(0) {	151 post_scoring_item_count_(0) {

153 }	152 }

154	153

155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(	154 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

156 base::string16 search_string,	155 base::string16 search_string,

157 size_t cursor_position,	156 size_t cursor_position,

158 size_t max_matches,	157 size_t max_matches,

159 bookmarks::BookmarkModel* bookmark_model,	158 bookmarks::BookmarkModel* bookmark_model,

160 TemplateURLService* template_url_service) {	159 TemplateURLService* template_url_service) {

161 // If cursor position is set and useful (not at either end of the

162 // string), allow the search string to be broken at cursor position.

163 // We do this by pretending there's a space where the cursor is.

164 if ((cursor_position != base::string16::npos) &&

165 (cursor_position < search_string.length()) &&

166 (cursor_position > 0)) {

167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

168 }

169 pre_filter_item_count_ = 0;	160 pre_filter_item_count_ = 0;

170 post_filter_item_count_ = 0;	161 post_filter_item_count_ = 0;

171 post_scoring_item_count_ = 0;	162 post_scoring_item_count_ = 0;

172 // The search string we receive may contain escaped characters. For reducing	163 // The search string we receive may contain escaped characters. For reducing

173 // the index we need individual, lower-cased words, ignoring escapings. For	164 // the index we need individual, lower-cased words, ignoring escapings. For

174 // the final filtering we need whitespace separated substrings possibly	165 // the final filtering we need whitespace separated substrings possibly

175 // containing escaped characters.	166 // containing escaped characters.

176 base::string16 lower_raw_string(base::i18n::ToLower(search_string));	167 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

177 base::string16 lower_unescaped_string =	168 base::string16 lower_unescaped_string =

178 net::UnescapeURLComponent(lower_raw_string,	169 net::UnescapeURLComponent(lower_raw_string,

179 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|	170 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);	171 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

	172
	Mark P 2016/09/13 03:58:19 The lines above naturally go with the lines below. The lines above naturally go with the lines below. I wouldn't add a blank line here. Lavar Askew 2016/09/13 18:14:30 Done. Show quoted text On 2016/09/13 03:58:19, Mark P wrote: > The lines above naturally go with the lines below. I wouldn't add a blank line > here. Done.
181 // Extract individual 'words' (as opposed to 'terms'; see below) from the	173 // Extract individual 'words' (as opposed to 'terms'; see below) from the

182 // search string. When the user types "colspec=ID%20Mstone Release" we get	174 // search string. When the user types "colspec=ID%20Mstone Release" we get

183 // four 'words': "colspec", "id", "mstone" and "release".	175 // four 'words': "colspec", "id", "mstone" and "release".

184 String16Vector lower_words(	176 String16Vector lower_words(

185 String16VectorFromString16(lower_unescaped_string, false, nullptr));	177 String16VectorFromString16(lower_unescaped_string, false, nullptr));

186 ScoredHistoryMatches scored_items;	178 ScoredHistoryMatches scored_items;

187	179

188 // Do nothing if we have indexed no words (probably because we've not been	180 // Do nothing if we have indexed no words (probably because we've not been

189 // initialized yet) or the search string has no words.	181 // initialized yet) or the search string has no words.

190 if (word_list_.empty() \|\| lower_words.empty()) {	182 if (word_list_.empty() \|\| lower_words.empty()) {

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that	225 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

234 // specific substring appears in the URL or page title.	226 // specific substring appears in the URL or page title.

235	227

236 // We call these 'terms' (as opposed to 'words'; see above) as in this case	228 // We call these 'terms' (as opposed to 'words'; see above) as in this case

237 // we only want to break up the search string on 'true' whitespace rather than	229 // we only want to break up the search string on 'true' whitespace rather than

238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we	230 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we

239 // get two 'terms': "colspec=id%20mstone" and "release".	231 // get two 'terms': "colspec=id%20mstone" and "release".

240 String16Vector lower_raw_terms = base::SplitString(	232 String16Vector lower_raw_terms = base::SplitString(

241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,	233 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,

242 base::SPLIT_WANT_NONEMPTY);	234 base::SPLIT_WANT_NONEMPTY);

	235
	Mark P 2016/09/13 03:58:19 ditto ditto Lavar Askew 2016/09/13 18:14:30 Done. Show quoted text On 2016/09/13 03:58:19, Mark P wrote: > ditto Done.
243 if (lower_raw_terms.empty()) {	236 if (lower_raw_terms.empty()) {

244 // Don't score matches when there are no terms to score against. (It's	237 // Don't score matches when there are no terms to score against. (It's

245 // possible that the word break iterater that extracts words to search	238 // possible that the word break iterater that extracts words to search

246 // for in the database allows some whitespace "words" whereas SplitString	239 // for in the database allows some whitespace "words" whereas SplitString

247 // excludes a long list of whitespace.) One could write a scoring	240 // excludes a long list of whitespace.) One could write a scoring

248 // function that gives a reasonable order to matches when there	241 // function that gives a reasonable order to matches when there

249 // are no terms (i.e., all the words are some form of whitespace),	242 // are no terms (i.e., all the words are some form of whitespace),

250 // but this is such a rare edge case that it's not worth the time.	243 // but this is such a rare edge case that it's not worth the time.

251 return scored_items;	244 return scored_items;

252 }	245 }

	246

253 scored_items =	247 scored_items =

254 std::for_each(	248 std::for_each(

255 history_id_set.begin(), history_id_set.end(),	249 history_id_set.begin(), history_id_set.end(),

256 AddHistoryMatch(bookmark_model, template_url_service, *this,	250 AddHistoryMatch(bookmark_model, template_url_service, *this,

257 lower_raw_string, lower_raw_terms,	251 lower_raw_string, lower_raw_terms, base::Time::Now()))

258 base::Time::Now())).ScoredMatches();	252 .ScoredMatches();

259	253

	254 // If cursor position is set and useful (not at either end of the string),

	255 // allow the search string to be broken at cursor position. We do this by

	256 // pretending there's a space where the cursor is.
	Mark P 2016/09/13 03:58:19 Please make this comment normal paragraph style, i Please make this comment normal paragraph style, i.e., wrap closer to 80 characters instead of its current ragged state. Lavar Askew 2016/09/13 18:14:30 Done. Show quoted text On 2016/09/13 03:58:19, Mark P wrote: > Please make this comment normal paragraph style, i.e., wrap closer to 80 > characters instead of its current ragged state. Done.
	257 // This phenomena will be referred to as space ghosting.
	Mark P 2016/09/13 03:58:19 Don't introduce a term ("space ghosting") only to Don't introduce a term ("space ghosting") only to use it once. I think you can remove this sentence and simply revise the last sentence (the single place you use the term). Lavar Askew 2016/09/13 18:14:30 Done. Show quoted text On 2016/09/13 03:58:19, Mark P wrote: > Don't introduce a term ("space ghosting") only to use it once. I think you can > remove this sentence and simply revise the last sentence (the single place you > use the term). Done.
	258 // Going forward we perform a similar process for obtaining

	259 // scored items as above. In fact all of the variable names are the same

	260 // except that they are prepended with 'transformed_' reflecting that these

	261 // variables are related to space ghosting.

	262 if ((cursor_position != base::string16::npos) &&

	263 (cursor_position < search_string.length()) && (cursor_position > 0)) {

	264 // The original search_string broken at cursor position.

	265 base::string16 transformed_search_string(search_string);

	266 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

	267

	268 // The same as lower_raw_string and defined for the same reasons.

	269 base::string16 transformed_lower_raw_string(

	270 base::i18n::ToLower(transformed_search_string));

	271 String16Vector transformed_lower_raw_terms =

	272 base::SplitString(transformed_lower_raw_string, base::kWhitespaceUTF16,

	273 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

	274 if (!transformed_lower_raw_terms.empty()) {

	275 base::string16 transormed_lower_unescaped_string =

	276 net::UnescapeURLComponent(

	277 transformed_lower_raw_string,

	278 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

	279 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

	280 String16Vector transformed_lower_words(String16VectorFromString16(

	281 transormed_lower_unescaped_string, false, nullptr));

	282 HistoryIDSet transformed_history_id_set =

	283 HistoryIDSetFromWords(transformed_lower_words);

	284

	285 if (history_id_set != transformed_history_id_set) {

	286 pre_filter_item_count_ = history_id_set.size();

	287 // Trim the candidate pool if it is large as above.

	288 was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);

	289 if (was_trimmed) {

	290 HistoryIDVector transformed_history_ids;

	291 std::copy(transformed_history_id_set.begin(),

	292 transformed_history_id_set.end(),

	293 std::back_inserter(transformed_history_ids));

	294 // Trim down the set by sorting by typed-count, visit-count, and last

	295 // visit.

	296 HistoryItemFactorGreater item_factor_functor(history_info_map_);

	297 std::partial_sort(

	298 transformed_history_ids.begin(),

	299 transformed_history_ids.begin() + kItemsToScoreLimit,

	300 transformed_history_ids.end(), item_factor_functor);

	301 transformed_history_id_set.clear();

	302 std::copy(transformed_history_ids.begin(),

	303 transformed_history_ids.begin() + kItemsToScoreLimit,

	304 std::inserter(transformed_history_id_set,

	305 transformed_history_id_set.end()));

	306 }

	307 history_id_set.insert(transformed_history_id_set.begin(),

	308 transformed_history_id_set.end());

	309 ScoredHistoryMatches transformed_scored_items =

	310 std::for_each(

	311 history_id_set.begin(), history_id_set.end(),

	312 AddHistoryMatch(bookmark_model, template_url_service, *this,

	313 transformed_lower_raw_string,

	314 transformed_lower_raw_terms, base::Time::Now()))

	315 .ScoredMatches();

	316 scored_items.insert(scored_items.end(),

	317 transformed_scored_items.begin(),

	318 transformed_scored_items.end());

	319 }

	320 }

	321 }

260 // Select and sort only the top \|max_matches\| results.	322 // Select and sort only the top \|max_matches\| results.

261 if (scored_items.size() > max_matches) {	323 if (scored_items.size() > max_matches) {

262 std::partial_sort(scored_items.begin(),	324 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches,

263 scored_items.begin() +

264 max_matches,

265 scored_items.end(),	325 scored_items.end(),

266 ScoredHistoryMatch::MatchScoreGreater);	326 ScoredHistoryMatch::MatchScoreGreater);

267 scored_items.resize(max_matches);	327 scored_items.resize(max_matches);

268 } else {	328 } else {

269 std::sort(scored_items.begin(), scored_items.end(),	329 std::sort(scored_items.begin(), scored_items.end(),

270 ScoredHistoryMatch::MatchScoreGreater);	330 ScoredHistoryMatch::MatchScoreGreater);

271 }	331 }

272 post_scoring_item_count_ = scored_items.size();	332 post_scoring_item_count_ = scored_items.size();

273	333

274 if (was_trimmed) {	334 if (was_trimmed) {

275 search_term_cache_.clear(); // Invalidate the term cache.	335 search_term_cache_.clear(); // Invalidate the term cache.

276 } else {	336 } else {

277 // Remove any stale SearchTermCacheItems.	337 // Remove any stale SearchTermCacheItems.

(...skipping 966 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1244 return true;	1304 return true;

1245 }	1305 }

1246	1306

1247 // static	1307 // static

1248 bool URLIndexPrivateData::URLSchemeIsWhitelisted(	1308 bool URLIndexPrivateData::URLSchemeIsWhitelisted(

1249 const GURL& gurl,	1309 const GURL& gurl,

1250 const std::set<std::string>& whitelist) {	1310 const std::set<std::string>& whitelist) {

1251 return whitelist.find(gurl.scheme()) != whitelist.end();	1311 return whitelist.find(gurl.scheme()) != whitelist.end();

1252 }	1312 }

1253	1313

1254
Mark P 2016/09/13 03:58:19 ditto style comment about single versus double bla ditto style comment about single versus double blank lines. Consistency.
1255 // SearchTermCacheItem ---------------------------------------------------------	1314 // SearchTermCacheItem ---------------------------------------------------------

1256	1315

1257 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(	1316 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(

1258 const WordIDSet& word_id_set,	1317 const WordIDSet& word_id_set,

1259 const HistoryIDSet& history_id_set)	1318 const HistoryIDSet& history_id_set)

1260 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {	1319 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {

1261 }	1320 }

1262	1321

1263 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {	1322 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {

1264 }	1323 }

(...skipping 89 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1354 // First cut: typed count, visit count, recency.	1413 // First cut: typed count, visit count, recency.

1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks	1414 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

1356 // recently visited (within the last 12/24 hours) as highly important. Get	1415 // recently visited (within the last 12/24 hours) as highly important. Get

1357 // input from mpearson.	1416 // input from mpearson.

1358 if (r1.typed_count() != r2.typed_count())	1417 if (r1.typed_count() != r2.typed_count())

1359 return (r1.typed_count() > r2.typed_count());	1418 return (r1.typed_count() > r2.typed_count());

1360 if (r1.visit_count() != r2.visit_count())	1419 if (r1.visit_count() != r2.visit_count())

1361 return (r1.visit_count() > r2.visit_count());	1420 return (r1.visit_count() > r2.visit_count());

1362 return (r1.last_visit() > r2.last_visit());	1421 return (r1.last_visit() > r2.last_visit());

1363 }	1422 }

OLD	NEW

« no previous file with comments | « components/omnibox/browser/url_index_private_data.h ('k') | no next file » | no next file with comments »