components/omnibox/browser/url_index_private_data.cc - Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox.

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Unit test added. Grammatical errors corrected. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/url_index_private_data.h"	5 #include "components/omnibox/browser/url_index_private_data.h"

6	6

7 #include <stdint.h>	7 #include <stdint.h>

8	8

9 #include <functional>	9 #include <functional>

10 #include <iterator>	10 #include <iterator>

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
74 WordStartsMapEntry;	74 WordStartsMapEntry;

75	75

76 // Algorithm Functions ---------------------------------------------------------	76 // Algorithm Functions ---------------------------------------------------------

77	77

78 // Comparison function for sorting search terms by descending length.	78 // Comparison function for sorting search terms by descending length.

79 bool LengthGreater(const base::string16& string_a,	79 bool LengthGreater(const base::string16& string_a,

80 const base::string16& string_b) {	80 const base::string16& string_b) {

81 return string_a.length() > string_b.length();	81 return string_a.length() > string_b.length();

82 }	82 }

83	83

	84 bool Less(const base::string16& string_a, const base::string16& string_b) {

	85 return (string_a.compare(string_b) <= 0);

	86 }

84	87

85 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------	88 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------

86	89

87 // HistoryDBTask used to update the recent visit data for a particular	90 // HistoryDBTask used to update the recent visit data for a particular

88 // row from the history database.	91 // row from the history database.

89 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {	92 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {

90 public:	93 public:

91 explicit UpdateRecentVisitsFromHistoryDBTask(	94 explicit UpdateRecentVisitsFromHistoryDBTask(

92 URLIndexPrivateData* private_data,	95 URLIndexPrivateData* private_data,

93 history::URLID url_id);	96 history::URLID url_id);

(...skipping 51 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
145 // URLIndexPrivateData ---------------------------------------------------------	148 // URLIndexPrivateData ---------------------------------------------------------

146	149

147 URLIndexPrivateData::URLIndexPrivateData()	150 URLIndexPrivateData::URLIndexPrivateData()

148 : restored_cache_version_(0),	151 : restored_cache_version_(0),

149 saved_cache_version_(kCurrentCacheFileVersion),	152 saved_cache_version_(kCurrentCacheFileVersion),

150 pre_filter_item_count_(0),	153 pre_filter_item_count_(0),

151 post_filter_item_count_(0),	154 post_filter_item_count_(0),

152 post_scoring_item_count_(0) {	155 post_scoring_item_count_(0) {

153 }	156 }

154	157

155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(	158 String16Vector URLIndexPrivateData::ExtractIndividualWordVector(

156 base::string16 search_string,	159 const base::string16& search_string) {

157 size_t cursor_position,

158 size_t max_matches,

159 bookmarks::BookmarkModel* bookmark_model,

160 TemplateURLService* template_url_service) {

161 // If cursor position is set and useful (not at either end of the

162 // string), allow the search string to be broken at cursor position.

163 // We do this by pretending there's a space where the cursor is.

164 if ((cursor_position != base::string16::npos) &&

165 (cursor_position < search_string.length()) &&

166 (cursor_position > 0)) {

167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

168 }

169 pre_filter_item_count_ = 0;

170 post_filter_item_count_ = 0;

171 post_scoring_item_count_ = 0;

172 // The search string we receive may contain escaped characters. For reducing	160 // The search string we receive may contain escaped characters. For reducing

173 // the index we need individual, lower-cased words, ignoring escapings. For	161 // the index we need individual, lower-cased words, ignoring escapings. For

174 // the final filtering we need whitespace separated substrings possibly	162 // the final filtering we need whitespace separated substrings possibly

175 // containing escaped characters.	163 // containing escaped characters.

176 base::string16 lower_raw_string(base::i18n::ToLower(search_string));	164 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

177 base::string16 lower_unescaped_string =	165 base::string16 lower_unescaped_string = net::UnescapeURLComponent(

178 net::UnescapeURLComponent(lower_raw_string,	166 lower_raw_string,

179 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|	167 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);	168 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

181 // Extract individual 'words' (as opposed to 'terms'; see below) from the	169 // Extract individual 'words' (as opposed to 'terms'; see below) from the

182 // search string. When the user types "colspec=ID%20Mstone Release" we get	170 // search string. When the user types "colspec=ID%20Mstone Release" we get

183 // four 'words': "colspec", "id", "mstone" and "release".	171 // four 'words': "colspec", "id", "mstone" and "release".

184 String16Vector lower_words(	172 return String16VectorFromString16(lower_unescaped_string, false, nullptr);

185 String16VectorFromString16(lower_unescaped_string, false, nullptr));	173 }

186 ScoredHistoryMatches scored_items;

187	174

	175 HistoryIDSet URLIndexPrivateData::GetHistoryIDSet(

	176 const String16Vector& lower_words) {

188 // Do nothing if we have indexed no words (probably because we've not been	177 // Do nothing if we have indexed no words (probably because we've not been

189 // initialized yet) or the search string has no words.	178 // initialized yet) or the search string has no words.

190 if (word_list_.empty() \|\| lower_words.empty()) {	179 if (word_list_.empty() \|\| lower_words.empty()) {

191 search_term_cache_.clear(); // Invalidate the term cache.	180 search_term_cache_.clear(); // Invalidate the term cache.

192 return scored_items;	181 return HistoryIDSet();

193 }	182 }

194	183

195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep	184 return HistoryIDSetFromWords(lower_words);

196 // approach.	185 }

197 ResetSearchTermCache();

198	186

199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);	187 ScoredHistoryMatches URLIndexPrivateData::GetScoredItemsForSearchString(
	Mark P 2016/08/23 22:14:46 Not quite right. The .h file order is fine. Now Show quoted text On 2016/08/23 20:38:19, Lavar Askew wrote: > On 2016/08/22 18:24:11, Mark P wrote: > > Please move this function to where it used to be in this file. > > (Chromium style says the .h and .cc should have the functions in the same > > order.) > > Done. > > I hope I got the order right. I moved the signatures down to the private > section of the .h file like you asked though. Not quite right. The .h file order is fine. Now the .cc needs to correspond with that the .h is. This means putting ExtractIndividualWordVector(), GetHistoryIDSet(), and GetScoredItemsForSearchString() (in that order) immediately after the implementation of URLSchemeIsWhitelisted(). (Right before the line // SearchTermCacheItem -------------------------- .) Lavar Askew 2016/08/24 19:53:14 Done. Show quoted text On 2016/08/23 22:14:46, Mark P wrote: > On 2016/08/23 20:38:19, Lavar Askew wrote: > > On 2016/08/22 18:24:11, Mark P wrote: > > > Please move this function to where it used to be in this file. > > > (Chromium style says the .h and .cc should have the functions in the same > > > order.) > > > > Done. > > > > I hope I got the order right. I moved the signatures down to the private > > section of the .h file like you asked though. > > Not quite right. The .h file order is fine. Now the .cc needs to correspond > with that the .h is. This means putting ExtractIndividualWordVector(), > GetHistoryIDSet(), and GetScoredItemsForSearchString() (in that order) > immediately after the implementation of URLSchemeIsWhitelisted(). > (Right before the line > // SearchTermCacheItem -------------------------- > .) Done.
200	188 const base::string16& search_string,

201 // Trim the candidate pool if it is large. Note that we do not filter out	189 const HistoryIDSet& history_id_set,

202 // items that do not contain the search terms as proper substrings -- doing	190 const size_t max_matches,

203 // so is the performance-costly operation we are trying to avoid in order	191 bookmarks::BookmarkModel* bookmark_model,

204 // to maintain omnibox responsiveness.	192 TemplateURLService* template_url_service) {

205 const size_t kItemsToScoreLimit = 500;	193 ScoredHistoryMatches scored_items;

206 pre_filter_item_count_ = history_id_set.size();

207 // If we trim the results set we do not want to cache the results for next

208 // time as the user's ultimately desired result could easily be eliminated

209 // in this early rough filter.

210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);

211 if (was_trimmed) {

212 HistoryIDVector history_ids;

213 std::copy(history_id_set.begin(), history_id_set.end(),

214 std::back_inserter(history_ids));

215 // Trim down the set by sorting by typed-count, visit-count, and last

216 // visit.

217 HistoryItemFactorGreater

218 item_factor_functor(history_info_map_);

219 std::partial_sort(history_ids.begin(),

220 history_ids.begin() + kItemsToScoreLimit,

221 history_ids.end(),

222 item_factor_functor);

223 history_id_set.clear();

224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

225 std::inserter(history_id_set, history_id_set.end()));

226 post_filter_item_count_ = history_id_set.size();

227 }

228	194

229 // Pass over all of the candidates filtering out any without a proper	195 // Pass over all of the candidates filtering out any without a proper

230 // substring match, inserting those which pass in order by score. Note that	196 // substring match, inserting those which pass in order by score. Note that

231 // in this step we are using the raw search string complete with escaped	197 // in this step we are using the raw search string complete with escaped

232 // URL elements. When the user has specifically typed something akin to	198 // URL elements. When the user has specifically typed something akin to

233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that	199 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

234 // specific substring appears in the URL or page title.	200 // specific substring appears in the URL or page title.

235	201

236 // We call these 'terms' (as opposed to 'words'; see above) as in this case	202 // We call these 'terms' (as opposed to 'words'; see above) as in this case

237 // we only want to break up the search string on 'true' whitespace rather than	203 // we only want to break up the search string on 'true' whitespace rather than

238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we	204 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we

239 // get two 'terms': "colspec=id%20mstone" and "release".	205 // get two 'terms': "colspec=id%20mstone" and "release".

	206 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

240 String16Vector lower_raw_terms = base::SplitString(	207 String16Vector lower_raw_terms = base::SplitString(

241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,	208 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,

242 base::SPLIT_WANT_NONEMPTY);	209 base::SPLIT_WANT_NONEMPTY);

	210

243 if (lower_raw_terms.empty()) {	211 if (lower_raw_terms.empty()) {

244 // Don't score matches when there are no terms to score against. (It's	212 // Don't score matches when there are no terms to score against. (It's

245 // possible that the word break iterater that extracts words to search	213 // possible that the word break iterater that extracts words to search

246 // for in the database allows some whitespace "words" whereas SplitString	214 // for in the database allows some whitespace "words" whereas SplitString

247 // excludes a long list of whitespace.) One could write a scoring	215 // excludes a long list of whitespace.) One could write a scoring

248 // function that gives a reasonable order to matches when there	216 // function that gives a reasonable order to matches when there

249 // are no terms (i.e., all the words are some form of whitespace),	217 // are no terms (i.e., all the words are some form of whitespace),

250 // but this is such a rare edge case that it's not worth the time.	218 // but this is such a rare edge case that it's not worth the time.

251 return scored_items;	219 return scored_items;

252 }	220 }

253 scored_items =	221 scored_items =

254 std::for_each(	222 std::for_each(

255 history_id_set.begin(), history_id_set.end(),	223 history_id_set.begin(), history_id_set.end(),

256 AddHistoryMatch(bookmark_model, template_url_service, *this,	224 AddHistoryMatch(bookmark_model, template_url_service, *this,

257 lower_raw_string, lower_raw_terms,	225 lower_raw_string, lower_raw_terms,

258 base::Time::Now())).ScoredMatches();	226 base::Time::Now())).ScoredMatches();

259	227

260 // Select and sort only the top \|max_matches\| results.	228 // Select and sort only the top \|max_matches\| results.

261 if (scored_items.size() > max_matches) {	229 if (scored_items.size() > max_matches) {

262 std::partial_sort(scored_items.begin(),	230 std::partial_sort(scored_items.begin(),

263 scored_items.begin() +	231 scored_items.begin() +

264 max_matches,	232 max_matches,

265 scored_items.end(),	233 scored_items.end(),

266 ScoredHistoryMatch::MatchScoreGreater);	234 ScoredHistoryMatch::MatchScoreGreater);

267 scored_items.resize(max_matches);	235 scored_items.resize(max_matches);

268 } else {	236 } else {

269 std::sort(scored_items.begin(), scored_items.end(),	237 std::sort(scored_items.begin(), scored_items.end(),

270 ScoredHistoryMatch::MatchScoreGreater);	238 ScoredHistoryMatch::MatchScoreGreater);

271 }	239 }

272 post_scoring_item_count_ = scored_items.size();

273	240

	241 return scored_items;

	242 }

	243

	244 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

	245 base::string16 search_string,

	246 size_t cursor_position,

	247 size_t max_matches,

	248 bookmarks::BookmarkModel* bookmark_model,

	249 TemplateURLService* template_url_service) {

	250 pre_filter_item_count_ = 0;

	251 post_filter_item_count_ = 0;

	252 post_scoring_item_count_ = 0;

	253

	254 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

	255 // approach.

	256 ResetSearchTermCache();

	257

	258 String16Vector lower_words = ExtractIndividualWordVector(search_string);

	259 HistoryIDSet history_id_set = GetHistoryIDSet(lower_words);

	260

	261 // If cursor position is set and useful (not at either end of the

	262 // string), allow the search string to be broken at cursor position.

	263 // We do this by pretending there's a space where the cursor is.

	264 if ((cursor_position != base::string16::npos) &&

	265 (cursor_position < search_string.length()) && (cursor_position > 0)) {

	266 base::string16 search_string_with_word_break(search_string);

	267

	268 base::string16 word_break = base::ASCIIToUTF16(" ");

	269 search_string_with_word_break.insert(cursor_position, word_break);

	270

	271 // Add to history_id_set the ids that are related to the original search

	272 // string, but with the word break.

	273 String16Vector lower_words_with_word_break =

	274 ExtractIndividualWordVector(search_string_with_word_break);

	275

	276 // Check to see if lower_words and lower_words_with_word_break are the same

	277 // size. If they are then this could indicate that the word vectors

	278 // contain the same words.

	279 if (lower_words.size() != lower_words_with_word_break.size()) {

	280 std::sort(lower_words.begin(), lower_words.end(), Less);

	281 std::sort(lower_words_with_word_break.begin(),

	282 lower_words_with_word_break.end(), Less);

	283 if (lower_words != lower_words_with_word_break) {

	284 HistoryIDSet history_id_set_with_word_break =

	285 GetHistoryIDSet(lower_words_with_word_break);

	286 history_id_set.insert(history_id_set_with_word_break.begin(),

	287 history_id_set_with_word_break.end());

	288 }

	289 }

	290 }

	291 // Trim the candidate pool if it is large. Note that we do not filter out

	292 // items that do not contain the search terms as proper substrings -- doing

	293 // so is the performance-costly operation we are trying to avoid in order

	294 // to maintain omnibox responsiveness.

	295 pre_filter_item_count_ = history_id_set.size();

	296 // If we trim the results set we do not want to cache the results for next

	297 // time as the user's ultimately desired result could easily be eliminated

	298 // in this early rough filter.

	299 bool was_trimmed = (pre_filter_item_count_ > max_matches);

	300 if (was_trimmed) {

	301 HistoryIDVector history_ids;

	302 std::copy(history_id_set.begin(), history_id_set.end(),

	303 std::back_inserter(history_ids));

	304 // Trim down the set by sorting by typed-count, visit-count, and last

	305 // visit.

	306 HistoryItemFactorGreater item_factor_functor(history_info_map_);

	307 std::partial_sort(history_ids.begin(), history_ids.begin() + max_matches,

	308 history_ids.end(), item_factor_functor);

	309 history_id_set.clear();

	310 std::copy(history_ids.begin(), history_ids.begin() + max_matches,

	311 std::inserter(history_id_set, history_id_set.end()));

	312 post_filter_item_count_ = history_id_set.size();

	313 }

	314 ScoredHistoryMatches scored_items =

	315 GetScoredItemsForSearchString(search_string, history_id_set, max_matches,

	316 bookmark_model, template_url_service);

274 if (was_trimmed) {	317 if (was_trimmed) {

275 search_term_cache_.clear(); // Invalidate the term cache.	318 search_term_cache_.clear(); // Invalidate the term cache.

276 } else {	319 } else {

277 // Remove any stale SearchTermCacheItems.	320 // Remove any stale SearchTermCacheItems.

278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();	321 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

279 cache_iter != search_term_cache_.end(); ) {	322 cache_iter != search_term_cache_.end(); ) {

280 if (!cache_iter->second.used_)	323 if (!cache_iter->second.used_) {

281 search_term_cache_.erase(cache_iter++);	324 search_term_cache_.erase(cache_iter++);

282 else	325 } else {

283 ++cache_iter;	326 ++cache_iter;

	327 }

284 }	328 }

285 }	329 }

286

287 return scored_items;	330 return scored_items;

288 }	331 }

289	332

290 bool URLIndexPrivateData::UpdateURL(	333 bool URLIndexPrivateData::UpdateURL(

291 history::HistoryService* history_service,	334 history::HistoryService* history_service,

292 const history::URLRow& row,	335 const history::URLRow& row,

293 const std::set<std::string>& scheme_whitelist,	336 const std::set<std::string>& scheme_whitelist,

294 base::CancelableTaskTracker* tracker) {	337 base::CancelableTaskTracker* tracker) {

295 // The row may or may not already be in our index. If it is not already	338 // The row may or may not already be in our index. If it is not already

296 // indexed and it qualifies then it gets indexed. If it is already	339 // indexed and it qualifies then it gets indexed. If it is already

(...skipping 1057 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1354 // First cut: typed count, visit count, recency.	1397 // First cut: typed count, visit count, recency.

1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks	1398 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

1356 // recently visited (within the last 12/24 hours) as highly important. Get	1399 // recently visited (within the last 12/24 hours) as highly important. Get

1357 // input from mpearson.	1400 // input from mpearson.

1358 if (r1.typed_count() != r2.typed_count())	1401 if (r1.typed_count() != r2.typed_count())

1359 return (r1.typed_count() > r2.typed_count());	1402 return (r1.typed_count() > r2.typed_count());

1360 if (r1.visit_count() != r2.visit_count())	1403 if (r1.visit_count() != r2.visit_count())

1361 return (r1.visit_count() > r2.visit_count());	1404 return (r1.visit_count() > r2.visit_count());

1362 return (r1.last_visit() > r2.last_visit());	1405 return (r1.last_visit() > r2.last_visit());

1363 }	1406 }

OLD	NEW

« components/omnibox/browser/url_index_private_data.h ('K') | « components/omnibox/browser/url_index_private_data.h ('k') | no next file » | no next file with comments »