components/omnibox/browser/url_index_private_data.cc - Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox.

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Code review fixes for patch entitled "Generating autocomplete results with and without word breaks … Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/url_index_private_data.h"	5 #include "components/omnibox/browser/url_index_private_data.h"

6	6

7 #include <stdint.h>	7 #include <stdint.h>

8	8

9 #include <functional>	9 #include <functional>

10 #include <iterator>	10 #include <iterator>

(...skipping 134 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
145 // URLIndexPrivateData ---------------------------------------------------------	145 // URLIndexPrivateData ---------------------------------------------------------

146	146

147 URLIndexPrivateData::URLIndexPrivateData()	147 URLIndexPrivateData::URLIndexPrivateData()

148 : restored_cache_version_(0),	148 : restored_cache_version_(0),

149 saved_cache_version_(kCurrentCacheFileVersion),	149 saved_cache_version_(kCurrentCacheFileVersion),

150 pre_filter_item_count_(0),	150 pre_filter_item_count_(0),

151 post_filter_item_count_(0),	151 post_filter_item_count_(0),

152 post_scoring_item_count_(0) {	152 post_scoring_item_count_(0) {

153 }	153 }

154	154

155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(	155 HistoryIDSet URLIndexPrivateData::HistoryItemsForWords(

156 base::string16 search_string,	156 base::string16 search_string) {

157 size_t cursor_position,	157

158 size_t max_matches,	158 HistoryIDSet history_id_set;
	Mark P 2016/08/10 17:46:19 This variable isn't needed; simply return HistoryI This variable isn't needed; simply return HistoryIDSet() at the first return below and return HistoryIDSetFromWords(lower_words) at the second return, no need to create an intermediate copy within the function. Lavar Askew 2016/08/18 03:07:51 Done. Show quoted text On 2016/08/10 17:46:19, Mark P wrote: > This variable isn't needed; simply return HistoryIDSet() at the first return > below and return HistoryIDSetFromWords(lower_words) at the second return, no > need to create an intermediate copy within the function. Done.
159 bookmarks::BookmarkModel* bookmark_model,	159

160 TemplateURLService* template_url_service) {

161 // If cursor position is set and useful (not at either end of the

162 // string), allow the search string to be broken at cursor position.

163 // We do this by pretending there's a space where the cursor is.

164 if ((cursor_position != base::string16::npos) &&

165 (cursor_position < search_string.length()) &&

166 (cursor_position > 0)) {

167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

168 }

169 pre_filter_item_count_ = 0;

170 post_filter_item_count_ = 0;

171 post_scoring_item_count_ = 0;

172 // The search string we receive may contain escaped characters. For reducing	160 // The search string we receive may contain escaped characters. For reducing

173 // the index we need individual, lower-cased words, ignoring escapings. For	161 // the index we need individual, lower-cased words, ignoring escapings. For

174 // the final filtering we need whitespace separated substrings possibly	162 // the final filtering we need whitespace separated substrings possibly

175 // containing escaped characters.	163 // containing escaped characters.

176 base::string16 lower_raw_string(base::i18n::ToLower(search_string));	164 base::string16 lower_raw_string(

	165 base::i18n::ToLower(search_string));

177 base::string16 lower_unescaped_string =	166 base::string16 lower_unescaped_string =

178 net::UnescapeURLComponent(lower_raw_string,	167 net::UnescapeURLComponent(lower_raw_string,

179 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|	168 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);	169 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

181 // Extract individual 'words' (as opposed to 'terms'; see below) from the	170 // Extract individual 'words' (as opposed to 'terms'; see below) from the

182 // search string. When the user types "colspec=ID%20Mstone Release" we get	171 // search string. When the user types "colspec=ID%20Mstone Release" we get

183 // four 'words': "colspec", "id", "mstone" and "release".	172 // four 'words': "colspec", "id", "mstone" and "release".

184 String16Vector lower_words(	173 String16Vector lower_words(

185 String16VectorFromString16(lower_unescaped_string, false, nullptr));	174 String16VectorFromString16(

186 ScoredHistoryMatches scored_items;	175 lower_unescaped_string, false, nullptr));

187	176

188 // Do nothing if we have indexed no words (probably because we've not been	177 // Do nothing if we have indexed no words (probably because we've not been

189 // initialized yet) or the search string has no words.	178 // initialized yet) or the search string has no words.

190 if (word_list_.empty() \|\| lower_words.empty()) {	179 if (word_list_.empty() \|\| lower_words.empty()) {

191 search_term_cache_.clear(); // Invalidate the term cache.	180 search_term_cache_.clear(); // Invalidate the term cache.

192 return scored_items;	181 return history_id_set;

193 }	182 }

194	183

195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep	184 history_id_set =

196 // approach.	185 HistoryIDSetFromWords(lower_words);

197 ResetSearchTermCache();

198	186

199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);	187 return history_id_set;

	188 }

200	189

201 // Trim the candidate pool if it is large. Note that we do not filter out	190 ScoredHistoryMatches URLIndexPrivateData::GetScoredItemsForSearchString (

202 // items that do not contain the search terms as proper substrings -- doing	191 base::string16 search_string,

203 // so is the performance-costly operation we are trying to avoid in order	192 HistoryIDSet history_id_set,

204 // to maintain omnibox responsiveness.	193 size_t max_matches,

205 const size_t kItemsToScoreLimit = 500;	194 bookmarks::BookmarkModel* bookmark_model,

206 pre_filter_item_count_ = history_id_set.size();	195 TemplateURLService* template_url_service) {

207 // If we trim the results set we do not want to cache the results for next	196 ScoredHistoryMatches scored_items;

208 // time as the user's ultimately desired result could easily be eliminated

209 // in this early rough filter.

210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);

211 if (was_trimmed) {

212 HistoryIDVector history_ids;

213 std::copy(history_id_set.begin(), history_id_set.end(),

214 std::back_inserter(history_ids));

215 // Trim down the set by sorting by typed-count, visit-count, and last

216 // visit.

217 HistoryItemFactorGreater

218 item_factor_functor(history_info_map_);

219 std::partial_sort(history_ids.begin(),

220 history_ids.begin() + kItemsToScoreLimit,

221 history_ids.end(),

222 item_factor_functor);

223 history_id_set.clear();

224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

225 std::inserter(history_id_set, history_id_set.end()));

226 post_filter_item_count_ = history_id_set.size();

227 }

228	197

229 // Pass over all of the candidates filtering out any without a proper	198 // Pass over all of the candidates filtering out any without a proper

230 // substring match, inserting those which pass in order by score. Note that	199 // substring match, inserting those which pass in order by score. Note that

231 // in this step we are using the raw search string complete with escaped	200 // in this step we are using the raw search string complete with escaped

232 // URL elements. When the user has specifically typed something akin to	201 // URL elements. When the user has specifically typed something akin to

233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that	202 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

234 // specific substring appears in the URL or page title.	203 // specific substring appears in the URL or page title.

235	204

236 // We call these 'terms' (as opposed to 'words'; see above) as in this case	205 // We call these 'terms' (as opposed to 'words'; see above) as in this case

237 // we only want to break up the search string on 'true' whitespace rather than	206 // we only want to break up the search string on 'true' whitespace rather than

238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we	207 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we

239 // get two 'terms': "colspec=id%20mstone" and "release".	208 // get two 'terms': "colspec=id%20mstone" and "release".

	209 base::string16 lower_raw_string(

	210 base::i18n::ToLower(search_string));

240 String16Vector lower_raw_terms = base::SplitString(	211 String16Vector lower_raw_terms = base::SplitString(

241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,	212 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,

242 base::SPLIT_WANT_NONEMPTY);	213 base::SPLIT_WANT_NONEMPTY);

	214

243 if (lower_raw_terms.empty()) {	215 if (lower_raw_terms.empty()) {

244 // Don't score matches when there are no terms to score against. (It's	216 // Don't score matches when there are no terms to score against. (It's

245 // possible that the word break iterater that extracts words to search	217 // possible that the word break iterater that extracts words to search

246 // for in the database allows some whitespace "words" whereas SplitString	218 // for in the database allows some whitespace "words" whereas SplitString

247 // excludes a long list of whitespace.) One could write a scoring	219 // excludes a long list of whitespace.) One could write a scoring

248 // function that gives a reasonable order to matches when there	220 // function that gives a reasonable order to matches when there

249 // are no terms (i.e., all the words are some form of whitespace),	221 // are no terms (i.e., all the words are some form of whitespace),

250 // but this is such a rare edge case that it's not worth the time.	222 // but this is such a rare edge case that it's not worth the time.

251 return scored_items;	223 return scored_items;

252 }	224 }

253 scored_items =	225 scored_items =

254 std::for_each(	226 std::for_each(

255 history_id_set.begin(), history_id_set.end(),	227 history_id_set.begin(), history_id_set.end(),

256 AddHistoryMatch(bookmark_model, template_url_service, *this,	228 AddHistoryMatch(bookmark_model, template_url_service, *this,

257 lower_raw_string, lower_raw_terms,	229 lower_raw_string, lower_raw_terms,

258 base::Time::Now())).ScoredMatches();	230 base::Time::Now())).ScoredMatches();

259	231

260 // Select and sort only the top \|max_matches\| results.	232 // Select and sort only the top \|max_matches\| results.

261 if (scored_items.size() > max_matches) {	233 if (scored_items.size() > max_matches) {

262 std::partial_sort(scored_items.begin(),	234 std::partial_sort(scored_items.begin(),

263 scored_items.begin() +	235 scored_items.begin() +

264 max_matches,	236 max_matches,

265 scored_items.end(),	237 scored_items.end(),

266 ScoredHistoryMatch::MatchScoreGreater);	238 ScoredHistoryMatch::MatchScoreGreater);

267 scored_items.resize(max_matches);	239 scored_items.resize(max_matches);

268 } else {	240 }

	241 else {

269 std::sort(scored_items.begin(), scored_items.end(),	242 std::sort(scored_items.begin(), scored_items.end(),

270 ScoredHistoryMatch::MatchScoreGreater);	243 ScoredHistoryMatch::MatchScoreGreater);

271 }	244 }

272 post_scoring_item_count_ = scored_items.size();

273

274 if (was_trimmed) {

275 search_term_cache_.clear(); // Invalidate the term cache.

276 } else {

277 // Remove any stale SearchTermCacheItems.

278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

279 cache_iter != search_term_cache_.end(); ) {

280 if (!cache_iter->second.used_)

281 search_term_cache_.erase(cache_iter++);

282 else

283 ++cache_iter;

284 }

285 }

286	245

287 return scored_items;	246 return scored_items;

288 }	247 }

289	248

	249

	250 bool URLIndexPrivateData::TrimCandidatePool (HistoryIDSet history_id_set) {

	251

	252 // Trim the candidate pool if it is large. Note that we do not filter out
	Mark P 2016/08/10 17:46:19 I assume this whole block was copied with no modif I assume this whole block was copied with no modifications. Lavar Askew 2016/08/18 03:07:51 Yes, you are correct. Done. Show quoted text On 2016/08/10 17:46:19, Mark P wrote: > I assume this whole block was copied with no modifications. Yes, you are correct. Done.
	253 // items that do not contain the search terms as proper substrings -- doing

	254 // so is the performance-costly operation we are trying to avoid in order

	255 // to maintain omnibox responsiveness.

	256 const size_t kItemsToScoreLimit = 500;

	257 pre_filter_item_count_ = history_id_set.size();

	258 // If we trim the results set we do not want to cache the results for next

	259 // time as the user's ultimately desired result could easily be eliminated

	260 // in this early rough filter.

	261 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);

	262 if (was_trimmed) {

	263 HistoryIDVector history_ids;

	264 std::copy(history_id_set.begin(), history_id_set.end(),

	265 std::back_inserter(history_ids));

	266 // Trim down the set by sorting by typed-count, visit-count, and last

	267 // visit.

	268 HistoryItemFactorGreater

	269 item_factor_functor(history_info_map_);

	270 std::partial_sort(history_ids.begin(),

	271 history_ids.begin() + kItemsToScoreLimit,

	272 history_ids.end(),

	273 item_factor_functor);

	274 history_id_set.clear();

	275 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

	276 std::inserter(history_id_set, history_id_set.end()));

	277 post_filter_item_count_ = history_id_set.size();

	278 }

	279

	280 return was_trimmed;

	281 }

	282

	283 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

	284 base::string16 search_string,

	285 size_t cursor_position,

	286 size_t max_matches,

	287 bookmarks::BookmarkModel* bookmark_model,

	288 TemplateURLService* template_url_service) {

	289 pre_filter_item_count_ = 0;

	290 post_filter_item_count_ = 0;

	291 post_scoring_item_count_ = 0;

	292

	293 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

	294 // approach.

	295 ResetSearchTermCache();

	296

	297 HistoryIDSet history_id_set = HistoryItemsForWords(search_string);

	298

	299 bool history_id_set_without_word_break_was_trimmed =

	300 TrimCandidatePool(history_id_set);

	301

	302 ScoredHistoryMatches scored_items_without_word_break =

	303 GetScoredItemsForSearchString(

	304 search_string,

	305 history_id_set,

	306 max_matches,

	307 bookmark_model,

	308 template_url_service);

	309

	310 ScoredHistoryMatches all_scored_items;

	311 ScoredHistoryMatches scored_items_with_word_break;

	312

	313 bool history_id_set_with_word_break_was_trimmed = false;

	314

	315 // If cursor position is set and useful (not at either end of the

	316 // string), allow the search string to be broken at cursor position.

	317 // We do this by pretending there's a space where the cursor is.

	318 if ((cursor_position != base::string16::npos) &&

	319 (cursor_position < search_string.length()) &&

	320 (cursor_position > 0)) {

	321 base::string16 search_string_with_word_break = search_string;

	322 HistoryIDSet history_id_set_with_word_break;

	323

	324 base::string16 word_break =

	325 base::ASCIIToUTF16(" ");

	326 search_string_with_word_break.insert(cursor_position, word_break);

	327

	328 // Add to history_id_set the ids that are related to the original

	329 // search string, but with the word break.

	330 history_id_set_with_word_break =

	331 HistoryItemsForWords(search_string_with_word_break);
	Mark P 2016/08/10 17:46:19 This might do unnecessary work if the new search s This might do unnecessary work if the new search string breaks into the same words as the old one. The search term cache may help avoid some of this unnecessary work, but I think we can do better here. Can you please improve this? Let's not slow down users unnecessarily.
	332

	333 history_id_set_with_word_break_was_trimmed =

	334 TrimCandidatePool (history_id_set_with_word_break);

	335

	336 scored_items_with_word_break = GetScoredItemsForSearchString(

	337 search_string_with_word_break,

	338 history_id_set_with_word_break,

	339 max_matches,

	340 bookmark_model,

	341 template_url_service);

	342 }

	343

	344 if (history_id_set_without_word_break_was_trimmed \|\|

	345 history_id_set_with_word_break_was_trimmed) {

	346 search_term_cache_.clear(); // Invalidate the term cache.

	347 }

	348 else {

	349 // Remove any stale SearchTermCacheItems.

	350 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

	351 cache_iter != search_term_cache_.end(); ) {

	352 if (!cache_iter->second.used_) {

	353 search_term_cache_.erase(cache_iter++);

	354 }

	355 else {

	356 ++cache_iter;

	357 }

	358 }

	359 }

	360

	361 // all_scored_items represents the unification the

	362 //ScoredHistoryMatches for the search string with and without the word break.
	Mark P 2016/08/10 17:46:19 The code below can be made more efficient and easi The code below can be made more efficient and easier to read. Approximately: if (without_word_break.empty()) return with_word_break with_word_break.insert(with_word_break.end(), without.begin(), without.end()) return with_word_break; ( http://stackoverflow.com/a/2208313 is good reading by the way about what I skipped the reserve. Also this technique doesn't need another variable, no need to copy all those entries to a third variable.) Lavar Askew 2016/08/18 03:07:51 Done. Show quoted text On 2016/08/10 17:46:19, Mark P wrote: > The code below can be made more efficient and easier to read. Approximately: > if (without_word_break.empty()) > return with_word_break > with_word_break.insert(with_word_break.end(), without.begin(), without.end()) > return with_word_break; > > ( http://stackoverflow.com/a/2208313 is good reading by the way about what I > skipped the reserve. Also this technique doesn't need another variable, no need > to copy all those entries to a third variable.) Done.
	363 if (scored_items_without_word_break.size() > 0 &&

	364 scored_items_with_word_break.size() > 0) {

	365 post_scoring_item_count_ =

	366 scored_items_without_word_break.size()

	367 + scored_items_with_word_break.size();

	368

	369 all_scored_items.reserve(post_scoring_item_count_);

	370 all_scored_items.insert(all_scored_items.end(),

	371 scored_items_without_word_break.begin(),

	372 scored_items_without_word_break.end());

	373 all_scored_items.insert(all_scored_items.end(),

	374 scored_items_with_word_break.begin(),

	375 scored_items_with_word_break.end());

	376 }

	377 else if (scored_items_without_word_break.size() == 0 &&

	378 scored_items_with_word_break.size() > 0) {

	379 all_scored_items = scored_items_with_word_break;

	380 }

	381 else if (scored_items_without_word_break.size() > 0 &&

	382 scored_items_with_word_break.size() == 0) {

	383 all_scored_items = scored_items_without_word_break;

	384 }

	385
	Mark P 2016/08/10 17:46:19 I could imagine situations where the items overlap I could imagine situations where the items overlap. You should be removing duplicates (smartly) here. Lavar Askew 2016/08/18 03:07:51 Done. Show quoted text On 2016/08/10 17:46:19, Mark P wrote: > I could imagine situations where the items overlap. You should be removing > duplicates (smartly) here. Done.
	386 return all_scored_items;
	Mark P 2016/08/10 17:46:19 Also, the number of items we're supposed to return Also, the number of items we're supposed to return should be no more than max_matches. You need to truncate the list if necessary. Right now you may be returning twice that number. Lavar Askew 2016/08/18 03:07:52 Done. Show quoted text On 2016/08/10 17:46:19, Mark P wrote: > Also, the number of items we're supposed to return should be no more than > max_matches. You need to truncate the list if necessary. Right now you may be > returning twice that number. Done.
	387 }

	388

290 bool URLIndexPrivateData::UpdateURL(	389 bool URLIndexPrivateData::UpdateURL(

291 history::HistoryService* history_service,	390 history::HistoryService* history_service,

292 const history::URLRow& row,	391 const history::URLRow& row,

293 const std::set<std::string>& scheme_whitelist,	392 const std::set<std::string>& scheme_whitelist,

294 base::CancelableTaskTracker* tracker) {	393 base::CancelableTaskTracker* tracker) {

295 // The row may or may not already be in our index. If it is not already	394 // The row may or may not already be in our index. If it is not already

296 // indexed and it qualifies then it gets indexed. If it is already	395 // indexed and it qualifies then it gets indexed. If it is already

297 // indexed and still qualifies then it gets updated, otherwise it	396 // indexed and still qualifies then it gets updated, otherwise it

298 // is deleted from the index.	397 // is deleted from the index.

299 bool row_was_updated = false;	398 bool row_was_updated = false;

(...skipping 1054 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1354 // First cut: typed count, visit count, recency.	1453 // First cut: typed count, visit count, recency.

1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks	1454 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

1356 // recently visited (within the last 12/24 hours) as highly important. Get	1455 // recently visited (within the last 12/24 hours) as highly important. Get

1357 // input from mpearson.	1456 // input from mpearson.

1358 if (r1.typed_count() != r2.typed_count())	1457 if (r1.typed_count() != r2.typed_count())

1359 return (r1.typed_count() > r2.typed_count());	1458 return (r1.typed_count() > r2.typed_count());

1360 if (r1.visit_count() != r2.visit_count())	1459 if (r1.visit_count() != r2.visit_count())

1361 return (r1.visit_count() > r2.visit_count());	1460 return (r1.visit_count() > r2.visit_count());

1362 return (r1.last_visit() > r2.last_visit());	1461 return (r1.last_visit() > r2.last_visit());

1363 }	1462 }

OLD	NEW

« components/omnibox/browser/url_index_private_data.h ('K') | « components/omnibox/browser/url_index_private_data.h ('k') | no next file » | no next file with comments »