components/omnibox/browser/url_index_private_data.cc - Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox.

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: I tried to remove duplicates and improve effciency. I appreciated the comments. Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/url_index_private_data.h"	5 #include "components/omnibox/browser/url_index_private_data.h"

6	6

7 #include <stdint.h>	7 #include <stdint.h>

8	8

9 #include <functional>	9 #include <functional>

10 #include <iterator>	10 #include <iterator>

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
74 WordStartsMapEntry;	74 WordStartsMapEntry;

75	75

76 // Algorithm Functions ---------------------------------------------------------	76 // Algorithm Functions ---------------------------------------------------------

77	77

78 // Comparison function for sorting search terms by descending length.	78 // Comparison function for sorting search terms by descending length.

79 bool LengthGreater(const base::string16& string_a,	79 bool LengthGreater(const base::string16& string_a,

80 const base::string16& string_b) {	80 const base::string16& string_b) {

81 return string_a.length() > string_b.length();	81 return string_a.length() > string_b.length();

82 }	82 }

83	83

	84 bool Less(const base::string16& string_a, const base::string16& string_b) {

	85 return (string_a.compare(string_b) <= 0);

	86 }

84	87

85 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------	88 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------

86	89

87 // HistoryDBTask used to update the recent visit data for a particular	90 // HistoryDBTask used to update the recent visit data for a particular

88 // row from the history database.	91 // row from the history database.

89 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {	92 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {

90 public:	93 public:

91 explicit UpdateRecentVisitsFromHistoryDBTask(	94 explicit UpdateRecentVisitsFromHistoryDBTask(

92 URLIndexPrivateData* private_data,	95 URLIndexPrivateData* private_data,

93 history::URLID url_id);	96 history::URLID url_id);

(...skipping 51 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
145 // URLIndexPrivateData ---------------------------------------------------------	148 // URLIndexPrivateData ---------------------------------------------------------

146	149

147 URLIndexPrivateData::URLIndexPrivateData()	150 URLIndexPrivateData::URLIndexPrivateData()

148 : restored_cache_version_(0),	151 : restored_cache_version_(0),

149 saved_cache_version_(kCurrentCacheFileVersion),	152 saved_cache_version_(kCurrentCacheFileVersion),

150 pre_filter_item_count_(0),	153 pre_filter_item_count_(0),

151 post_filter_item_count_(0),	154 post_filter_item_count_(0),

152 post_scoring_item_count_(0) {	155 post_scoring_item_count_(0) {

153 }	156 }

154	157

155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(	158 String16Vector URLIndexPrivateData::ExtractIndividualWordVector(

156 base::string16 search_string,	159 const base::string16& search_string) {

157 size_t cursor_position,

158 size_t max_matches,

159 bookmarks::BookmarkModel* bookmark_model,

160 TemplateURLService* template_url_service) {

161 // If cursor position is set and useful (not at either end of the

162 // string), allow the search string to be broken at cursor position.

163 // We do this by pretending there's a space where the cursor is.

164 if ((cursor_position != base::string16::npos) &&

165 (cursor_position < search_string.length()) &&

166 (cursor_position > 0)) {

167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

168 }

169 pre_filter_item_count_ = 0;

170 post_filter_item_count_ = 0;

171 post_scoring_item_count_ = 0;

172 // The search string we receive may contain escaped characters. For reducing	160 // The search string we receive may contain escaped characters. For reducing

173 // the index we need individual, lower-cased words, ignoring escapings. For	161 // the index we need individual, lower-cased words, ignoring escapings. For

174 // the final filtering we need whitespace separated substrings possibly	162 // the final filtering we need whitespace separated substrings possibly

175 // containing escaped characters.	163 // containing escaped characters.

176 base::string16 lower_raw_string(base::i18n::ToLower(search_string));	164 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

177 base::string16 lower_unescaped_string =	165 base::string16 lower_unescaped_string = net::UnescapeURLComponent(

178 net::UnescapeURLComponent(lower_raw_string,	166 lower_raw_string,

179 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|	167 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);	168 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

181 // Extract individual 'words' (as opposed to 'terms'; see below) from the	169 // Extract individual 'words' (as opposed to 'terms'; see below) from the

182 // search string. When the user types "colspec=ID%20Mstone Release" we get	170 // search string. When the user types "colspec=ID%20Mstone Release" we get

183 // four 'words': "colspec", "id", "mstone" and "release".	171 // four 'words': "colspec", "id", "mstone" and "release".

184 String16Vector lower_words(	172 return String16VectorFromString16(lower_unescaped_string, false, nullptr);

185 String16VectorFromString16(lower_unescaped_string, false, nullptr));	173 }

186 ScoredHistoryMatches scored_items;

187	174

	175 HistoryIDSet URLIndexPrivateData::GetHistoryIDSet(

	176 const String16Vector& lower_words) {

188 // Do nothing if we have indexed no words (probably because we've not been	177 // Do nothing if we have indexed no words (probably because we've not been

189 // initialized yet) or the search string has no words.	178 // initialized yet) or the search string has no words.

190 if (word_list_.empty() \|\| lower_words.empty()) {	179 if (word_list_.empty() \|\| lower_words.empty()) {

191 search_term_cache_.clear(); // Invalidate the term cache.	180 search_term_cache_.clear(); // Invalidate the term cache.

192 return scored_items;	181 return HistoryIDSet();

193 }	182 }

194	183

195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep	184 return HistoryIDSetFromWords(lower_words);

196 // approach.	185 }

197 ResetSearchTermCache();

198	186

199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);	187 ScoredHistoryMatches URLIndexPrivateData::GetScoredItemsForSearchString(

200	188 const base::string16& search_string,

201 // Trim the candidate pool if it is large. Note that we do not filter out	189 const HistoryIDSet& history_id_set,

202 // items that do not contain the search terms as proper substrings -- doing	190 const size_t& max_matches,

203 // so is the performance-costly operation we are trying to avoid in order	191 bookmarks::BookmarkModel* bookmark_model,

204 // to maintain omnibox responsiveness.	192 TemplateURLService* template_url_service) {

205 const size_t kItemsToScoreLimit = 500;	193 ScoredHistoryMatches scored_items;

206 pre_filter_item_count_ = history_id_set.size();

207 // If we trim the results set we do not want to cache the results for next

208 // time as the user's ultimately desired result could easily be eliminated

209 // in this early rough filter.

210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);

211 if (was_trimmed) {

212 HistoryIDVector history_ids;

213 std::copy(history_id_set.begin(), history_id_set.end(),

214 std::back_inserter(history_ids));

215 // Trim down the set by sorting by typed-count, visit-count, and last

216 // visit.

217 HistoryItemFactorGreater

218 item_factor_functor(history_info_map_);

219 std::partial_sort(history_ids.begin(),

220 history_ids.begin() + kItemsToScoreLimit,

221 history_ids.end(),

222 item_factor_functor);

223 history_id_set.clear();

224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

225 std::inserter(history_id_set, history_id_set.end()));

226 post_filter_item_count_ = history_id_set.size();

227 }

228	194

229 // Pass over all of the candidates filtering out any without a proper	195 // Pass over all of the candidates filtering out any without a proper

230 // substring match, inserting those which pass in order by score. Note that	196 // substring match, inserting those which pass in order by score. Note that

231 // in this step we are using the raw search string complete with escaped	197 // in this step we are using the raw search string complete with escaped

232 // URL elements. When the user has specifically typed something akin to	198 // URL elements. When the user has specifically typed something akin to

233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that	199 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

234 // specific substring appears in the URL or page title.	200 // specific substring appears in the URL or page title.

235	201

236 // We call these 'terms' (as opposed to 'words'; see above) as in this case	202 // We call these 'terms' (as opposed to 'words'; see above) as in this case

237 // we only want to break up the search string on 'true' whitespace rather than	203 // we only want to break up the search string on 'true' whitespace rather than

238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we	204 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we

239 // get two 'terms': "colspec=id%20mstone" and "release".	205 // get two 'terms': "colspec=id%20mstone" and "release".

	206 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

240 String16Vector lower_raw_terms = base::SplitString(	207 String16Vector lower_raw_terms = base::SplitString(

241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,	208 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,

242 base::SPLIT_WANT_NONEMPTY);	209 base::SPLIT_WANT_NONEMPTY);

	210

243 if (lower_raw_terms.empty()) {	211 if (lower_raw_terms.empty()) {

244 // Don't score matches when there are no terms to score against. (It's	212 // Don't score matches when there are no terms to score against. (It's

245 // possible that the word break iterater that extracts words to search	213 // possible that the word break iterater that extracts words to search

246 // for in the database allows some whitespace "words" whereas SplitString	214 // for in the database allows some whitespace "words" whereas SplitString

247 // excludes a long list of whitespace.) One could write a scoring	215 // excludes a long list of whitespace.) One could write a scoring

248 // function that gives a reasonable order to matches when there	216 // function that gives a reasonable order to matches when there

249 // are no terms (i.e., all the words are some form of whitespace),	217 // are no terms (i.e., all the words are some form of whitespace),

250 // but this is such a rare edge case that it's not worth the time.	218 // but this is such a rare edge case that it's not worth the time.

251 return scored_items;	219 return scored_items;

252 }	220 }

253 scored_items =	221 scored_items =

254 std::for_each(	222 std::for_each(

255 history_id_set.begin(), history_id_set.end(),	223 history_id_set.begin(), history_id_set.end(),

256 AddHistoryMatch(bookmark_model, template_url_service, *this,	224 AddHistoryMatch(bookmark_model, template_url_service, *this,

257 lower_raw_string, lower_raw_terms,	225 lower_raw_string, lower_raw_terms,

258 base::Time::Now())).ScoredMatches();	226 base::Time::Now())).ScoredMatches();

259	227

260 // Select and sort only the top \|max_matches\| results.	228 // Select and sort only the top \|max_matches\| results.

	229
	Mark P 2016/08/18 19:36:53 nit: don't add a blank line nit: don't add a blank line Lavar Askew 2016/08/19 04:46:07 Done. Show quoted text On 2016/08/18 19:36:53, Mark P wrote: > nit: don't add a blank line Done.
261 if (scored_items.size() > max_matches) {	230 if (scored_items.size() > max_matches) {

262 std::partial_sort(scored_items.begin(),	231 std::partial_sort(scored_items.begin(),

263 scored_items.begin() +	232 scored_items.begin() +

264 max_matches,	233 max_matches,

265 scored_items.end(),	234 scored_items.end(),

266 ScoredHistoryMatch::MatchScoreGreater);	235 ScoredHistoryMatch::MatchScoreGreater);

267 scored_items.resize(max_matches);	236 scored_items.resize(max_matches);

268 } else {	237 } else {

269 std::sort(scored_items.begin(), scored_items.end(),	238 std::sort(scored_items.begin(), scored_items.end(),

270 ScoredHistoryMatch::MatchScoreGreater);	239 ScoredHistoryMatch::MatchScoreGreater);

271 }	240 }

272 post_scoring_item_count_ = scored_items.size();	241

	242 return scored_items;

	243 }

	244

	245 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

	246 base::string16 search_string,

	247 size_t cursor_position,

	248 size_t max_matches,

	249 bookmarks::BookmarkModel* bookmark_model,

	250 TemplateURLService* template_url_service) {

	251 pre_filter_item_count_ = 0;

	252 post_filter_item_count_ = 0;

	253 post_scoring_item_count_ = 0;

	254

	255 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

	256 // approach.

	257 ResetSearchTermCache();

	258

	259 String16Vector lower_words = ExtractIndividualWordVector(search_string);

	260 HistoryIDSet history_id_set = GetHistoryIDSet(lower_words);

	261

	262 // If cursor position is set and useful (not at either end of the

	263 // string), allow the search string to be broken at cursor position.

	264 // We do this by pretending there's a space where the cursor is.

	265 if ((cursor_position != base::string16::npos) &&

	266 (cursor_position < search_string.length()) && (cursor_position > 0)) {

	267 base::string16 search_string_with_word_break(search_string);

	268 HistoryIDSet history_id_set_with_word_break;
	Mark P 2016/08/18 19:36:53 This declaration doesn't need to be at this level; This declaration doesn't need to be at this level; let it live inside the inner if statement. Lavar Askew 2016/08/19 04:46:07 Can't believe I missed that...Done. Show quoted text On 2016/08/18 19:36:53, Mark P wrote: > This declaration doesn't need to be at this level; let it live inside the inner > if statement. Can't believe I missed that...Done.
	269

	270 base::string16 word_break = base::ASCIIToUTF16(" ");

	271 search_string_with_word_break.insert(cursor_position, word_break);

	272

	273 // Add to history_id_set the ids that are related to the original

	274 // search string, but with the word break.

	275 String16Vector lower_words_with_word_break =

	276 ExtractIndividualWordVector(search_string_with_word_break);

	277

	278 // check to see if lower_words and lower_words_with_word_break
	Mark P 2016/08/18 19:36:53 nit: start sentence with capital letter ("Check".. nit: start sentence with capital letter ("Check"...) nit: please wrap the comment closer to 80 characters Lavar Askew 2016/08/19 04:46:07 Done. Show quoted text On 2016/08/18 19:36:53, Mark P wrote: > nit: start sentence with capital letter ("Check"...) > nit: please wrap the comment closer to 80 characters Done.
	279 // are the same size. If they are then this could indicate that

	280 // the word vectors contain the same words

	281 if (lower_words.size() != lower_words_with_word_break.size()) {

	282 std::sort(lower_words.begin(), lower_words.end(), Less);

	283 std::sort(lower_words_with_word_break.begin(),

	284 lower_words_with_word_break.end(), Less);

	285
	Mark P 2016/08/18 19:36:53 nit: no need for this blank line I think nit: no need for this blank line I think
	286 if (lower_words != lower_words_with_word_break) {

	287 history_id_set_with_word_break =

	288 GetHistoryIDSet(lower_words_with_word_break);

	289
	Mark P 2016/08/18 19:36:53 nit: no need for this blank line I think nit: no need for this blank line I think
	290 history_id_set.insert(history_id_set_with_word_break.begin(),

	291 history_id_set_with_word_break.end());

	292 }

	293 }

	294 }

	295

	296 // Trim the candidate pool if it is large. Note that we do not filter out

	297 // items that do not contain the search terms as proper substrings -- doing

	298 // so is the performance-costly operation we are trying to avoid in order

	299 // to maintain omnibox responsiveness.

	300 pre_filter_item_count_ = history_id_set.size();

	301 // If we trim the results set we do not want to cache the results for next

	302 // time as the user's ultimately desired result could easily be eliminated

	303 // in this early rough filter.

	304 bool was_trimmed = (pre_filter_item_count_ > max_matches);

	305 if (was_trimmed) {

	306 HistoryIDVector history_ids;

	307 std::copy(history_id_set.begin(), history_id_set.end(),

	308 std::back_inserter(history_ids));

	309 // Trim down the set by sorting by typed-count, visit-count, and last

	310 // visit.

	311 HistoryItemFactorGreater item_factor_functor(history_info_map_);

	312 std::partial_sort(history_ids.begin(), history_ids.begin() + max_matches,

	313 history_ids.end(), item_factor_functor);

	314 history_id_set.clear();

	315 std::copy(history_ids.begin(), history_ids.begin() + max_matches,

	316 std::inserter(history_id_set, history_id_set.end()));

	317 post_filter_item_count_ = history_id_set.size();

	318 }

	319

	320 ScoredHistoryMatches scored_items =

	321 GetScoredItemsForSearchString(search_string, history_id_set, max_matches,

	322 bookmark_model, template_url_service);

273	323

274 if (was_trimmed) {	324 if (was_trimmed) {

275 search_term_cache_.clear(); // Invalidate the term cache.	325 search_term_cache_.clear(); // Invalidate the term cache.

276 } else {	326 } else {

277 // Remove any stale SearchTermCacheItems.	327 // Remove any stale SearchTermCacheItems.

278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();	328 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

279 cache_iter != search_term_cache_.end(); ) {	329 cache_iter != search_term_cache_.end(); ) {

280 if (!cache_iter->second.used_)	330 if (!cache_iter->second.used_) {

281 search_term_cache_.erase(cache_iter++);	331 search_term_cache_.erase(cache_iter++);

282 else	332 } else {

283 ++cache_iter;	333 ++cache_iter;

	334 }

284 }	335 }

285 }	336 }

286	337

287 return scored_items;	338 return scored_items;

288 }	339 }

289	340

290 bool URLIndexPrivateData::UpdateURL(	341 bool URLIndexPrivateData::UpdateURL(

291 history::HistoryService* history_service,	342 history::HistoryService* history_service,

292 const history::URLRow& row,	343 const history::URLRow& row,

293 const std::set<std::string>& scheme_whitelist,	344 const std::set<std::string>& scheme_whitelist,

(...skipping 1060 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1354 // First cut: typed count, visit count, recency.	1405 // First cut: typed count, visit count, recency.

1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks	1406 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

1356 // recently visited (within the last 12/24 hours) as highly important. Get	1407 // recently visited (within the last 12/24 hours) as highly important. Get

1357 // input from mpearson.	1408 // input from mpearson.

1358 if (r1.typed_count() != r2.typed_count())	1409 if (r1.typed_count() != r2.typed_count())

1359 return (r1.typed_count() > r2.typed_count());	1410 return (r1.typed_count() > r2.typed_count());

1360 if (r1.visit_count() != r2.visit_count())	1411 if (r1.visit_count() != r2.visit_count())

1361 return (r1.visit_count() > r2.visit_count());	1412 return (r1.visit_count() > r2.visit_count());

1362 return (r1.last_visit() > r2.last_visit());	1413 return (r1.last_visit() > r2.last_visit());

1363 }	1414 }

OLD	NEW

« components/omnibox/browser/url_index_private_data.h ('K') | « components/omnibox/browser/url_index_private_data.h ('k') | no next file » | no next file with comments »