components/omnibox/browser/url_index_private_data.cc - Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox.

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2187343002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Removed new line after defining lower_words. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/url_index_private_data.h"	5 #include "components/omnibox/browser/url_index_private_data.h"

6	6

7 #include <stdint.h>	7 #include <stdint.h>

8	8

9 #include <functional>	9 #include <functional>

10 #include <iterator>	10 #include <iterator>

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
74 WordStartsMapEntry;	74 WordStartsMapEntry;

75	75

76 // Algorithm Functions ---------------------------------------------------------	76 // Algorithm Functions ---------------------------------------------------------

77	77

78 // Comparison function for sorting search terms by descending length.	78 // Comparison function for sorting search terms by descending length.

79 bool LengthGreater(const base::string16& string_a,	79 bool LengthGreater(const base::string16& string_a,

80 const base::string16& string_b) {	80 const base::string16& string_b) {

81 return string_a.length() > string_b.length();	81 return string_a.length() > string_b.length();

82 }	82 }

83	83

84

85 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------	84 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------

86	85

87 // HistoryDBTask used to update the recent visit data for a particular	86 // HistoryDBTask used to update the recent visit data for a particular

88 // row from the history database.	87 // row from the history database.

89 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {	88 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {

90 public:	89 public:

91 explicit UpdateRecentVisitsFromHistoryDBTask(	90 explicit UpdateRecentVisitsFromHistoryDBTask(

92 URLIndexPrivateData* private_data,	91 URLIndexPrivateData* private_data,

93 history::URLID url_id);	92 history::URLID url_id);

94	93

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
134 }	133 }

135	134

136 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {	135 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {

137 if (succeeded_)	136 if (succeeded_)

138 private_data_->UpdateRecentVisits(url_id_, recent_visits_);	137 private_data_->UpdateRecentVisits(url_id_, recent_visits_);

139 }	138 }

140	139

141 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {	140 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {

142 }	141 }

143	142

144

145 // URLIndexPrivateData ---------------------------------------------------------	143 // URLIndexPrivateData ---------------------------------------------------------

146	144

147 URLIndexPrivateData::URLIndexPrivateData()	145 URLIndexPrivateData::URLIndexPrivateData()

148 : restored_cache_version_(0),	146 : restored_cache_version_(0),

149 saved_cache_version_(kCurrentCacheFileVersion),	147 saved_cache_version_(kCurrentCacheFileVersion),

150 pre_filter_item_count_(0),	148 pre_filter_item_count_(0),

151 post_filter_item_count_(0),	149 post_filter_item_count_(0),

152 post_scoring_item_count_(0) {	150 post_scoring_item_count_(0) {

153 }	151 }

154	152

155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(	153 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

156 base::string16 search_string,	154 base::string16 original_search_string,

157 size_t cursor_position,	155 size_t cursor_position,

158 size_t max_matches,	156 size_t max_matches,

159 bookmarks::BookmarkModel* bookmark_model,	157 bookmarks::BookmarkModel* bookmark_model,

160 TemplateURLService* template_url_service) {	158 TemplateURLService* template_url_service) {

161 // If cursor position is set and useful (not at either end of the

162 // string), allow the search string to be broken at cursor position.

163 // We do this by pretending there's a space where the cursor is.

164 if ((cursor_position != base::string16::npos) &&

165 (cursor_position < search_string.length()) &&

166 (cursor_position > 0)) {

167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

168 }

169 pre_filter_item_count_ = 0;	159 pre_filter_item_count_ = 0;

170 post_filter_item_count_ = 0;	160 post_filter_item_count_ = 0;

171 post_scoring_item_count_ = 0;	161 post_scoring_item_count_ = 0;

172 // The search string we receive may contain escaped characters. For reducing

173 // the index we need individual, lower-cased words, ignoring escapings. For

174 // the final filtering we need whitespace separated substrings possibly

175 // containing escaped characters.

176 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

177 base::string16 lower_unescaped_string =

178 net::UnescapeURLComponent(lower_raw_string,

179 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

181 // Extract individual 'words' (as opposed to 'terms'; see below) from the

182 // search string. When the user types "colspec=ID%20Mstone Release" we get

183 // four 'words': "colspec", "id", "mstone" and "release".

184 String16Vector lower_words(

185 String16VectorFromString16(lower_unescaped_string, false, nullptr));

186 ScoredHistoryMatches scored_items;

187	162

188 // Do nothing if we have indexed no words (probably because we've not been	163 // This list will contain the original search string and any other
Mark P 2016/09/15 22:55:54 I think we need to keep this block (except for the lower_words.empty() part of the test). I don't see search_term_cache_ being cleared if word_list_ is empty in your new code. Lavar Askew 2016/09/16 00:37:51 On 2016/09/15 22:55:54, Mark P wrote: > I think we need to keep this block (except for the lower_words.empty() part of > the test). I don't see search_term_cache_ being cleared if word_list_ is empty > in your new code. Done.
189 // initialized yet) or the search string has no words.	164 // string transformations.

190 if (word_list_.empty() \|\| lower_words.empty()) {	165 String16Vector search_strings;

191 search_term_cache_.clear(); // Invalidate the term cache.	166 search_strings.insert(search_strings.end(), original_search_string);
	Mark P 2016/09/15 22:55:54 nit: push_back is more idiomatic and easier to read. Ditto below. Lavar Askew 2016/09/16 00:37:51 On 2016/09/15 22:55:54, Mark P wrote: > nit: push_back is more idiomatic and easier to read. > Ditto below. Done.
192 return scored_items;	167 if ((cursor_position != base::string16::npos) &&

	168 (cursor_position < original_search_string.length()) &&

	169 (cursor_position > 0)) {

	170 // The original search_string broken at cursor position. This is

	171 // one type of transformation.

	172 base::string16 transformed_search_string(original_search_string);

	173 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

	174 search_strings.insert(search_strings.end(), transformed_search_string);

193 }	175 }

194	176

195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

196 // approach.

197 ResetSearchTermCache();	177 ResetSearchTermCache();

198	178

199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);	179 ScoredHistoryMatches scored_items;

	180 for (const base::string16& search_string : search_strings) {

	181 // The search string we receive may contain escaped characters. For reducing

	182 // the index we need individual, lower-cased words, ignoring escapings. For

	183 // the final filtering we need whitespace separated substrings possibly

	184 // containing escaped characters.

	185 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

	186 base::string16 lower_unescaped_string = net::UnescapeURLComponent(

	187 lower_raw_string,

	188 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

	189 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

200	190

201 // Trim the candidate pool if it is large. Note that we do not filter out	191 // Extract individual 'words' (as opposed to 'terms'; see below) from the

202 // items that do not contain the search terms as proper substrings -- doing	192 // search string. When the user types "colspec=ID%20Mstone Release" we get

203 // so is the performance-costly operation we are trying to avoid in order	193 // four 'words': "colspec", "id", "mstone" and "release".

204 // to maintain omnibox responsiveness.	194 String16Vector lower_words(

205 const size_t kItemsToScoreLimit = 500;	195 String16VectorFromString16(lower_unescaped_string, false, nullptr));

206 pre_filter_item_count_ = history_id_set.size();	196 if (!lower_words.empty()) {
	Mark P 2016/09/15 22:55:53 nit: Instead of indenting this huge block below, I suggest: if (lower_words.empty()) continue; Lavar Askew 2016/09/16 00:37:51 On 2016/09/15 22:55:53, Mark P wrote: > nit: Instead of indenting this huge block below, I suggest: > if (lower_words.empty()) > continue; Done.
207 // If we trim the results set we do not want to cache the results for next	197 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

208 // time as the user's ultimately desired result could easily be eliminated	198 // Trim the candidate pool if it is large. Note that we do not filter out

209 // in this early rough filter.	199 // items that do not contain the search terms as proper substrings --

210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);	200 // doing
	Mark P 2016/09/15 22:55:53 nit: formatting Lavar Askew 2016/09/16 00:37:51 On 2016/09/15 22:55:53, Mark P wrote: > nit: formatting That git cl format command causes more trouble than it solves. Done. Mark P 2016/09/16 23:18:06 On 2016/09/16 00:37:51, Lavar Askew wrote: > On 2016/09/15 22:55:53, Mark P wrote: > > nit: formatting > That git cl format command causes more trouble than it solves. I agree.
211 if (was_trimmed) {	201 // so is the performance-costly operation we are trying to avoid in order

212 HistoryIDVector history_ids;	202 // to maintain omnibox responsiveness.

213 std::copy(history_id_set.begin(), history_id_set.end(),	203 const size_t kItemsToScoreLimit = 500;

214 std::back_inserter(history_ids));	204 pre_filter_item_count_ = pre_filter_item_count_ + history_id_set.size();
	Mark P 2016/09/15 22:55:54 nit: prefer += Lavar Askew 2016/09/16 00:37:51 On 2016/09/15 22:55:54, Mark P wrote: > nit: prefer += Done.
215 // Trim down the set by sorting by typed-count, visit-count, and last	205 // If we trim the results set we do not want to cache the results for next

216 // visit.	206 // time as the user's ultimately desired result could easily be eliminated

217 HistoryItemFactorGreater	207 // in this early rough filter.

218 item_factor_functor(history_info_map_);	208 if (pre_filter_item_count_ > kItemsToScoreLimit) {

219 std::partial_sort(history_ids.begin(),	209 HistoryIDVector history_ids;

220 history_ids.begin() + kItemsToScoreLimit,	210 std::copy(history_id_set.begin(), history_id_set.end(),

221 history_ids.end(),	211 std::back_inserter(history_ids));

222 item_factor_functor);	212 // Trim down the set by sorting by typed-count, visit-count, and last

223 history_id_set.clear();	213 // visit.

224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,	214 HistoryItemFactorGreater item_factor_functor(history_info_map_);

225 std::inserter(history_id_set, history_id_set.end()));	215 std::partial_sort(history_ids.begin(),

226 post_filter_item_count_ = history_id_set.size();	216 history_ids.begin() + kItemsToScoreLimit,

	217 history_ids.end(), item_factor_functor);

	218 history_id_set.clear();

	219 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

	220 std::inserter(history_id_set, history_id_set.end()));

	221 post_filter_item_count_ =

	222 post_filter_item_count_ + history_id_set.size();

	223 }

	224 // Pass over all of the candidates filtering out any without a proper

	225 // substring match, inserting those which pass in order by score. Note

	226 // that
	Mark P 2016/09/15 22:55:53 nit: formatting here and in the below paragraph as well. Lavar Askew 2016/09/16 00:37:51 On 2016/09/15 22:55:53, Mark P wrote: > nit: formatting > here and in the below paragraph as well. Done.
	227 // in this step we are using the raw search string complete with escaped

	228 // URL elements. When the user has specifically typed something akin to

	229 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

	230 // specific substring appears in the URL or page title.

	231

	232 // We call these 'terms' (as opposed to 'words'; see above) as in this

	233 // case

	234 // we only want to break up the search string on 'true' whitespace rather

	235 // than

	236 // escaped whitespace. When the user types "colspec=ID%20Mstone Release"

	237 // we

	238 // get two 'terms': "colspec=id%20mstone" and "release".

	239 String16Vector lower_raw_terms =

	240 base::SplitString(lower_raw_string, base::kWhitespaceUTF16,

	241 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

	242 if (!lower_raw_terms.empty()) {

	243 ScoredHistoryMatches temp_scored_items =

	244 std::for_each(history_id_set.begin(), history_id_set.end(),

	245 AddHistoryMatch(bookmark_model, template_url_service,

	246 *this, lower_raw_string,

	247 lower_raw_terms, base::Time::Now()))

	248 .ScoredMatches();

	249 scored_items.insert(scored_items.end(), temp_scored_items.begin(),

	250 temp_scored_items.end());

	251 }

	252 // Select and sort only the top \|max_matches\| results.
	Mark P 2016/09/15 22:55:53 I think this selecting and sorting should go outside the foreach string loop. Also, the post_scoring_item_count_ setting should go with it. Lavar Askew 2016/09/16 00:37:51 On 2016/09/15 22:55:53, Mark P wrote: > I think this selecting and sorting should go outside the foreach string loop. > Also, the post_scoring_item_count_ setting should go with it. Done.
	253 if (scored_items.size() > max_matches) {

	254 std::partial_sort(

	255 scored_items.begin(), scored_items.begin() + max_matches,

	256 scored_items.end(), ScoredHistoryMatch::MatchScoreGreater);

	257 scored_items.resize(max_matches);

	258 } else {

	259 std::sort(scored_items.begin(), scored_items.end(),

	260 ScoredHistoryMatch::MatchScoreGreater);

	261 }

	262 post_scoring_item_count_ = scored_items.size();

	263 }

227 }	264 }

228	265

229 // Pass over all of the candidates filtering out any without a proper	266 if (pre_filter_item_count_ > post_filter_item_count_) {

230 // substring match, inserting those which pass in order by score. Note that

231 // in this step we are using the raw search string complete with escaped

232 // URL elements. When the user has specifically typed something akin to

233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

234 // specific substring appears in the URL or page title.

235

236 // We call these 'terms' (as opposed to 'words'; see above) as in this case

237 // we only want to break up the search string on 'true' whitespace rather than

238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we

239 // get two 'terms': "colspec=id%20mstone" and "release".

240 String16Vector lower_raw_terms = base::SplitString(

241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,

242 base::SPLIT_WANT_NONEMPTY);

243 if (lower_raw_terms.empty()) {

244 // Don't score matches when there are no terms to score against. (It's
Mark P 2016/09/15 22:55:54 Please keep this comment in the new code. Lavar Askew 2016/09/16 00:37:51 On 2016/09/15 22:55:54, Mark P wrote: > Please keep this comment in the new code. Done.
245 // possible that the word break iterater that extracts words to search

246 // for in the database allows some whitespace "words" whereas SplitString

247 // excludes a long list of whitespace.) One could write a scoring

248 // function that gives a reasonable order to matches when there

249 // are no terms (i.e., all the words are some form of whitespace),

250 // but this is such a rare edge case that it's not worth the time.

251 return scored_items;

252 }

253 scored_items =

254 std::for_each(

255 history_id_set.begin(), history_id_set.end(),

256 AddHistoryMatch(bookmark_model, template_url_service, *this,

257 lower_raw_string, lower_raw_terms,

258 base::Time::Now())).ScoredMatches();

259

260 // Select and sort only the top \|max_matches\| results.

261 if (scored_items.size() > max_matches) {

262 std::partial_sort(scored_items.begin(),

263 scored_items.begin() +

264 max_matches,

265 scored_items.end(),

266 ScoredHistoryMatch::MatchScoreGreater);

267 scored_items.resize(max_matches);

268 } else {

269 std::sort(scored_items.begin(), scored_items.end(),

270 ScoredHistoryMatch::MatchScoreGreater);

271 }

272 post_scoring_item_count_ = scored_items.size();

273

274 if (was_trimmed) {

275 search_term_cache_.clear(); // Invalidate the term cache.	267 search_term_cache_.clear(); // Invalidate the term cache.

276 } else {	268 } else {

277 // Remove any stale SearchTermCacheItems.	269 // Remove any stale SearchTermCacheItems.

278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();	270 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

279 cache_iter != search_term_cache_.end(); ) {	271 cache_iter != search_term_cache_.end(); ) {

280 if (!cache_iter->second.used_)	272 if (!cache_iter->second.used_)

281 search_term_cache_.erase(cache_iter++);	273 search_term_cache_.erase(cache_iter++);

282 else	274 else

283 ++cache_iter;	275 ++cache_iter;

284 }	276 }

(...skipping 959 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1244 return true;	1236 return true;

1245 }	1237 }

1246	1238

1247 // static	1239 // static

1248 bool URLIndexPrivateData::URLSchemeIsWhitelisted(	1240 bool URLIndexPrivateData::URLSchemeIsWhitelisted(

1249 const GURL& gurl,	1241 const GURL& gurl,

1250 const std::set<std::string>& whitelist) {	1242 const std::set<std::string>& whitelist) {

1251 return whitelist.find(gurl.scheme()) != whitelist.end();	1243 return whitelist.find(gurl.scheme()) != whitelist.end();

1252 }	1244 }

1253	1245

1254

1255 // SearchTermCacheItem ---------------------------------------------------------	1246 // SearchTermCacheItem ---------------------------------------------------------

1256	1247

1257 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(	1248 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(

1258 const WordIDSet& word_id_set,	1249 const WordIDSet& word_id_set,

1259 const HistoryIDSet& history_id_set)	1250 const HistoryIDSet& history_id_set)

1260 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {	1251 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {

1261 }	1252 }

1262	1253

1263 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {	1254 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {

1264 }	1255 }

(...skipping 89 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1354 // First cut: typed count, visit count, recency.	1345 // First cut: typed count, visit count, recency.

1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks	1346 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

1356 // recently visited (within the last 12/24 hours) as highly important. Get	1347 // recently visited (within the last 12/24 hours) as highly important. Get

1357 // input from mpearson.	1348 // input from mpearson.

1358 if (r1.typed_count() != r2.typed_count())	1349 if (r1.typed_count() != r2.typed_count())

1359 return (r1.typed_count() > r2.typed_count());	1350 return (r1.typed_count() > r2.typed_count());

1360 if (r1.visit_count() != r2.visit_count())	1351 if (r1.visit_count() != r2.visit_count())

1361 return (r1.visit_count() > r2.visit_count());	1352 return (r1.visit_count() > r2.visit_count());

1362 return (r1.last_visit() > r2.last_visit());	1353 return (r1.last_visit() > r2.last_visit());

1363 }	1354 }

OLD	NEW

« no previous file with comments | « components/omnibox/browser/url_index_private_data.h ('k') | no next file » | no next file with comments »