components/omnibox/browser/url_index_private_data.cc - Issue 2364553004: Updated and refactored URLIndexPrivateData::HistoryItemsForTerms to handle searching the history fo…

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2364553004: Updated and refactored URLIndexPrivateData::HistoryItemsForTerms to handle searching the history fo… (Closed)

Patch Set: Fixed bug to properly increment post_filter_item_count_. Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/url_index_private_data.h"	5 #include "components/omnibox/browser/url_index_private_data.h"

6	6

7 #include <stdint.h>	7 #include <stdint.h>

8	8

9 #include <functional>	9 #include <functional>

10 #include <iterator>	10 #include <iterator>

(...skipping 135 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
146	146

147 URLIndexPrivateData::URLIndexPrivateData()	147 URLIndexPrivateData::URLIndexPrivateData()

148 : restored_cache_version_(0),	148 : restored_cache_version_(0),

149 saved_cache_version_(kCurrentCacheFileVersion),	149 saved_cache_version_(kCurrentCacheFileVersion),

150 pre_filter_item_count_(0),	150 pre_filter_item_count_(0),

151 post_filter_item_count_(0),	151 post_filter_item_count_(0),

152 post_scoring_item_count_(0) {	152 post_scoring_item_count_(0) {

153 }	153 }

154	154

155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(	155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

156 base::string16 search_string,	156 base::string16 original_search_string,

157 size_t cursor_position,	157 size_t cursor_position,

158 size_t max_matches,	158 size_t max_matches,

159 bookmarks::BookmarkModel* bookmark_model,	159 bookmarks::BookmarkModel* bookmark_model,

160 TemplateURLService* template_url_service) {	160 TemplateURLService* template_url_service) {

161 // If cursor position is set and useful (not at either end of the

162 // string), allow the search string to be broken at cursor position.

163 // We do this by pretending there's a space where the cursor is.

164 if ((cursor_position != base::string16::npos) &&

165 (cursor_position < search_string.length()) &&

166 (cursor_position > 0)) {

167 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

168 }

169 pre_filter_item_count_ = 0;	161 pre_filter_item_count_ = 0;

170 post_filter_item_count_ = 0;	162 post_filter_item_count_ = 0;

171 post_scoring_item_count_ = 0;	163 post_scoring_item_count_ = 0;

172 // The search string we receive may contain escaped characters. For reducing	164

173 // the index we need individual, lower-cased words, ignoring escapings. For	165 // This list will contain the original search string and any other string

174 // the final filtering we need whitespace separated substrings possibly	166 // transformations.

175 // containing escaped characters.	167 String16Vector search_strings;

176 base::string16 lower_raw_string(base::i18n::ToLower(search_string));	168 search_strings.push_back(original_search_string);

177 base::string16 lower_unescaped_string =	169 if ((cursor_position != base::string16::npos) &&

178 net::UnescapeURLComponent(lower_raw_string,	170 (cursor_position < original_search_string.length()) &&

179 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|	171 (cursor_position > 0)) {

180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);	172 // The original search_string broken at cursor position. This is one type of

181 // Extract individual 'words' (as opposed to 'terms'; see below) from the	173 // transformation.

182 // search string. When the user types "colspec=ID%20Mstone Release" we get	174 base::string16 transformed_search_string(original_search_string);

183 // four 'words': "colspec", "id", "mstone" and "release".	175 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

184 String16Vector lower_words(	176 search_strings.push_back(transformed_search_string);

185 String16VectorFromString16(lower_unescaped_string, false, nullptr));	177 }

	178

186 ScoredHistoryMatches scored_items;	179 ScoredHistoryMatches scored_items;

187	180 // Invalidate the term cache and return if we have indexed no words (probably

188 // Do nothing if we have indexed no words (probably because we've not been	181 // because we've not been initialized yet).

189 // initialized yet) or the search string has no words.	182 if (word_list_.empty()) {

190 if (word_list_.empty() \|\| lower_words.empty()) {	183 search_term_cache_.clear();

191 search_term_cache_.clear(); // Invalidate the term cache.

192 return scored_items;	184 return scored_items;

193 }	185 }

194

195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep	186 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

196 // approach.	187 // approach.

197 ResetSearchTermCache();	188 ResetSearchTermCache();

198	189

199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);	190 for (const base::string16& search_string : search_strings) {

	191 // The search string we receive may contain escaped characters. For reducing

	192 // the index we need individual, lower-cased words, ignoring escapings. For

	193 // the final filtering we need whitespace separated substrings possibly

	194 // containing escaped characters.

	195 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

	196 base::string16 lower_unescaped_string = net::UnescapeURLComponent(

	197 lower_raw_string,

	198 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

	199 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

200	200

201 // Trim the candidate pool if it is large. Note that we do not filter out	201 // Extract individual 'words' (as opposed to 'terms'; see below) from the

202 // items that do not contain the search terms as proper substrings -- doing	202 // search string. When the user types "colspec=ID%20Mstone Release" we get

203 // so is the performance-costly operation we are trying to avoid in order	203 // four 'words': "colspec", "id", "mstone" and "release".

204 // to maintain omnibox responsiveness.	204 String16Vector lower_words(

205 const size_t kItemsToScoreLimit = 500;	205 String16VectorFromString16(lower_unescaped_string, false, nullptr));

206 pre_filter_item_count_ = history_id_set.size();	206 if (lower_words.empty())

207 // If we trim the results set we do not want to cache the results for next	207 continue;

208 // time as the user's ultimately desired result could easily be eliminated	208 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

209 // in this early rough filter.	209 // Trim the candidate pool if it is large. Note that we do not filter out

210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);	210 // items that do not contain the search terms as proper substrings --

211 if (was_trimmed) {	211 // doing so is the performance-costly operation we are trying to avoid in

212 HistoryIDVector history_ids;	212 // order to maintain omnibox responsiveness.

213 std::copy(history_id_set.begin(), history_id_set.end(),	213 const size_t kItemsToScoreLimit = 500;

214 std::back_inserter(history_ids));	214 pre_filter_item_count_ += history_id_set.size();

215 // Trim down the set by sorting by typed-count, visit-count, and last	215 // If we trim the results set we do not want to cache the results for next

216 // visit.	216 // time as the user's ultimately desired result could easily be eliminated

217 HistoryItemFactorGreater	217 // in this early rough filter.

218 item_factor_functor(history_info_map_);	218 if (pre_filter_item_count_ > kItemsToScoreLimit) {

219 std::partial_sort(history_ids.begin(),	219 HistoryIDVector history_ids;

220 history_ids.begin() + kItemsToScoreLimit,	220 std::copy(history_id_set.begin(), history_id_set.end(),

221 history_ids.end(),	221 std::back_inserter(history_ids));

222 item_factor_functor);	222 // Trim down the set by sorting by typed-count, visit-count, and last

223 history_id_set.clear();	223 // visit.

224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,	224 HistoryItemFactorGreater item_factor_functor(history_info_map_);

225 std::inserter(history_id_set, history_id_set.end()));	225 std::partial_sort(history_ids.begin(),

226 post_filter_item_count_ = history_id_set.size();	226 history_ids.begin() + kItemsToScoreLimit,

227 }	227 history_ids.end(), item_factor_functor);

	228 history_id_set.clear();
	Mark P 2016/09/23 18:02:13 Should we be clearing post_filter_item_count_ here Should we be clearing post_filter_item_count_ here? After all, if we're on the second pass through the list of search strings, the history_id_set will include both the first pass and second pass items that we then filter. Incrementing post_filter_item_count_ to the size of the final filtered set would end up double-counting the first pass's items. Lavar Askew 2016/09/23 19:07:48 I disagree because history_id_set is reinitialized Show quoted text On 2016/09/23 18:02:13, Mark P wrote: > Should we be clearing post_filter_item_count_ here? > After all, if we're on the second pass through the list of search strings, the > history_id_set will include both the first pass and second pass items that we > then filter. Incrementing post_filter_item_count_ to the size of the final > filtered set would end up double-counting the first pass's items. I disagree because history_id_set is reinitialized each time through the for loop. Mark P 2016/09/23 19:32:14 Ah, good point. Show quoted text On 2016/09/23 19:07:48, Lavar Askew wrote: > On 2016/09/23 18:02:13, Mark P wrote: > > Should we be clearing post_filter_item_count_ here? > > After all, if we're on the second pass through the list of search strings, the > > history_id_set will include both the first pass and second pass items that we > > then filter. Incrementing post_filter_item_count_ to the size of the final > > filtered set would end up double-counting the first pass's items. > > I disagree because history_id_set is reinitialized each time through the for > loop. Ah, good point.
	229 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

	230 std::inserter(history_id_set, history_id_set.end()));

	231 post_filter_item_count_ += history_id_set.size();

	232 } else {

	233 post_filter_item_count_ += pre_filter_item_count_;

	234 }

	235 // Pass over all of the candidates filtering out any without a proper

	236 // substring match, inserting those which pass in order by score. Note that

	237 // in this step we are using the raw search string complete with escaped

	238 // URL elements. When the user has specifically typed something akin to

	239 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

	240 // specific substring appears in the URL or page title.

228	241

229 // Pass over all of the candidates filtering out any without a proper	242 // We call these 'terms' (as opposed to 'words'; see above) as in this case

230 // substring match, inserting those which pass in order by score. Note that	243 // we only want to break up the search string on 'true' whitespace rather

231 // in this step we are using the raw search string complete with escaped	244 // than escaped whitespace. When the user types "colspec=ID%20Mstone

232 // URL elements. When the user has specifically typed something akin to	245 // Release" we get two 'terms': "colspec=id%20mstone" and "release".

233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that	246 String16Vector lower_raw_terms =

234 // specific substring appears in the URL or page title.	247 base::SplitString(lower_raw_string, base::kWhitespaceUTF16,

	248 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

235	249

236 // We call these 'terms' (as opposed to 'words'; see above) as in this case

237 // we only want to break up the search string on 'true' whitespace rather than

238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we

239 // get two 'terms': "colspec=id%20mstone" and "release".

240 String16Vector lower_raw_terms = base::SplitString(

241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,

242 base::SPLIT_WANT_NONEMPTY);

243 if (lower_raw_terms.empty()) {

244 // Don't score matches when there are no terms to score against. (It's	250 // Don't score matches when there are no terms to score against. (It's

245 // possible that the word break iterater that extracts words to search	251 // possible that the word break iterater that extracts words to search

246 // for in the database allows some whitespace "words" whereas SplitString	252 // for in the database allows some whitespace "words" whereas SplitString

247 // excludes a long list of whitespace.) One could write a scoring	253 // excludes a long list of whitespace.) One could write a scoring function

248 // function that gives a reasonable order to matches when there	254 // that gives a reasonable order to matches when there are no terms (i.e.,

249 // are no terms (i.e., all the words are some form of whitespace),	255 // all the words are some form of whitespace), but this is such a rare edge

250 // but this is such a rare edge case that it's not worth the time.	256 // case that it's not worth the time.

251 return scored_items;	257 if (lower_raw_terms.empty())

	258 continue;

	259 ScoredHistoryMatches temp_scored_items =

	260 std::for_each(history_id_set.begin(), history_id_set.end(),

	261 AddHistoryMatch(bookmark_model, template_url_service,

	262 *this, lower_raw_string, lower_raw_terms,

	263 base::Time::Now()))

	264 .ScoredMatches();

	265 scored_items.insert(scored_items.end(), temp_scored_items.begin(),

	266 temp_scored_items.end());

252 }	267 }

253 scored_items =

254 std::for_each(

255 history_id_set.begin(), history_id_set.end(),

256 AddHistoryMatch(bookmark_model, template_url_service, *this,

257 lower_raw_string, lower_raw_terms,

258 base::Time::Now())).ScoredMatches();

259

260 // Select and sort only the top \|max_matches\| results.	268 // Select and sort only the top \|max_matches\| results.

261 if (scored_items.size() > max_matches) {	269 if (scored_items.size() > max_matches) {

262 std::partial_sort(scored_items.begin(),	270 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches,

263 scored_items.begin() +

264 max_matches,

265 scored_items.end(),	271 scored_items.end(),

266 ScoredHistoryMatch::MatchScoreGreater);	272 ScoredHistoryMatch::MatchScoreGreater);

267 scored_items.resize(max_matches);	273 scored_items.resize(max_matches);

268 } else {	274 } else {

269 std::sort(scored_items.begin(), scored_items.end(),	275 std::sort(scored_items.begin(), scored_items.end(),

270 ScoredHistoryMatch::MatchScoreGreater);	276 ScoredHistoryMatch::MatchScoreGreater);

271 }	277 }

272 post_scoring_item_count_ = scored_items.size();	278 post_scoring_item_count_ = scored_items.size();

273	279 if (pre_filter_item_count_ > post_filter_item_count_) {

274 if (was_trimmed) {

275 search_term_cache_.clear(); // Invalidate the term cache.	280 search_term_cache_.clear(); // Invalidate the term cache.

276 } else {	281 } else {

277 // Remove any stale SearchTermCacheItems.	282 // Remove any stale SearchTermCacheItems.

278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();	283 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

279 cache_iter != search_term_cache_.end(); ) {	284 cache_iter != search_term_cache_.end(); ) {

280 if (!cache_iter->second.used_)	285 if (!cache_iter->second.used_)

281 search_term_cache_.erase(cache_iter++);	286 search_term_cache_.erase(cache_iter++);

282 else	287 else

283 ++cache_iter;	288 ++cache_iter;

284 }	289 }

(...skipping 1069 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1354 // First cut: typed count, visit count, recency.	1359 // First cut: typed count, visit count, recency.

1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks	1360 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

1356 // recently visited (within the last 12/24 hours) as highly important. Get	1361 // recently visited (within the last 12/24 hours) as highly important. Get

1357 // input from mpearson.	1362 // input from mpearson.

1358 if (r1.typed_count() != r2.typed_count())	1363 if (r1.typed_count() != r2.typed_count())

1359 return (r1.typed_count() > r2.typed_count());	1364 return (r1.typed_count() > r2.typed_count());

1360 if (r1.visit_count() != r2.visit_count())	1365 if (r1.visit_count() != r2.visit_count())

1361 return (r1.visit_count() > r2.visit_count());	1366 return (r1.visit_count() > r2.visit_count());

1362 return (r1.last_visit() > r2.last_visit());	1367 return (r1.last_visit() > r2.last_visit());

1363 }	1368 }

OLD	NEW

« no previous file with comments | « components/omnibox/browser/history_quick_provider_unittest.cc ('k') | no next file » | no next file with comments »