components/omnibox/browser/url_index_private_data.cc - Issue 2363463002: Generating autocomplete results with and without word breaks in the Omnibox.

Side by Side Diff: components/omnibox/browser/url_index_private_data.cc

Issue 2363463002: Generating autocomplete results with and without word breaks in the Omnibox. (Closed)

Patch Set: Resubmitting patch for closed code review with iss number 2363463002. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/url_index_private_data.h"	5 #include "components/omnibox/browser/url_index_private_data.h"

6	6

7 #include <stdint.h>	7 #include <stdint.h>

8	8

9 #include <functional>	9 #include <functional>

10 #include <iterator>	10 #include <iterator>

(...skipping 122 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
133 }	133 }

134	134

135 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {	135 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {

136 if (succeeded_)	136 if (succeeded_)

137 private_data_->UpdateRecentVisits(url_id_, recent_visits_);	137 private_data_->UpdateRecentVisits(url_id_, recent_visits_);

138 }	138 }

139	139

140 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {	140 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {

141 }	141 }

142	142

143

144 // URLIndexPrivateData ---------------------------------------------------------	143 // URLIndexPrivateData ---------------------------------------------------------

145	144

146 URLIndexPrivateData::URLIndexPrivateData()	145 URLIndexPrivateData::URLIndexPrivateData()

147 : restored_cache_version_(0),	146 : restored_cache_version_(0),

148 saved_cache_version_(kCurrentCacheFileVersion),	147 saved_cache_version_(kCurrentCacheFileVersion),

149 pre_filter_item_count_(0),	148 pre_filter_item_count_(0),

150 post_filter_item_count_(0),	149 post_filter_item_count_(0),

151 post_scoring_item_count_(0) {	150 post_scoring_item_count_(0) {

152 }	151 }

153	152

154 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(	153 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

155 base::string16 search_string,	154 base::string16 original_search_string,

156 size_t cursor_position,	155 size_t cursor_position,

157 size_t max_matches,	156 size_t max_matches,

158 bookmarks::BookmarkModel* bookmark_model,	157 bookmarks::BookmarkModel* bookmark_model,

159 TemplateURLService* template_url_service) {	158 TemplateURLService* template_url_service) {

160 pre_filter_item_count_ = 0;	159 pre_filter_item_count_ = 0;

161 post_filter_item_count_ = 0;	160 post_filter_item_count_ = 0;

162 post_scoring_item_count_ = 0;	161 post_scoring_item_count_ = 0;

163 // The search string we receive may contain escaped characters. For reducing

164 // the index we need individual, lower-cased words, ignoring escapings. For

165 // the final filtering we need whitespace separated substrings possibly

166 // containing escaped characters.

167 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

168 base::string16 lower_unescaped_string =

169 net::UnescapeURLComponent(lower_raw_string,

170 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

171 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

172	162

173 // Extract individual 'words' (as opposed to 'terms'; see below) from the	163 // This list will contain the original search string and any other string

174 // search string. When the user types "colspec=ID%20Mstone Release" we get	164 // transformations.

175 // four 'words': "colspec", "id", "mstone" and "release".	165 String16Vector search_strings;

176 String16Vector lower_words(	166 search_strings.push_back(original_search_string);

177 String16VectorFromString16(lower_unescaped_string, false, nullptr));	167 if ((cursor_position != base::string16::npos) &&

	168 (cursor_position < original_search_string.length()) &&

	169 (cursor_position > 0)) {

	170 // The original search_string broken at cursor position. This is one type of

	171 // transformation.

	172 base::string16 transformed_search_string(original_search_string);

	173 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

	174 search_strings.push_back(transformed_search_string);

	175 }

	176

178 ScoredHistoryMatches scored_items;	177 ScoredHistoryMatches scored_items;

179	178 // Invalidate the term cache and return if we have indexed no words (probably

180 // Do nothing if we have indexed no words (probably because we've not been	179 // because we've not been initialized yet).

181 // initialized yet) or the search string has no words.	180 if (word_list_.empty()) {

182 if (word_list_.empty() \|\| lower_words.empty()) {	181 search_term_cache_.clear();

183 search_term_cache_.clear(); // Invalidate the term cache.

184 return scored_items;	182 return scored_items;

185 }	183 }

186

187 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep	184 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

188 // approach.	185 // approach.

189 ResetSearchTermCache();	186 ResetSearchTermCache();

190	187

191 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);	188 for (const base::string16& search_string : search_strings) {

	189 // The search string we receive may contain escaped characters. For reducing

	190 // the index we need individual, lower-cased words, ignoring escapings. For

	191 // the final filtering we need whitespace separated substrings possibly

	192 // containing escaped characters.

	193 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

	194 base::string16 lower_unescaped_string = net::UnescapeURLComponent(

	195 lower_raw_string,

	196 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

	197 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

192	198

193 // Trim the candidate pool if it is large. Note that we do not filter out	199 // Extract individual 'words' (as opposed to 'terms'; see below) from the

194 // items that do not contain the search terms as proper substrings -- doing	200 // search string. When the user types "colspec=ID%20Mstone Release" we get

195 // so is the performance-costly operation we are trying to avoid in order	201 // four 'words': "colspec", "id", "mstone" and "release".

196 // to maintain omnibox responsiveness.	202 String16Vector lower_words(

197 const size_t kItemsToScoreLimit = 500;	203 String16VectorFromString16(lower_unescaped_string, false, nullptr));

198 pre_filter_item_count_ = history_id_set.size();	204 if (lower_words.empty())

199 // If we trim the results set we do not want to cache the results for next	205 continue;

200 // time as the user's ultimately desired result could easily be eliminated	206 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

201 // in this early rough filter.	207 // Trim the candidate pool if it is large. Note that we do not filter out

202 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);	208 // items that do not contain the search terms as proper substrings --

203 if (was_trimmed) {	209 // doing so is the performance-costly operation we are trying to avoid in

204 HistoryIDVector history_ids;	210 // order to maintain omnibox responsiveness.

205 std::copy(history_id_set.begin(), history_id_set.end(),	211 const size_t kItemsToScoreLimit = 500;

206 std::back_inserter(history_ids));	212 pre_filter_item_count_ += history_id_set.size();

207 // Trim down the set by sorting by typed-count, visit-count, and last	213 // If we trim the results set we do not want to cache the results for next

208 // visit.	214 // time as the user's ultimately desired result could easily be eliminated

209 HistoryItemFactorGreater	215 // in this early rough filter.

210 item_factor_functor(history_info_map_);	216 if (pre_filter_item_count_ > kItemsToScoreLimit) {

211 std::partial_sort(history_ids.begin(),	217 HistoryIDVector history_ids;

212 history_ids.begin() + kItemsToScoreLimit,	218 std::copy(history_id_set.begin(), history_id_set.end(),

213 history_ids.end(),	219 std::back_inserter(history_ids));

214 item_factor_functor);	220 // Trim down the set by sorting by typed-count, visit-count, and last

215 history_id_set.clear();	221 // visit.

216 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,	222 HistoryItemFactorGreater item_factor_functor(history_info_map_);

217 std::inserter(history_id_set, history_id_set.end()));	223 std::partial_sort(history_ids.begin(),

218 post_filter_item_count_ = history_id_set.size();	224 history_ids.begin() + kItemsToScoreLimit,

219 }	225 history_ids.end(), item_factor_functor);

	226 history_id_set.clear();

	227 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

	228 std::inserter(history_id_set, history_id_set.end()));

	229 post_filter_item_count_ = post_filter_item_count_ + history_id_set.size();

	230 }

	231 // Pass over all of the candidates filtering out any without a proper

	232 // substring match, inserting those which pass in order by score. Note that

	233 // in this step we are using the raw search string complete with escaped

	234 // URL elements. When the user has specifically typed something akin to

	235 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

	236 // specific substring appears in the URL or page title.

220	237

221 // Pass over all of the candidates filtering out any without a proper	238 // We call these 'terms' (as opposed to 'words'; see above) as in this case

222 // substring match, inserting those which pass in order by score. Note that	239 // we only want to break up the search string on 'true' whitespace rather

223 // in this step we are using the raw search string complete with escaped	240 // than escaped whitespace. When the user types "colspec=ID%20Mstone

224 // URL elements. When the user has specifically typed something akin to	241 // Release" we get two 'terms': "colspec=id%20mstone" and "release".

225 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that	242 String16Vector lower_raw_terms =

226 // specific substring appears in the URL or page title.	243 base::SplitString(lower_raw_string, base::kWhitespaceUTF16,

	244 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

227	245

228 // We call these 'terms' (as opposed to 'words'; see above) as in this case

229 // we only want to break up the search string on 'true' whitespace rather than

230 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we

231 // get two 'terms': "colspec=id%20mstone" and "release".

232 String16Vector lower_raw_terms = base::SplitString(

233 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,

234 base::SPLIT_WANT_NONEMPTY);

235

236 if (lower_raw_terms.empty()) {

237 // Don't score matches when there are no terms to score against. (It's	246 // Don't score matches when there are no terms to score against. (It's

238 // possible that the word break iterater that extracts words to search	247 // possible that the word break iterater that extracts words to search

239 // for in the database allows some whitespace "words" whereas SplitString	248 // for in the database allows some whitespace "words" whereas SplitString

240 // excludes a long list of whitespace.) One could write a scoring	249 // excludes a long list of whitespace.) One could write a scoring function

241 // function that gives a reasonable order to matches when there	250 // that gives a reasonable order to matches when there are no terms (i.e.,

242 // are no terms (i.e., all the words are some form of whitespace),	251 // all the words are some form of whitespace), but this is such a rare edge

243 // but this is such a rare edge case that it's not worth the time.	252 // case that it's not worth the time.

244 return scored_items;	253 if (lower_raw_terms.empty())

245 }	254 continue;

246	255 ScoredHistoryMatches temp_scored_items =

247 scored_items =	256 std::for_each(history_id_set.begin(), history_id_set.end(),

248 std::for_each(	257 AddHistoryMatch(bookmark_model, template_url_service,

249 history_id_set.begin(), history_id_set.end(),	258 *this, lower_raw_string, lower_raw_terms,

250 AddHistoryMatch(bookmark_model, template_url_service, *this,	259 base::Time::Now()))

251 lower_raw_string, lower_raw_terms, base::Time::Now()))	260 .ScoredMatches();

252 .ScoredMatches();	261 scored_items.insert(scored_items.end(), temp_scored_items.begin(),

253	262 temp_scored_items.end());

254 // If cursor position is set and useful (not at either end of the string),

255 // allow the search string to be broken at cursor position. We do this by

256 // pretending there's a space where the cursor is.

257 // This phenomena will be referred to as space ghosting.

258 // Going forward we perform a similar process for obtaining

259 // scored items as above. In fact all of the variable names are the same

260 // except that they are prepended with 'transformed_' reflecting that these

261 // variables are related to space ghosting.

262 if ((cursor_position != base::string16::npos) &&

263 (cursor_position < search_string.length()) && (cursor_position > 0)) {

264 // The original search_string broken at cursor position.

265 base::string16 transformed_search_string(search_string);

266 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

267

268 // The same as lower_raw_string and defined for the same reasons.

269 base::string16 transformed_lower_raw_string(

270 base::i18n::ToLower(transformed_search_string));

271 String16Vector transformed_lower_raw_terms =

272 base::SplitString(transformed_lower_raw_string, base::kWhitespaceUTF16,

273 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

274 if (!transformed_lower_raw_terms.empty()) {

275 base::string16 transormed_lower_unescaped_string =

276 net::UnescapeURLComponent(

277 transformed_lower_raw_string,

278 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

279 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

280 String16Vector transformed_lower_words(String16VectorFromString16(

281 transormed_lower_unescaped_string, false, nullptr));

282 HistoryIDSet transformed_history_id_set =

283 HistoryIDSetFromWords(transformed_lower_words);

284

285 if (history_id_set != transformed_history_id_set) {

286 pre_filter_item_count_ = history_id_set.size();

287 // Trim the candidate pool if it is large as above.

288 was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);

289 if (was_trimmed) {

290 HistoryIDVector transformed_history_ids;

291 std::copy(transformed_history_id_set.begin(),

292 transformed_history_id_set.end(),

293 std::back_inserter(transformed_history_ids));

294 // Trim down the set by sorting by typed-count, visit-count, and last

295 // visit.

296 HistoryItemFactorGreater item_factor_functor(history_info_map_);

297 std::partial_sort(

298 transformed_history_ids.begin(),

299 transformed_history_ids.begin() + kItemsToScoreLimit,

300 transformed_history_ids.end(), item_factor_functor);

301 transformed_history_id_set.clear();

302 std::copy(transformed_history_ids.begin(),

303 transformed_history_ids.begin() + kItemsToScoreLimit,

304 std::inserter(transformed_history_id_set,

305 transformed_history_id_set.end()));

306 }

307 history_id_set.insert(transformed_history_id_set.begin(),

308 transformed_history_id_set.end());

309 ScoredHistoryMatches transformed_scored_items =

310 std::for_each(

311 history_id_set.begin(), history_id_set.end(),

312 AddHistoryMatch(bookmark_model, template_url_service, *this,

313 transformed_lower_raw_string,

314 transformed_lower_raw_terms, base::Time::Now()))

315 .ScoredMatches();

316 scored_items.insert(scored_items.end(),

317 transformed_scored_items.begin(),

318 transformed_scored_items.end());

319 }

320 }

321 }	263 }

322 // Select and sort only the top \|max_matches\| results.	264 // Select and sort only the top \|max_matches\| results.

323 if (scored_items.size() > max_matches) {	265 if (scored_items.size() > max_matches) {

324 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches,	266 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches,

325 scored_items.end(),	267 scored_items.end(),

326 ScoredHistoryMatch::MatchScoreGreater);	268 ScoredHistoryMatch::MatchScoreGreater);

327 scored_items.resize(max_matches);	269 scored_items.resize(max_matches);

328 } else {	270 } else {

329 std::sort(scored_items.begin(), scored_items.end(),	271 std::sort(scored_items.begin(), scored_items.end(),

330 ScoredHistoryMatch::MatchScoreGreater);	272 ScoredHistoryMatch::MatchScoreGreater);

331 }	273 }

332 post_scoring_item_count_ = scored_items.size();	274 post_scoring_item_count_ = scored_items.size();

333	275 if (pre_filter_item_count_ > post_filter_item_count_) {

334 if (was_trimmed) {

335 search_term_cache_.clear(); // Invalidate the term cache.	276 search_term_cache_.clear(); // Invalidate the term cache.

336 } else {	277 } else {

337 // Remove any stale SearchTermCacheItems.	278 // Remove any stale SearchTermCacheItems.

338 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();	279 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

339 cache_iter != search_term_cache_.end(); ) {	280 cache_iter != search_term_cache_.end(); ) {

340 if (!cache_iter->second.used_)	281 if (!cache_iter->second.used_)

341 search_term_cache_.erase(cache_iter++);	282 search_term_cache_.erase(cache_iter++);

342 else	283 else

343 ++cache_iter;	284 ++cache_iter;

344 }	285 }

(...skipping 1068 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1413 // First cut: typed count, visit count, recency.	1354 // First cut: typed count, visit count, recency.

1414 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks	1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

1415 // recently visited (within the last 12/24 hours) as highly important. Get	1356 // recently visited (within the last 12/24 hours) as highly important. Get

1416 // input from mpearson.	1357 // input from mpearson.

1417 if (r1.typed_count() != r2.typed_count())	1358 if (r1.typed_count() != r2.typed_count())

1418 return (r1.typed_count() > r2.typed_count());	1359 return (r1.typed_count() > r2.typed_count());

1419 if (r1.visit_count() != r2.visit_count())	1360 if (r1.visit_count() != r2.visit_count())

1420 return (r1.visit_count() > r2.visit_count());	1361 return (r1.visit_count() > r2.visit_count());

1421 return (r1.last_visit() > r2.last_visit());	1362 return (r1.last_visit() > r2.last_visit());

1422 }	1363 }

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »