OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/omnibox/browser/url_index_private_data.h" | 5 #include "components/omnibox/browser/url_index_private_data.h" |
6 | 6 |
7 #include <stdint.h> | 7 #include <stdint.h> |
8 | 8 |
9 #include <functional> | 9 #include <functional> |
10 #include <iterator> | 10 #include <iterator> |
(...skipping 134 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
145 // URLIndexPrivateData --------------------------------------------------------- | 145 // URLIndexPrivateData --------------------------------------------------------- |
146 | 146 |
147 URLIndexPrivateData::URLIndexPrivateData() | 147 URLIndexPrivateData::URLIndexPrivateData() |
148 : restored_cache_version_(0), | 148 : restored_cache_version_(0), |
149 saved_cache_version_(kCurrentCacheFileVersion), | 149 saved_cache_version_(kCurrentCacheFileVersion), |
150 pre_filter_item_count_(0), | 150 pre_filter_item_count_(0), |
151 post_filter_item_count_(0), | 151 post_filter_item_count_(0), |
152 post_scoring_item_count_(0) { | 152 post_scoring_item_count_(0) { |
153 } | 153 } |
154 | 154 |
155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | 155 HistoryIDSet URLIndexPrivateData::HistoryItemsForWords( |
156 base::string16 search_string, | 156 base::string16 search_string) { |
157 size_t cursor_position, | 157 |
158 size_t max_matches, | 158 HistoryIDSet history_id_set; |
Mark P
2016/08/10 17:46:19
This variable isn't needed; simply return HistoryI
Lavar Askew
2016/08/18 03:07:51
Done.
| |
159 bookmarks::BookmarkModel* bookmark_model, | 159 |
160 TemplateURLService* template_url_service) { | |
161 // If cursor position is set and useful (not at either end of the | |
162 // string), allow the search string to be broken at cursor position. | |
163 // We do this by pretending there's a space where the cursor is. | |
164 if ((cursor_position != base::string16::npos) && | |
165 (cursor_position < search_string.length()) && | |
166 (cursor_position > 0)) { | |
167 search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); | |
168 } | |
169 pre_filter_item_count_ = 0; | |
170 post_filter_item_count_ = 0; | |
171 post_scoring_item_count_ = 0; | |
172 // The search string we receive may contain escaped characters. For reducing | 160 // The search string we receive may contain escaped characters. For reducing |
173 // the index we need individual, lower-cased words, ignoring escapings. For | 161 // the index we need individual, lower-cased words, ignoring escapings. For |
174 // the final filtering we need whitespace separated substrings possibly | 162 // the final filtering we need whitespace separated substrings possibly |
175 // containing escaped characters. | 163 // containing escaped characters. |
176 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); | 164 base::string16 lower_raw_string( |
165 base::i18n::ToLower(search_string)); | |
177 base::string16 lower_unescaped_string = | 166 base::string16 lower_unescaped_string = |
178 net::UnescapeURLComponent(lower_raw_string, | 167 net::UnescapeURLComponent(lower_raw_string, |
179 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | | 168 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | |
180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); | 169 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); |
181 // Extract individual 'words' (as opposed to 'terms'; see below) from the | 170 // Extract individual 'words' (as opposed to 'terms'; see below) from the |
182 // search string. When the user types "colspec=ID%20Mstone Release" we get | 171 // search string. When the user types "colspec=ID%20Mstone Release" we get |
183 // four 'words': "colspec", "id", "mstone" and "release". | 172 // four 'words': "colspec", "id", "mstone" and "release". |
184 String16Vector lower_words( | 173 String16Vector lower_words( |
185 String16VectorFromString16(lower_unescaped_string, false, nullptr)); | 174 String16VectorFromString16( |
186 ScoredHistoryMatches scored_items; | 175 lower_unescaped_string, false, nullptr)); |
187 | 176 |
188 // Do nothing if we have indexed no words (probably because we've not been | 177 // Do nothing if we have indexed no words (probably because we've not been |
189 // initialized yet) or the search string has no words. | 178 // initialized yet) or the search string has no words. |
190 if (word_list_.empty() || lower_words.empty()) { | 179 if (word_list_.empty() || lower_words.empty()) { |
191 search_term_cache_.clear(); // Invalidate the term cache. | 180 search_term_cache_.clear(); // Invalidate the term cache. |
192 return scored_items; | 181 return history_id_set; |
193 } | 182 } |
194 | 183 |
195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | 184 history_id_set = |
196 // approach. | 185 HistoryIDSetFromWords(lower_words); |
197 ResetSearchTermCache(); | |
198 | 186 |
199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | 187 return history_id_set; |
188 } | |
200 | 189 |
201 // Trim the candidate pool if it is large. Note that we do not filter out | 190 ScoredHistoryMatches URLIndexPrivateData::GetScoredItemsForSearchString ( |
202 // items that do not contain the search terms as proper substrings -- doing | 191 base::string16 search_string, |
203 // so is the performance-costly operation we are trying to avoid in order | 192 HistoryIDSet history_id_set, |
204 // to maintain omnibox responsiveness. | 193 size_t max_matches, |
205 const size_t kItemsToScoreLimit = 500; | 194 bookmarks::BookmarkModel* bookmark_model, |
206 pre_filter_item_count_ = history_id_set.size(); | 195 TemplateURLService* template_url_service) { |
207 // If we trim the results set we do not want to cache the results for next | 196 ScoredHistoryMatches scored_items; |
208 // time as the user's ultimately desired result could easily be eliminated | |
209 // in this early rough filter. | |
210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
211 if (was_trimmed) { | |
212 HistoryIDVector history_ids; | |
213 std::copy(history_id_set.begin(), history_id_set.end(), | |
214 std::back_inserter(history_ids)); | |
215 // Trim down the set by sorting by typed-count, visit-count, and last | |
216 // visit. | |
217 HistoryItemFactorGreater | |
218 item_factor_functor(history_info_map_); | |
219 std::partial_sort(history_ids.begin(), | |
220 history_ids.begin() + kItemsToScoreLimit, | |
221 history_ids.end(), | |
222 item_factor_functor); | |
223 history_id_set.clear(); | |
224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
225 std::inserter(history_id_set, history_id_set.end())); | |
226 post_filter_item_count_ = history_id_set.size(); | |
227 } | |
228 | 197 |
229 // Pass over all of the candidates filtering out any without a proper | 198 // Pass over all of the candidates filtering out any without a proper |
230 // substring match, inserting those which pass in order by score. Note that | 199 // substring match, inserting those which pass in order by score. Note that |
231 // in this step we are using the raw search string complete with escaped | 200 // in this step we are using the raw search string complete with escaped |
232 // URL elements. When the user has specifically typed something akin to | 201 // URL elements. When the user has specifically typed something akin to |
233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | 202 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that |
234 // specific substring appears in the URL or page title. | 203 // specific substring appears in the URL or page title. |
235 | 204 |
236 // We call these 'terms' (as opposed to 'words'; see above) as in this case | 205 // We call these 'terms' (as opposed to 'words'; see above) as in this case |
237 // we only want to break up the search string on 'true' whitespace rather than | 206 // we only want to break up the search string on 'true' whitespace rather than |
238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | 207 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we |
239 // get two 'terms': "colspec=id%20mstone" and "release". | 208 // get two 'terms': "colspec=id%20mstone" and "release". |
209 base::string16 lower_raw_string( | |
210 base::i18n::ToLower(search_string)); | |
240 String16Vector lower_raw_terms = base::SplitString( | 211 String16Vector lower_raw_terms = base::SplitString( |
241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, | 212 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, |
242 base::SPLIT_WANT_NONEMPTY); | 213 base::SPLIT_WANT_NONEMPTY); |
214 | |
243 if (lower_raw_terms.empty()) { | 215 if (lower_raw_terms.empty()) { |
244 // Don't score matches when there are no terms to score against. (It's | 216 // Don't score matches when there are no terms to score against. (It's |
245 // possible that the word break iterater that extracts words to search | 217 // possible that the word break iterater that extracts words to search |
246 // for in the database allows some whitespace "words" whereas SplitString | 218 // for in the database allows some whitespace "words" whereas SplitString |
247 // excludes a long list of whitespace.) One could write a scoring | 219 // excludes a long list of whitespace.) One could write a scoring |
248 // function that gives a reasonable order to matches when there | 220 // function that gives a reasonable order to matches when there |
249 // are no terms (i.e., all the words are some form of whitespace), | 221 // are no terms (i.e., all the words are some form of whitespace), |
250 // but this is such a rare edge case that it's not worth the time. | 222 // but this is such a rare edge case that it's not worth the time. |
251 return scored_items; | 223 return scored_items; |
252 } | 224 } |
253 scored_items = | 225 scored_items = |
254 std::for_each( | 226 std::for_each( |
255 history_id_set.begin(), history_id_set.end(), | 227 history_id_set.begin(), history_id_set.end(), |
256 AddHistoryMatch(bookmark_model, template_url_service, *this, | 228 AddHistoryMatch(bookmark_model, template_url_service, *this, |
257 lower_raw_string, lower_raw_terms, | 229 lower_raw_string, lower_raw_terms, |
258 base::Time::Now())).ScoredMatches(); | 230 base::Time::Now())).ScoredMatches(); |
259 | 231 |
260 // Select and sort only the top |max_matches| results. | 232 // Select and sort only the top |max_matches| results. |
261 if (scored_items.size() > max_matches) { | 233 if (scored_items.size() > max_matches) { |
262 std::partial_sort(scored_items.begin(), | 234 std::partial_sort(scored_items.begin(), |
263 scored_items.begin() + | 235 scored_items.begin() + |
264 max_matches, | 236 max_matches, |
265 scored_items.end(), | 237 scored_items.end(), |
266 ScoredHistoryMatch::MatchScoreGreater); | 238 ScoredHistoryMatch::MatchScoreGreater); |
267 scored_items.resize(max_matches); | 239 scored_items.resize(max_matches); |
268 } else { | 240 } |
241 else { | |
269 std::sort(scored_items.begin(), scored_items.end(), | 242 std::sort(scored_items.begin(), scored_items.end(), |
270 ScoredHistoryMatch::MatchScoreGreater); | 243 ScoredHistoryMatch::MatchScoreGreater); |
271 } | 244 } |
272 post_scoring_item_count_ = scored_items.size(); | |
273 | |
274 if (was_trimmed) { | |
275 search_term_cache_.clear(); // Invalidate the term cache. | |
276 } else { | |
277 // Remove any stale SearchTermCacheItems. | |
278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
279 cache_iter != search_term_cache_.end(); ) { | |
280 if (!cache_iter->second.used_) | |
281 search_term_cache_.erase(cache_iter++); | |
282 else | |
283 ++cache_iter; | |
284 } | |
285 } | |
286 | 245 |
287 return scored_items; | 246 return scored_items; |
288 } | 247 } |
289 | 248 |
249 | |
250 bool URLIndexPrivateData::TrimCandidatePool (HistoryIDSet history_id_set) { | |
251 | |
252 // Trim the candidate pool if it is large. Note that we do not filter out | |
Mark P
2016/08/10 17:46:19
I assume this whole block was copied with no modif
Lavar Askew
2016/08/18 03:07:51
Yes, you are correct.
Done.
| |
253 // items that do not contain the search terms as proper substrings -- doing | |
254 // so is the performance-costly operation we are trying to avoid in order | |
255 // to maintain omnibox responsiveness. | |
256 const size_t kItemsToScoreLimit = 500; | |
257 pre_filter_item_count_ = history_id_set.size(); | |
258 // If we trim the results set we do not want to cache the results for next | |
259 // time as the user's ultimately desired result could easily be eliminated | |
260 // in this early rough filter. | |
261 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
262 if (was_trimmed) { | |
263 HistoryIDVector history_ids; | |
264 std::copy(history_id_set.begin(), history_id_set.end(), | |
265 std::back_inserter(history_ids)); | |
266 // Trim down the set by sorting by typed-count, visit-count, and last | |
267 // visit. | |
268 HistoryItemFactorGreater | |
269 item_factor_functor(history_info_map_); | |
270 std::partial_sort(history_ids.begin(), | |
271 history_ids.begin() + kItemsToScoreLimit, | |
272 history_ids.end(), | |
273 item_factor_functor); | |
274 history_id_set.clear(); | |
275 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
276 std::inserter(history_id_set, history_id_set.end())); | |
277 post_filter_item_count_ = history_id_set.size(); | |
278 } | |
279 | |
280 return was_trimmed; | |
281 } | |
282 | |
283 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | |
284 base::string16 search_string, | |
285 size_t cursor_position, | |
286 size_t max_matches, | |
287 bookmarks::BookmarkModel* bookmark_model, | |
288 TemplateURLService* template_url_service) { | |
289 pre_filter_item_count_ = 0; | |
290 post_filter_item_count_ = 0; | |
291 post_scoring_item_count_ = 0; | |
292 | |
293 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | |
294 // approach. | |
295 ResetSearchTermCache(); | |
296 | |
297 HistoryIDSet history_id_set = HistoryItemsForWords(search_string); | |
298 | |
299 bool history_id_set_without_word_break_was_trimmed = | |
300 TrimCandidatePool(history_id_set); | |
301 | |
302 ScoredHistoryMatches scored_items_without_word_break = | |
303 GetScoredItemsForSearchString( | |
304 search_string, | |
305 history_id_set, | |
306 max_matches, | |
307 bookmark_model, | |
308 template_url_service); | |
309 | |
310 ScoredHistoryMatches all_scored_items; | |
311 ScoredHistoryMatches scored_items_with_word_break; | |
312 | |
313 bool history_id_set_with_word_break_was_trimmed = false; | |
314 | |
315 // If cursor position is set and useful (not at either end of the | |
316 // string), allow the search string to be broken at cursor position. | |
317 // We do this by pretending there's a space where the cursor is. | |
318 if ((cursor_position != base::string16::npos) && | |
319 (cursor_position < search_string.length()) && | |
320 (cursor_position > 0)) { | |
321 base::string16 search_string_with_word_break = search_string; | |
322 HistoryIDSet history_id_set_with_word_break; | |
323 | |
324 base::string16 word_break = | |
325 base::ASCIIToUTF16(" "); | |
326 search_string_with_word_break.insert(cursor_position, word_break); | |
327 | |
328 // Add to history_id_set the ids that are related to the original | |
329 // search string, but with the word break. | |
330 history_id_set_with_word_break = | |
331 HistoryItemsForWords(search_string_with_word_break); | |
Mark P
2016/08/10 17:46:19
This might do unnecessary work if the new search s
| |
332 | |
333 history_id_set_with_word_break_was_trimmed = | |
334 TrimCandidatePool (history_id_set_with_word_break); | |
335 | |
336 scored_items_with_word_break = GetScoredItemsForSearchString( | |
337 search_string_with_word_break, | |
338 history_id_set_with_word_break, | |
339 max_matches, | |
340 bookmark_model, | |
341 template_url_service); | |
342 } | |
343 | |
344 if (history_id_set_without_word_break_was_trimmed || | |
345 history_id_set_with_word_break_was_trimmed) { | |
346 search_term_cache_.clear(); // Invalidate the term cache. | |
347 } | |
348 else { | |
349 // Remove any stale SearchTermCacheItems. | |
350 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
351 cache_iter != search_term_cache_.end(); ) { | |
352 if (!cache_iter->second.used_) { | |
353 search_term_cache_.erase(cache_iter++); | |
354 } | |
355 else { | |
356 ++cache_iter; | |
357 } | |
358 } | |
359 } | |
360 | |
361 // all_scored_items represents the unification the | |
362 //ScoredHistoryMatches for the search string with and without the word break. | |
Mark P
2016/08/10 17:46:19
The code below can be made more efficient and easi
Lavar Askew
2016/08/18 03:07:51
Done.
| |
363 if (scored_items_without_word_break.size() > 0 && | |
364 scored_items_with_word_break.size() > 0) { | |
365 post_scoring_item_count_ = | |
366 scored_items_without_word_break.size() | |
367 + scored_items_with_word_break.size(); | |
368 | |
369 all_scored_items.reserve(post_scoring_item_count_); | |
370 all_scored_items.insert(all_scored_items.end(), | |
371 scored_items_without_word_break.begin(), | |
372 scored_items_without_word_break.end()); | |
373 all_scored_items.insert(all_scored_items.end(), | |
374 scored_items_with_word_break.begin(), | |
375 scored_items_with_word_break.end()); | |
376 } | |
377 else if (scored_items_without_word_break.size() == 0 && | |
378 scored_items_with_word_break.size() > 0) { | |
379 all_scored_items = scored_items_with_word_break; | |
380 } | |
381 else if (scored_items_without_word_break.size() > 0 && | |
382 scored_items_with_word_break.size() == 0) { | |
383 all_scored_items = scored_items_without_word_break; | |
384 } | |
385 | |
Mark P
2016/08/10 17:46:19
I could imagine situations where the items overlap
Lavar Askew
2016/08/18 03:07:51
Done.
| |
386 return all_scored_items; | |
Mark P
2016/08/10 17:46:19
Also, the number of items we're supposed to return
Lavar Askew
2016/08/18 03:07:52
Done.
| |
387 } | |
388 | |
290 bool URLIndexPrivateData::UpdateURL( | 389 bool URLIndexPrivateData::UpdateURL( |
291 history::HistoryService* history_service, | 390 history::HistoryService* history_service, |
292 const history::URLRow& row, | 391 const history::URLRow& row, |
293 const std::set<std::string>& scheme_whitelist, | 392 const std::set<std::string>& scheme_whitelist, |
294 base::CancelableTaskTracker* tracker) { | 393 base::CancelableTaskTracker* tracker) { |
295 // The row may or may not already be in our index. If it is not already | 394 // The row may or may not already be in our index. If it is not already |
296 // indexed and it qualifies then it gets indexed. If it is already | 395 // indexed and it qualifies then it gets indexed. If it is already |
297 // indexed and still qualifies then it gets updated, otherwise it | 396 // indexed and still qualifies then it gets updated, otherwise it |
298 // is deleted from the index. | 397 // is deleted from the index. |
299 bool row_was_updated = false; | 398 bool row_was_updated = false; |
(...skipping 1054 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1354 // First cut: typed count, visit count, recency. | 1453 // First cut: typed count, visit count, recency. |
1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | 1454 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
1356 // recently visited (within the last 12/24 hours) as highly important. Get | 1455 // recently visited (within the last 12/24 hours) as highly important. Get |
1357 // input from mpearson. | 1456 // input from mpearson. |
1358 if (r1.typed_count() != r2.typed_count()) | 1457 if (r1.typed_count() != r2.typed_count()) |
1359 return (r1.typed_count() > r2.typed_count()); | 1458 return (r1.typed_count() > r2.typed_count()); |
1360 if (r1.visit_count() != r2.visit_count()) | 1459 if (r1.visit_count() != r2.visit_count()) |
1361 return (r1.visit_count() > r2.visit_count()); | 1460 return (r1.visit_count() > r2.visit_count()); |
1362 return (r1.last_visit() > r2.last_visit()); | 1461 return (r1.last_visit() > r2.last_visit()); |
1363 } | 1462 } |
OLD | NEW |