OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/omnibox/browser/url_index_private_data.h" | 5 #include "components/omnibox/browser/url_index_private_data.h" |
6 | 6 |
7 #include <stdint.h> | 7 #include <stdint.h> |
8 | 8 |
9 #include <functional> | 9 #include <functional> |
10 #include <iterator> | 10 #include <iterator> |
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
134 } | 134 } |
135 | 135 |
136 | 136 |
137 // URLIndexPrivateData --------------------------------------------------------- | 137 // URLIndexPrivateData --------------------------------------------------------- |
138 | 138 |
139 // static | 139 // static |
140 constexpr size_t URLIndexPrivateData::kMaxVisitsToStoreInCache; | 140 constexpr size_t URLIndexPrivateData::kMaxVisitsToStoreInCache; |
141 | 141 |
142 URLIndexPrivateData::URLIndexPrivateData() | 142 URLIndexPrivateData::URLIndexPrivateData() |
143 : restored_cache_version_(0), | 143 : restored_cache_version_(0), |
144 saved_cache_version_(kCurrentCacheFileVersion), | 144 saved_cache_version_(kCurrentCacheFileVersion) {} |
145 pre_filter_item_count_(0), | |
146 post_filter_item_count_(0), | |
147 post_scoring_item_count_(0) { | |
148 } | |
149 | 145 |
150 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | 146 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( |
151 base::string16 original_search_string, | 147 base::string16 original_search_string, |
152 size_t cursor_position, | 148 size_t cursor_position, |
153 size_t max_matches, | 149 size_t max_matches, |
154 bookmarks::BookmarkModel* bookmark_model, | 150 bookmarks::BookmarkModel* bookmark_model, |
155 TemplateURLService* template_url_service) { | 151 TemplateURLService* template_url_service) { |
156 pre_filter_item_count_ = 0; | |
157 post_filter_item_count_ = 0; | |
158 post_scoring_item_count_ = 0; | |
159 | |
160 // This list will contain the original search string and any other string | 152 // This list will contain the original search string and any other string |
161 // transformations. | 153 // transformations. |
162 String16Vector search_strings; | 154 String16Vector search_strings; |
163 search_strings.push_back(original_search_string); | 155 search_strings.push_back(original_search_string); |
164 if ((cursor_position != base::string16::npos) && | 156 if ((cursor_position != base::string16::npos) && |
165 (cursor_position < original_search_string.length()) && | 157 (cursor_position < original_search_string.length()) && |
166 (cursor_position > 0)) { | 158 (cursor_position > 0)) { |
167 // The original search_string broken at cursor position. This is one type of | 159 // The original search_string broken at cursor position. This is one type of |
168 // transformation. | 160 // transformation. |
169 base::string16 transformed_search_string(original_search_string); | 161 base::string16 transformed_search_string(original_search_string); |
170 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); | 162 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); |
171 search_strings.push_back(transformed_search_string); | 163 search_strings.push_back(transformed_search_string); |
172 } | 164 } |
173 | 165 |
174 ScoredHistoryMatches scored_items; | 166 ScoredHistoryMatches scored_items; |
175 // Invalidate the term cache and return if we have indexed no words (probably | 167 // Invalidate the term cache and return if we have indexed no words (probably |
176 // because we've not been initialized yet). | 168 // because we've not been initialized yet). |
177 if (word_list_.empty()) { | 169 if (word_list_.empty()) { |
178 search_term_cache_.clear(); | 170 search_term_cache_.clear(); |
179 return scored_items; | 171 return scored_items; |
180 } | 172 } |
181 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | 173 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep |
182 // approach. | 174 // approach. |
183 ResetSearchTermCache(); | 175 ResetSearchTermCache(); |
184 | 176 |
177 bool history_ids_were_trimmed = false; | |
185 for (const base::string16& search_string : search_strings) { | 178 for (const base::string16& search_string : search_strings) { |
186 // The search string we receive may contain escaped characters. For reducing | 179 // The search string we receive may contain escaped characters. For reducing |
187 // the index we need individual, lower-cased words, ignoring escapings. For | 180 // the index we need individual, lower-cased words, ignoring escapings. For |
188 // the final filtering we need whitespace separated substrings possibly | 181 // the final filtering we need whitespace separated substrings possibly |
189 // containing escaped characters. | 182 // containing escaped characters. |
190 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); | 183 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); |
191 base::string16 lower_unescaped_string = net::UnescapeURLComponent( | 184 base::string16 lower_unescaped_string = net::UnescapeURLComponent( |
192 lower_raw_string, | 185 lower_raw_string, |
193 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | | 186 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | |
194 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); | 187 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); |
195 | 188 |
196 // Extract individual 'words' (as opposed to 'terms'; see comment in | 189 // Extract individual 'words' (as opposed to 'terms'; see comment in |
197 // HistoryIdSetToScoredMatches()) from the search string. When the user | 190 // HistoryIdSetToScoredMatches()) from the search string. When the user |
198 // types "colspec=ID%20Mstone Release" we get four 'words': "colspec", "id", | 191 // types "colspec=ID%20Mstone Release" we get four 'words': "colspec", "id", |
199 // "mstone" and "release". | 192 // "mstone" and "release". |
200 String16Vector lower_words( | 193 String16Vector lower_words( |
201 String16VectorFromString16(lower_unescaped_string, false, nullptr)); | 194 String16VectorFromString16(lower_unescaped_string, false, nullptr)); |
202 if (lower_words.empty()) | 195 if (lower_words.empty()) |
203 continue; | 196 continue; |
204 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | 197 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); |
205 pre_filter_item_count_ += history_id_set.size(); | 198 history_ids_were_trimmed = |
Peter Kasting
2017/02/23 01:04:45
Nit: Use |=
dyaroshev
2017/02/23 01:41:26
This would be a bit operation - is it ok?
I think
Peter Kasting
2017/02/23 01:46:06
As you note, A |= B is equivalent to A = A | B. B
dyaroshev
2017/02/24 01:27:03
Done.
| |
206 // Trim the candidate pool if it is large. Note that we do not filter out | 199 TrimHistoryIdsPool(&history_id_set) || history_ids_were_trimmed; |
207 // items that do not contain the search terms as proper substrings -- | 200 |
208 // doing so is the performance-costly operation we are trying to avoid in | |
209 // order to maintain omnibox responsiveness. | |
210 const size_t kItemsToScoreLimit = 500; | |
211 if (history_id_set.size() > kItemsToScoreLimit) { | |
212 HistoryIDVector history_ids; | |
213 std::copy(history_id_set.begin(), history_id_set.end(), | |
214 std::back_inserter(history_ids)); | |
215 // Trim down the set by sorting by typed-count, visit-count, and last | |
216 // visit. | |
217 HistoryItemFactorGreater item_factor_functor(history_info_map_); | |
218 std::partial_sort(history_ids.begin(), | |
219 history_ids.begin() + kItemsToScoreLimit, | |
220 history_ids.end(), item_factor_functor); | |
221 history_id_set.clear(); | |
222 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
223 std::inserter(history_id_set, history_id_set.end())); | |
224 post_filter_item_count_ += history_id_set.size(); | |
225 } else { | |
226 post_filter_item_count_ += pre_filter_item_count_; | |
227 } | |
228 ScoredHistoryMatches temp_scored_items; | 201 ScoredHistoryMatches temp_scored_items; |
229 HistoryIdSetToScoredMatches(history_id_set, lower_raw_string, | 202 HistoryIdSetToScoredMatches(history_id_set, lower_raw_string, |
230 template_url_service, bookmark_model, | 203 template_url_service, bookmark_model, |
231 &temp_scored_items); | 204 &temp_scored_items); |
232 scored_items.insert(scored_items.end(), temp_scored_items.begin(), | 205 scored_items.insert(scored_items.end(), temp_scored_items.begin(), |
233 temp_scored_items.end()); | 206 temp_scored_items.end()); |
234 } | 207 } |
235 // Select and sort only the top |max_matches| results. | 208 // Select and sort only the top |max_matches| results. |
236 if (scored_items.size() > max_matches) { | 209 if (scored_items.size() > max_matches) { |
237 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches, | 210 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches, |
238 scored_items.end(), | 211 scored_items.end(), |
239 ScoredHistoryMatch::MatchScoreGreater); | 212 ScoredHistoryMatch::MatchScoreGreater); |
240 scored_items.resize(max_matches); | 213 scored_items.resize(max_matches); |
241 } else { | 214 } else { |
242 std::sort(scored_items.begin(), scored_items.end(), | 215 std::sort(scored_items.begin(), scored_items.end(), |
243 ScoredHistoryMatch::MatchScoreGreater); | 216 ScoredHistoryMatch::MatchScoreGreater); |
244 } | 217 } |
245 post_scoring_item_count_ = scored_items.size(); | 218 if (history_ids_were_trimmed) { |
246 if (pre_filter_item_count_ > post_filter_item_count_) { | |
247 // If we trim the results set we do not want to cache the results for next | 219 // If we trim the results set we do not want to cache the results for next |
248 // time as the user's ultimately desired result could easily be eliminated | 220 // time as the user's ultimately desired result could easily be eliminated |
249 // in the early rough filter. | 221 // in the early rough filter. |
250 search_term_cache_.clear(); | 222 search_term_cache_.clear(); |
251 } else { | 223 } else { |
252 // Remove any stale SearchTermCacheItems. | 224 // Remove any stale SearchTermCacheItems. |
253 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | 225 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); |
254 cache_iter != search_term_cache_.end(); ) { | 226 cache_iter != search_term_cache_.end(); ) { |
255 if (!cache_iter->second.used_) | 227 if (!cache_iter->second.used_) |
256 cache_iter = search_term_cache_.erase(cache_iter); | 228 cache_iter = search_term_cache_.erase(cache_iter); |
(...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
465 data_copy->available_words_ = available_words_; | 437 data_copy->available_words_ = available_words_; |
466 data_copy->word_map_ = word_map_; | 438 data_copy->word_map_ = word_map_; |
467 data_copy->char_word_map_ = char_word_map_; | 439 data_copy->char_word_map_ = char_word_map_; |
468 data_copy->word_id_history_map_ = word_id_history_map_; | 440 data_copy->word_id_history_map_ = word_id_history_map_; |
469 data_copy->history_id_word_map_ = history_id_word_map_; | 441 data_copy->history_id_word_map_ = history_id_word_map_; |
470 data_copy->history_info_map_ = history_info_map_; | 442 data_copy->history_info_map_ = history_info_map_; |
471 data_copy->word_starts_map_ = word_starts_map_; | 443 data_copy->word_starts_map_ = word_starts_map_; |
472 return data_copy; | 444 return data_copy; |
473 // Not copied: | 445 // Not copied: |
474 // search_term_cache_ | 446 // search_term_cache_ |
475 // pre_filter_item_count_ | |
476 // post_filter_item_count_ | |
477 // post_scoring_item_count_ | |
478 } | 447 } |
479 | 448 |
480 bool URLIndexPrivateData::Empty() const { | 449 bool URLIndexPrivateData::Empty() const { |
481 return history_info_map_.empty(); | 450 return history_info_map_.empty(); |
482 } | 451 } |
483 | 452 |
484 void URLIndexPrivateData::Clear() { | 453 void URLIndexPrivateData::Clear() { |
485 last_time_rebuilt_from_history_ = base::Time(); | 454 last_time_rebuilt_from_history_ = base::Time(); |
486 word_list_.clear(); | 455 word_list_.clear(); |
487 available_words_.clear(); | 456 available_words_.clear(); |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
521 history_id_set.swap(term_history_set); | 490 history_id_set.swap(term_history_set); |
522 } else { | 491 } else { |
523 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>( | 492 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>( |
524 history_id_set, term_history_set); | 493 history_id_set, term_history_set); |
525 history_id_set.swap(new_history_id_set); | 494 history_id_set.swap(new_history_id_set); |
526 } | 495 } |
527 } | 496 } |
528 return history_id_set; | 497 return history_id_set; |
529 } | 498 } |
530 | 499 |
500 bool URLIndexPrivateData::TrimHistoryIdsPool( | |
501 HistoryIDSet* history_id_set) const { | |
502 constexpr size_t kItemsToScoreLimit = 500; | |
503 if (history_id_set->size() < kItemsToScoreLimit) | |
504 return false; | |
505 | |
506 HistoryIDVector history_ids(history_id_set->begin(), history_id_set->end()); | |
507 | |
508 // Trim down the set by sorting by typed-count, visit-count, and last | |
509 // visit. | |
510 auto new_end = history_ids.begin() + kItemsToScoreLimit; | |
511 HistoryItemFactorGreater item_factor_functor(history_info_map_); | |
512 | |
513 std::nth_element(history_ids.begin(), new_end, history_ids.end(), | |
514 item_factor_functor); | |
515 history_ids.erase(new_end, history_ids.end()); | |
516 | |
517 *history_id_set = {history_ids.begin(), history_ids.end()}; | |
518 return true; | |
519 } | |
520 | |
531 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm( | 521 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm( |
532 const base::string16& term) { | 522 const base::string16& term) { |
533 if (term.empty()) | 523 if (term.empty()) |
534 return HistoryIDSet(); | 524 return HistoryIDSet(); |
535 | 525 |
536 // TODO(mrossetti): Consider optimizing for very common terms such as | 526 // TODO(mrossetti): Consider optimizing for very common terms such as |
537 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently | 527 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently |
538 // occuring words in the user's searches. | 528 // occuring words in the user's searches. |
539 | 529 |
540 size_t term_length = term.length(); | 530 size_t term_length = term.length(); |
(...skipping 840 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1381 // First cut: typed count, visit count, recency. | 1371 // First cut: typed count, visit count, recency. |
1382 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | 1372 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
1383 // recently visited (within the last 12/24 hours) as highly important. Get | 1373 // recently visited (within the last 12/24 hours) as highly important. Get |
1384 // input from mpearson. | 1374 // input from mpearson. |
1385 if (r1.typed_count() != r2.typed_count()) | 1375 if (r1.typed_count() != r2.typed_count()) |
1386 return (r1.typed_count() > r2.typed_count()); | 1376 return (r1.typed_count() > r2.typed_count()); |
1387 if (r1.visit_count() != r2.visit_count()) | 1377 if (r1.visit_count() != r2.visit_count()) |
1388 return (r1.visit_count() > r2.visit_count()); | 1378 return (r1.visit_count() > r2.visit_count()); |
1389 return (r1.last_visit() > r2.last_visit()); | 1379 return (r1.last_visit() > r2.last_visit()); |
1390 } | 1380 } |
OLD | NEW |