OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/omnibox/browser/url_index_private_data.h" | 5 #include "components/omnibox/browser/url_index_private_data.h" |
6 | 6 |
7 #include <stdint.h> | 7 #include <stdint.h> |
8 | 8 |
9 #include <functional> | 9 #include <functional> |
10 #include <iterator> | 10 #include <iterator> |
(...skipping 135 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
146 | 146 |
147 URLIndexPrivateData::URLIndexPrivateData() | 147 URLIndexPrivateData::URLIndexPrivateData() |
148 : restored_cache_version_(0), | 148 : restored_cache_version_(0), |
149 saved_cache_version_(kCurrentCacheFileVersion), | 149 saved_cache_version_(kCurrentCacheFileVersion), |
150 pre_filter_item_count_(0), | 150 pre_filter_item_count_(0), |
151 post_filter_item_count_(0), | 151 post_filter_item_count_(0), |
152 post_scoring_item_count_(0) { | 152 post_scoring_item_count_(0) { |
153 } | 153 } |
154 | 154 |
155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | 155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( |
156 base::string16 search_string, | 156 base::string16 original_search_string, |
157 size_t cursor_position, | 157 size_t cursor_position, |
158 size_t max_matches, | 158 size_t max_matches, |
159 bookmarks::BookmarkModel* bookmark_model, | 159 bookmarks::BookmarkModel* bookmark_model, |
160 TemplateURLService* template_url_service) { | 160 TemplateURLService* template_url_service) { |
161 // If cursor position is set and useful (not at either end of the | |
162 // string), allow the search string to be broken at cursor position. | |
163 // We do this by pretending there's a space where the cursor is. | |
164 if ((cursor_position != base::string16::npos) && | |
165 (cursor_position < search_string.length()) && | |
166 (cursor_position > 0)) { | |
167 search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); | |
168 } | |
169 pre_filter_item_count_ = 0; | 161 pre_filter_item_count_ = 0; |
170 post_filter_item_count_ = 0; | 162 post_filter_item_count_ = 0; |
171 post_scoring_item_count_ = 0; | 163 post_scoring_item_count_ = 0; |
172 // The search string we receive may contain escaped characters. For reducing | 164 |
173 // the index we need individual, lower-cased words, ignoring escapings. For | 165 // This list will contain the original search string and any other string |
174 // the final filtering we need whitespace separated substrings possibly | 166 // transformations. |
175 // containing escaped characters. | 167 String16Vector search_strings; |
176 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); | 168 search_strings.push_back(original_search_string); |
177 base::string16 lower_unescaped_string = | 169 if ((cursor_position != base::string16::npos) && |
178 net::UnescapeURLComponent(lower_raw_string, | 170 (cursor_position < original_search_string.length()) && |
179 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | | 171 (cursor_position > 0)) { |
180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); | 172 // The original search_string broken at cursor position. This is one type of |
181 // Extract individual 'words' (as opposed to 'terms'; see below) from the | 173 // transformation. |
182 // search string. When the user types "colspec=ID%20Mstone Release" we get | 174 base::string16 transformed_search_string(original_search_string); |
183 // four 'words': "colspec", "id", "mstone" and "release". | 175 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); |
184 String16Vector lower_words( | 176 search_strings.push_back(transformed_search_string); |
185 String16VectorFromString16(lower_unescaped_string, false, nullptr)); | 177 } |
178 | |
186 ScoredHistoryMatches scored_items; | 179 ScoredHistoryMatches scored_items; |
187 | 180 // Invalidate the term cache and return if we have indexed no words (probably |
188 // Do nothing if we have indexed no words (probably because we've not been | 181 // because we've not been initialized yet). |
189 // initialized yet) or the search string has no words. | 182 if (word_list_.empty()) { |
190 if (word_list_.empty() || lower_words.empty()) { | 183 search_term_cache_.clear(); |
191 search_term_cache_.clear(); // Invalidate the term cache. | |
192 return scored_items; | 184 return scored_items; |
193 } | 185 } |
194 | |
195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | 186 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep |
196 // approach. | 187 // approach. |
197 ResetSearchTermCache(); | 188 ResetSearchTermCache(); |
198 | 189 |
199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | 190 for (const base::string16& search_string : search_strings) { |
191 // The search string we receive may contain escaped characters. For reducing | |
192 // the index we need individual, lower-cased words, ignoring escapings. For | |
193 // the final filtering we need whitespace separated substrings possibly | |
194 // containing escaped characters. | |
195 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); | |
196 base::string16 lower_unescaped_string = net::UnescapeURLComponent( | |
197 lower_raw_string, | |
198 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | | |
199 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); | |
200 | 200 |
201 // Trim the candidate pool if it is large. Note that we do not filter out | 201 // Extract individual 'words' (as opposed to 'terms'; see below) from the |
202 // items that do not contain the search terms as proper substrings -- doing | 202 // search string. When the user types "colspec=ID%20Mstone Release" we get |
203 // so is the performance-costly operation we are trying to avoid in order | 203 // four 'words': "colspec", "id", "mstone" and "release". |
204 // to maintain omnibox responsiveness. | 204 String16Vector lower_words( |
205 const size_t kItemsToScoreLimit = 500; | 205 String16VectorFromString16(lower_unescaped_string, false, nullptr)); |
206 pre_filter_item_count_ = history_id_set.size(); | 206 if (lower_words.empty()) |
207 // If we trim the results set we do not want to cache the results for next | 207 continue; |
208 // time as the user's ultimately desired result could easily be eliminated | 208 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); |
209 // in this early rough filter. | 209 // Trim the candidate pool if it is large. Note that we do not filter out |
210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | 210 // items that do not contain the search terms as proper substrings -- |
211 if (was_trimmed) { | 211 // doing so is the performance-costly operation we are trying to avoid in |
212 HistoryIDVector history_ids; | 212 // order to maintain omnibox responsiveness. |
213 std::copy(history_id_set.begin(), history_id_set.end(), | 213 const size_t kItemsToScoreLimit = 500; |
214 std::back_inserter(history_ids)); | 214 pre_filter_item_count_ += history_id_set.size(); |
215 // Trim down the set by sorting by typed-count, visit-count, and last | 215 // If we trim the results set we do not want to cache the results for next |
216 // visit. | 216 // time as the user's ultimately desired result could easily be eliminated |
217 HistoryItemFactorGreater | 217 // in this early rough filter. |
218 item_factor_functor(history_info_map_); | 218 if (pre_filter_item_count_ > kItemsToScoreLimit) { |
219 std::partial_sort(history_ids.begin(), | 219 HistoryIDVector history_ids; |
220 history_ids.begin() + kItemsToScoreLimit, | 220 std::copy(history_id_set.begin(), history_id_set.end(), |
221 history_ids.end(), | 221 std::back_inserter(history_ids)); |
222 item_factor_functor); | 222 // Trim down the set by sorting by typed-count, visit-count, and last |
223 history_id_set.clear(); | 223 // visit. |
224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | 224 HistoryItemFactorGreater item_factor_functor(history_info_map_); |
225 std::inserter(history_id_set, history_id_set.end())); | 225 std::partial_sort(history_ids.begin(), |
226 post_filter_item_count_ = history_id_set.size(); | 226 history_ids.begin() + kItemsToScoreLimit, |
227 } | 227 history_ids.end(), item_factor_functor); |
228 history_id_set.clear(); | |
Mark P
2016/09/23 18:02:13
Should we be clearing post_filter_item_count_ here
Lavar Askew
2016/09/23 19:07:48
I disagree because history_id_set is reinitialized
Mark P
2016/09/23 19:32:14
Ah, good point.
| |
229 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
230 std::inserter(history_id_set, history_id_set.end())); | |
231 post_filter_item_count_ += history_id_set.size(); | |
232 } else { | |
233 post_filter_item_count_ += pre_filter_item_count_; | |
234 } | |
235 // Pass over all of the candidates filtering out any without a proper | |
236 // substring match, inserting those which pass in order by score. Note that | |
237 // in this step we are using the raw search string complete with escaped | |
238 // URL elements. When the user has specifically typed something akin to | |
239 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | |
240 // specific substring appears in the URL or page title. | |
228 | 241 |
229 // Pass over all of the candidates filtering out any without a proper | 242 // We call these 'terms' (as opposed to 'words'; see above) as in this case |
230 // substring match, inserting those which pass in order by score. Note that | 243 // we only want to break up the search string on 'true' whitespace rather |
231 // in this step we are using the raw search string complete with escaped | 244 // than escaped whitespace. When the user types "colspec=ID%20Mstone |
232 // URL elements. When the user has specifically typed something akin to | 245 // Release" we get two 'terms': "colspec=id%20mstone" and "release". |
233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | 246 String16Vector lower_raw_terms = |
234 // specific substring appears in the URL or page title. | 247 base::SplitString(lower_raw_string, base::kWhitespaceUTF16, |
248 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY); | |
235 | 249 |
236 // We call these 'terms' (as opposed to 'words'; see above) as in this case | |
237 // we only want to break up the search string on 'true' whitespace rather than | |
238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | |
239 // get two 'terms': "colspec=id%20mstone" and "release". | |
240 String16Vector lower_raw_terms = base::SplitString( | |
241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, | |
242 base::SPLIT_WANT_NONEMPTY); | |
243 if (lower_raw_terms.empty()) { | |
244 // Don't score matches when there are no terms to score against. (It's | 250 // Don't score matches when there are no terms to score against. (It's |
245 // possible that the word break iterater that extracts words to search | 251 // possible that the word break iterater that extracts words to search |
246 // for in the database allows some whitespace "words" whereas SplitString | 252 // for in the database allows some whitespace "words" whereas SplitString |
247 // excludes a long list of whitespace.) One could write a scoring | 253 // excludes a long list of whitespace.) One could write a scoring function |
248 // function that gives a reasonable order to matches when there | 254 // that gives a reasonable order to matches when there are no terms (i.e., |
249 // are no terms (i.e., all the words are some form of whitespace), | 255 // all the words are some form of whitespace), but this is such a rare edge |
250 // but this is such a rare edge case that it's not worth the time. | 256 // case that it's not worth the time. |
251 return scored_items; | 257 if (lower_raw_terms.empty()) |
258 continue; | |
259 ScoredHistoryMatches temp_scored_items = | |
260 std::for_each(history_id_set.begin(), history_id_set.end(), | |
261 AddHistoryMatch(bookmark_model, template_url_service, | |
262 *this, lower_raw_string, lower_raw_terms, | |
263 base::Time::Now())) | |
264 .ScoredMatches(); | |
265 scored_items.insert(scored_items.end(), temp_scored_items.begin(), | |
266 temp_scored_items.end()); | |
252 } | 267 } |
253 scored_items = | |
254 std::for_each( | |
255 history_id_set.begin(), history_id_set.end(), | |
256 AddHistoryMatch(bookmark_model, template_url_service, *this, | |
257 lower_raw_string, lower_raw_terms, | |
258 base::Time::Now())).ScoredMatches(); | |
259 | |
260 // Select and sort only the top |max_matches| results. | 268 // Select and sort only the top |max_matches| results. |
261 if (scored_items.size() > max_matches) { | 269 if (scored_items.size() > max_matches) { |
262 std::partial_sort(scored_items.begin(), | 270 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches, |
263 scored_items.begin() + | |
264 max_matches, | |
265 scored_items.end(), | 271 scored_items.end(), |
266 ScoredHistoryMatch::MatchScoreGreater); | 272 ScoredHistoryMatch::MatchScoreGreater); |
267 scored_items.resize(max_matches); | 273 scored_items.resize(max_matches); |
268 } else { | 274 } else { |
269 std::sort(scored_items.begin(), scored_items.end(), | 275 std::sort(scored_items.begin(), scored_items.end(), |
270 ScoredHistoryMatch::MatchScoreGreater); | 276 ScoredHistoryMatch::MatchScoreGreater); |
271 } | 277 } |
272 post_scoring_item_count_ = scored_items.size(); | 278 post_scoring_item_count_ = scored_items.size(); |
273 | 279 if (pre_filter_item_count_ > post_filter_item_count_) { |
274 if (was_trimmed) { | |
275 search_term_cache_.clear(); // Invalidate the term cache. | 280 search_term_cache_.clear(); // Invalidate the term cache. |
276 } else { | 281 } else { |
277 // Remove any stale SearchTermCacheItems. | 282 // Remove any stale SearchTermCacheItems. |
278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | 283 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); |
279 cache_iter != search_term_cache_.end(); ) { | 284 cache_iter != search_term_cache_.end(); ) { |
280 if (!cache_iter->second.used_) | 285 if (!cache_iter->second.used_) |
281 search_term_cache_.erase(cache_iter++); | 286 search_term_cache_.erase(cache_iter++); |
282 else | 287 else |
283 ++cache_iter; | 288 ++cache_iter; |
284 } | 289 } |
(...skipping 1069 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1354 // First cut: typed count, visit count, recency. | 1359 // First cut: typed count, visit count, recency. |
1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | 1360 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
1356 // recently visited (within the last 12/24 hours) as highly important. Get | 1361 // recently visited (within the last 12/24 hours) as highly important. Get |
1357 // input from mpearson. | 1362 // input from mpearson. |
1358 if (r1.typed_count() != r2.typed_count()) | 1363 if (r1.typed_count() != r2.typed_count()) |
1359 return (r1.typed_count() > r2.typed_count()); | 1364 return (r1.typed_count() > r2.typed_count()); |
1360 if (r1.visit_count() != r2.visit_count()) | 1365 if (r1.visit_count() != r2.visit_count()) |
1361 return (r1.visit_count() > r2.visit_count()); | 1366 return (r1.visit_count() > r2.visit_count()); |
1362 return (r1.last_visit() > r2.last_visit()); | 1367 return (r1.last_visit() > r2.last_visit()); |
1363 } | 1368 } |
OLD | NEW |