OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/omnibox/browser/url_index_private_data.h" | 5 #include "components/omnibox/browser/url_index_private_data.h" |
6 | 6 |
7 #include <stdint.h> | 7 #include <stdint.h> |
8 | 8 |
9 #include <functional> | 9 #include <functional> |
10 #include <iterator> | 10 #include <iterator> |
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
133 } | 133 } |
134 | 134 |
135 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() { | 135 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() { |
136 if (succeeded_) | 136 if (succeeded_) |
137 private_data_->UpdateRecentVisits(url_id_, recent_visits_); | 137 private_data_->UpdateRecentVisits(url_id_, recent_visits_); |
138 } | 138 } |
139 | 139 |
140 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() { | 140 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() { |
141 } | 141 } |
142 | 142 |
143 | |
144 // URLIndexPrivateData --------------------------------------------------------- | 143 // URLIndexPrivateData --------------------------------------------------------- |
145 | 144 |
146 URLIndexPrivateData::URLIndexPrivateData() | 145 URLIndexPrivateData::URLIndexPrivateData() |
147 : restored_cache_version_(0), | 146 : restored_cache_version_(0), |
148 saved_cache_version_(kCurrentCacheFileVersion), | 147 saved_cache_version_(kCurrentCacheFileVersion), |
149 pre_filter_item_count_(0), | 148 pre_filter_item_count_(0), |
150 post_filter_item_count_(0), | 149 post_filter_item_count_(0), |
151 post_scoring_item_count_(0) { | 150 post_scoring_item_count_(0) { |
152 } | 151 } |
153 | 152 |
154 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | 153 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( |
155 base::string16 search_string, | 154 base::string16 original_search_string, |
156 size_t cursor_position, | 155 size_t cursor_position, |
157 size_t max_matches, | 156 size_t max_matches, |
158 bookmarks::BookmarkModel* bookmark_model, | 157 bookmarks::BookmarkModel* bookmark_model, |
159 TemplateURLService* template_url_service) { | 158 TemplateURLService* template_url_service) { |
160 pre_filter_item_count_ = 0; | 159 pre_filter_item_count_ = 0; |
161 post_filter_item_count_ = 0; | 160 post_filter_item_count_ = 0; |
162 post_scoring_item_count_ = 0; | 161 post_scoring_item_count_ = 0; |
163 // The search string we receive may contain escaped characters. For reducing | |
164 // the index we need individual, lower-cased words, ignoring escapings. For | |
165 // the final filtering we need whitespace separated substrings possibly | |
166 // containing escaped characters. | |
167 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); | |
168 base::string16 lower_unescaped_string = | |
169 net::UnescapeURLComponent(lower_raw_string, | |
170 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | | |
171 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); | |
172 | 162 |
173 // Extract individual 'words' (as opposed to 'terms'; see below) from the | 163 // This list will contain the original search string and any other string |
174 // search string. When the user types "colspec=ID%20Mstone Release" we get | 164 // transformations. |
175 // four 'words': "colspec", "id", "mstone" and "release". | 165 String16Vector search_strings; |
176 String16Vector lower_words( | 166 search_strings.push_back(original_search_string); |
177 String16VectorFromString16(lower_unescaped_string, false, nullptr)); | 167 if ((cursor_position != base::string16::npos) && |
| 168 (cursor_position < original_search_string.length()) && |
| 169 (cursor_position > 0)) { |
| 170 // The original search_string broken at cursor position. This is one type of |
| 171 // transformation. |
| 172 base::string16 transformed_search_string(original_search_string); |
| 173 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); |
| 174 search_strings.push_back(transformed_search_string); |
| 175 } |
| 176 |
178 ScoredHistoryMatches scored_items; | 177 ScoredHistoryMatches scored_items; |
179 | 178 // Invalidate the term cache and return if we have indexed no words (probably |
180 // Do nothing if we have indexed no words (probably because we've not been | 179 // because we've not been initialized yet). |
181 // initialized yet) or the search string has no words. | 180 if (word_list_.empty()) { |
182 if (word_list_.empty() || lower_words.empty()) { | 181 search_term_cache_.clear(); |
183 search_term_cache_.clear(); // Invalidate the term cache. | |
184 return scored_items; | 182 return scored_items; |
185 } | 183 } |
186 | |
187 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | 184 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep |
188 // approach. | 185 // approach. |
189 ResetSearchTermCache(); | 186 ResetSearchTermCache(); |
190 | 187 |
191 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | 188 for (const base::string16& search_string : search_strings) { |
| 189 // The search string we receive may contain escaped characters. For reducing |
| 190 // the index we need individual, lower-cased words, ignoring escapings. For |
| 191 // the final filtering we need whitespace separated substrings possibly |
| 192 // containing escaped characters. |
| 193 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); |
| 194 base::string16 lower_unescaped_string = net::UnescapeURLComponent( |
| 195 lower_raw_string, |
| 196 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | |
| 197 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); |
192 | 198 |
193 // Trim the candidate pool if it is large. Note that we do not filter out | 199 // Extract individual 'words' (as opposed to 'terms'; see below) from the |
194 // items that do not contain the search terms as proper substrings -- doing | 200 // search string. When the user types "colspec=ID%20Mstone Release" we get |
195 // so is the performance-costly operation we are trying to avoid in order | 201 // four 'words': "colspec", "id", "mstone" and "release". |
196 // to maintain omnibox responsiveness. | 202 String16Vector lower_words( |
197 const size_t kItemsToScoreLimit = 500; | 203 String16VectorFromString16(lower_unescaped_string, false, nullptr)); |
198 pre_filter_item_count_ = history_id_set.size(); | 204 if (lower_words.empty()) |
199 // If we trim the results set we do not want to cache the results for next | 205 continue; |
200 // time as the user's ultimately desired result could easily be eliminated | 206 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); |
201 // in this early rough filter. | 207 // Trim the candidate pool if it is large. Note that we do not filter out |
202 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | 208 // items that do not contain the search terms as proper substrings -- |
203 if (was_trimmed) { | 209 // doing so is the performance-costly operation we are trying to avoid in |
204 HistoryIDVector history_ids; | 210 // order to maintain omnibox responsiveness. |
205 std::copy(history_id_set.begin(), history_id_set.end(), | 211 const size_t kItemsToScoreLimit = 500; |
206 std::back_inserter(history_ids)); | 212 pre_filter_item_count_ += history_id_set.size(); |
207 // Trim down the set by sorting by typed-count, visit-count, and last | 213 // If we trim the results set we do not want to cache the results for next |
208 // visit. | 214 // time as the user's ultimately desired result could easily be eliminated |
209 HistoryItemFactorGreater | 215 // in this early rough filter. |
210 item_factor_functor(history_info_map_); | 216 if (pre_filter_item_count_ > kItemsToScoreLimit) { |
211 std::partial_sort(history_ids.begin(), | 217 HistoryIDVector history_ids; |
212 history_ids.begin() + kItemsToScoreLimit, | 218 std::copy(history_id_set.begin(), history_id_set.end(), |
213 history_ids.end(), | 219 std::back_inserter(history_ids)); |
214 item_factor_functor); | 220 // Trim down the set by sorting by typed-count, visit-count, and last |
215 history_id_set.clear(); | 221 // visit. |
216 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | 222 HistoryItemFactorGreater item_factor_functor(history_info_map_); |
217 std::inserter(history_id_set, history_id_set.end())); | 223 std::partial_sort(history_ids.begin(), |
218 post_filter_item_count_ = history_id_set.size(); | 224 history_ids.begin() + kItemsToScoreLimit, |
219 } | 225 history_ids.end(), item_factor_functor); |
| 226 history_id_set.clear(); |
| 227 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, |
| 228 std::inserter(history_id_set, history_id_set.end())); |
| 229 post_filter_item_count_ = post_filter_item_count_ + history_id_set.size(); |
| 230 } |
| 231 // Pass over all of the candidates filtering out any without a proper |
| 232 // substring match, inserting those which pass in order by score. Note that |
| 233 // in this step we are using the raw search string complete with escaped |
| 234 // URL elements. When the user has specifically typed something akin to |
| 235 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that |
| 236 // specific substring appears in the URL or page title. |
220 | 237 |
221 // Pass over all of the candidates filtering out any without a proper | 238 // We call these 'terms' (as opposed to 'words'; see above) as in this case |
222 // substring match, inserting those which pass in order by score. Note that | 239 // we only want to break up the search string on 'true' whitespace rather |
223 // in this step we are using the raw search string complete with escaped | 240 // than escaped whitespace. When the user types "colspec=ID%20Mstone |
224 // URL elements. When the user has specifically typed something akin to | 241 // Release" we get two 'terms': "colspec=id%20mstone" and "release". |
225 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | 242 String16Vector lower_raw_terms = |
226 // specific substring appears in the URL or page title. | 243 base::SplitString(lower_raw_string, base::kWhitespaceUTF16, |
| 244 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY); |
227 | 245 |
228 // We call these 'terms' (as opposed to 'words'; see above) as in this case | |
229 // we only want to break up the search string on 'true' whitespace rather than | |
230 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | |
231 // get two 'terms': "colspec=id%20mstone" and "release". | |
232 String16Vector lower_raw_terms = base::SplitString( | |
233 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, | |
234 base::SPLIT_WANT_NONEMPTY); | |
235 | |
236 if (lower_raw_terms.empty()) { | |
237 // Don't score matches when there are no terms to score against. (It's | 246 // Don't score matches when there are no terms to score against. (It's |
238 // possible that the word break iterater that extracts words to search | 247 // possible that the word break iterater that extracts words to search |
239 // for in the database allows some whitespace "words" whereas SplitString | 248 // for in the database allows some whitespace "words" whereas SplitString |
240 // excludes a long list of whitespace.) One could write a scoring | 249 // excludes a long list of whitespace.) One could write a scoring function |
241 // function that gives a reasonable order to matches when there | 250 // that gives a reasonable order to matches when there are no terms (i.e., |
242 // are no terms (i.e., all the words are some form of whitespace), | 251 // all the words are some form of whitespace), but this is such a rare edge |
243 // but this is such a rare edge case that it's not worth the time. | 252 // case that it's not worth the time. |
244 return scored_items; | 253 if (lower_raw_terms.empty()) |
245 } | 254 continue; |
246 | 255 ScoredHistoryMatches temp_scored_items = |
247 scored_items = | 256 std::for_each(history_id_set.begin(), history_id_set.end(), |
248 std::for_each( | 257 AddHistoryMatch(bookmark_model, template_url_service, |
249 history_id_set.begin(), history_id_set.end(), | 258 *this, lower_raw_string, lower_raw_terms, |
250 AddHistoryMatch(bookmark_model, template_url_service, *this, | 259 base::Time::Now())) |
251 lower_raw_string, lower_raw_terms, base::Time::Now())) | 260 .ScoredMatches(); |
252 .ScoredMatches(); | 261 scored_items.insert(scored_items.end(), temp_scored_items.begin(), |
253 | 262 temp_scored_items.end()); |
254 // If cursor position is set and useful (not at either end of the string), | |
255 // allow the search string to be broken at cursor position. We do this by | |
256 // pretending there's a space where the cursor is. | |
257 // This phenomena will be referred to as space ghosting. | |
258 // Going forward we perform a similar process for obtaining | |
259 // scored items as above. In fact all of the variable names are the same | |
260 // except that they are prepended with 'transformed_' reflecting that these | |
261 // variables are related to space ghosting. | |
262 if ((cursor_position != base::string16::npos) && | |
263 (cursor_position < search_string.length()) && (cursor_position > 0)) { | |
264 // The original search_string broken at cursor position. | |
265 base::string16 transformed_search_string(search_string); | |
266 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); | |
267 | |
268 // The same as lower_raw_string and defined for the same reasons. | |
269 base::string16 transformed_lower_raw_string( | |
270 base::i18n::ToLower(transformed_search_string)); | |
271 String16Vector transformed_lower_raw_terms = | |
272 base::SplitString(transformed_lower_raw_string, base::kWhitespaceUTF16, | |
273 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY); | |
274 if (!transformed_lower_raw_terms.empty()) { | |
275 base::string16 transormed_lower_unescaped_string = | |
276 net::UnescapeURLComponent( | |
277 transformed_lower_raw_string, | |
278 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | | |
279 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); | |
280 String16Vector transformed_lower_words(String16VectorFromString16( | |
281 transormed_lower_unescaped_string, false, nullptr)); | |
282 HistoryIDSet transformed_history_id_set = | |
283 HistoryIDSetFromWords(transformed_lower_words); | |
284 | |
285 if (history_id_set != transformed_history_id_set) { | |
286 pre_filter_item_count_ = history_id_set.size(); | |
287 // Trim the candidate pool if it is large as above. | |
288 was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
289 if (was_trimmed) { | |
290 HistoryIDVector transformed_history_ids; | |
291 std::copy(transformed_history_id_set.begin(), | |
292 transformed_history_id_set.end(), | |
293 std::back_inserter(transformed_history_ids)); | |
294 // Trim down the set by sorting by typed-count, visit-count, and last | |
295 // visit. | |
296 HistoryItemFactorGreater item_factor_functor(history_info_map_); | |
297 std::partial_sort( | |
298 transformed_history_ids.begin(), | |
299 transformed_history_ids.begin() + kItemsToScoreLimit, | |
300 transformed_history_ids.end(), item_factor_functor); | |
301 transformed_history_id_set.clear(); | |
302 std::copy(transformed_history_ids.begin(), | |
303 transformed_history_ids.begin() + kItemsToScoreLimit, | |
304 std::inserter(transformed_history_id_set, | |
305 transformed_history_id_set.end())); | |
306 } | |
307 history_id_set.insert(transformed_history_id_set.begin(), | |
308 transformed_history_id_set.end()); | |
309 ScoredHistoryMatches transformed_scored_items = | |
310 std::for_each( | |
311 history_id_set.begin(), history_id_set.end(), | |
312 AddHistoryMatch(bookmark_model, template_url_service, *this, | |
313 transformed_lower_raw_string, | |
314 transformed_lower_raw_terms, base::Time::Now())) | |
315 .ScoredMatches(); | |
316 scored_items.insert(scored_items.end(), | |
317 transformed_scored_items.begin(), | |
318 transformed_scored_items.end()); | |
319 } | |
320 } | |
321 } | 263 } |
322 // Select and sort only the top |max_matches| results. | 264 // Select and sort only the top |max_matches| results. |
323 if (scored_items.size() > max_matches) { | 265 if (scored_items.size() > max_matches) { |
324 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches, | 266 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches, |
325 scored_items.end(), | 267 scored_items.end(), |
326 ScoredHistoryMatch::MatchScoreGreater); | 268 ScoredHistoryMatch::MatchScoreGreater); |
327 scored_items.resize(max_matches); | 269 scored_items.resize(max_matches); |
328 } else { | 270 } else { |
329 std::sort(scored_items.begin(), scored_items.end(), | 271 std::sort(scored_items.begin(), scored_items.end(), |
330 ScoredHistoryMatch::MatchScoreGreater); | 272 ScoredHistoryMatch::MatchScoreGreater); |
331 } | 273 } |
332 post_scoring_item_count_ = scored_items.size(); | 274 post_scoring_item_count_ = scored_items.size(); |
333 | 275 if (pre_filter_item_count_ > post_filter_item_count_) { |
334 if (was_trimmed) { | |
335 search_term_cache_.clear(); // Invalidate the term cache. | 276 search_term_cache_.clear(); // Invalidate the term cache. |
336 } else { | 277 } else { |
337 // Remove any stale SearchTermCacheItems. | 278 // Remove any stale SearchTermCacheItems. |
338 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | 279 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); |
339 cache_iter != search_term_cache_.end(); ) { | 280 cache_iter != search_term_cache_.end(); ) { |
340 if (!cache_iter->second.used_) | 281 if (!cache_iter->second.used_) |
341 search_term_cache_.erase(cache_iter++); | 282 search_term_cache_.erase(cache_iter++); |
342 else | 283 else |
343 ++cache_iter; | 284 ++cache_iter; |
344 } | 285 } |
(...skipping 1068 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1413 // First cut: typed count, visit count, recency. | 1354 // First cut: typed count, visit count, recency. |
1414 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | 1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
1415 // recently visited (within the last 12/24 hours) as highly important. Get | 1356 // recently visited (within the last 12/24 hours) as highly important. Get |
1416 // input from mpearson. | 1357 // input from mpearson. |
1417 if (r1.typed_count() != r2.typed_count()) | 1358 if (r1.typed_count() != r2.typed_count()) |
1418 return (r1.typed_count() > r2.typed_count()); | 1359 return (r1.typed_count() > r2.typed_count()); |
1419 if (r1.visit_count() != r2.visit_count()) | 1360 if (r1.visit_count() != r2.visit_count()) |
1420 return (r1.visit_count() > r2.visit_count()); | 1361 return (r1.visit_count() > r2.visit_count()); |
1421 return (r1.last_visit() > r2.last_visit()); | 1362 return (r1.last_visit() > r2.last_visit()); |
1422 } | 1363 } |
OLD | NEW |