Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/omnibox/browser/url_index_private_data.h" | 5 #include "components/omnibox/browser/url_index_private_data.h" |
| 6 | 6 |
| 7 #include <stdint.h> | 7 #include <stdint.h> |
| 8 | 8 |
| 9 #include <functional> | 9 #include <functional> |
| 10 #include <iterator> | 10 #include <iterator> |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 74 WordStartsMapEntry; | 74 WordStartsMapEntry; |
| 75 | 75 |
| 76 // Algorithm Functions --------------------------------------------------------- | 76 // Algorithm Functions --------------------------------------------------------- |
| 77 | 77 |
| 78 // Comparison function for sorting search terms by descending length. | 78 // Comparison function for sorting search terms by descending length. |
| 79 bool LengthGreater(const base::string16& string_a, | 79 bool LengthGreater(const base::string16& string_a, |
| 80 const base::string16& string_b) { | 80 const base::string16& string_b) { |
| 81 return string_a.length() > string_b.length(); | 81 return string_a.length() > string_b.length(); |
| 82 } | 82 } |
| 83 | 83 |
| 84 bool Less(const base::string16& string_a, const base::string16& string_b) { | |
| 85 return (string_a.compare(string_b) <= 0); | |
| 86 } | |
| 84 | 87 |
| 85 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- | 88 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- |
| 86 | 89 |
| 87 // HistoryDBTask used to update the recent visit data for a particular | 90 // HistoryDBTask used to update the recent visit data for a particular |
| 88 // row from the history database. | 91 // row from the history database. |
| 89 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask { | 92 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask { |
| 90 public: | 93 public: |
| 91 explicit UpdateRecentVisitsFromHistoryDBTask( | 94 explicit UpdateRecentVisitsFromHistoryDBTask( |
| 92 URLIndexPrivateData* private_data, | 95 URLIndexPrivateData* private_data, |
| 93 history::URLID url_id); | 96 history::URLID url_id); |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 145 // URLIndexPrivateData --------------------------------------------------------- | 148 // URLIndexPrivateData --------------------------------------------------------- |
| 146 | 149 |
| 147 URLIndexPrivateData::URLIndexPrivateData() | 150 URLIndexPrivateData::URLIndexPrivateData() |
| 148 : restored_cache_version_(0), | 151 : restored_cache_version_(0), |
| 149 saved_cache_version_(kCurrentCacheFileVersion), | 152 saved_cache_version_(kCurrentCacheFileVersion), |
| 150 pre_filter_item_count_(0), | 153 pre_filter_item_count_(0), |
| 151 post_filter_item_count_(0), | 154 post_filter_item_count_(0), |
| 152 post_scoring_item_count_(0) { | 155 post_scoring_item_count_(0) { |
| 153 } | 156 } |
| 154 | 157 |
| 155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | 158 String16Vector URLIndexPrivateData::ExtractIndividualWordVector( |
| 156 base::string16 search_string, | 159 const base::string16& search_string) { |
| 157 size_t cursor_position, | |
| 158 size_t max_matches, | |
| 159 bookmarks::BookmarkModel* bookmark_model, | |
| 160 TemplateURLService* template_url_service) { | |
| 161 // If cursor position is set and useful (not at either end of the | |
| 162 // string), allow the search string to be broken at cursor position. | |
| 163 // We do this by pretending there's a space where the cursor is. | |
| 164 if ((cursor_position != base::string16::npos) && | |
| 165 (cursor_position < search_string.length()) && | |
| 166 (cursor_position > 0)) { | |
| 167 search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); | |
| 168 } | |
| 169 pre_filter_item_count_ = 0; | |
| 170 post_filter_item_count_ = 0; | |
| 171 post_scoring_item_count_ = 0; | |
| 172 // The search string we receive may contain escaped characters. For reducing | 160 // The search string we receive may contain escaped characters. For reducing |
| 173 // the index we need individual, lower-cased words, ignoring escapings. For | 161 // the index we need individual, lower-cased words, ignoring escapings. For |
| 174 // the final filtering we need whitespace separated substrings possibly | 162 // the final filtering we need whitespace separated substrings possibly |
| 175 // containing escaped characters. | 163 // containing escaped characters. |
| 176 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); | 164 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); |
| 177 base::string16 lower_unescaped_string = | 165 base::string16 lower_unescaped_string = net::UnescapeURLComponent( |
| 178 net::UnescapeURLComponent(lower_raw_string, | 166 lower_raw_string, |
| 179 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | | 167 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | |
| 180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); | 168 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); |
| 181 // Extract individual 'words' (as opposed to 'terms'; see below) from the | 169 // Extract individual 'words' (as opposed to 'terms'; see below) from the |
| 182 // search string. When the user types "colspec=ID%20Mstone Release" we get | 170 // search string. When the user types "colspec=ID%20Mstone Release" we get |
| 183 // four 'words': "colspec", "id", "mstone" and "release". | 171 // four 'words': "colspec", "id", "mstone" and "release". |
| 184 String16Vector lower_words( | 172 return String16VectorFromString16(lower_unescaped_string, false, nullptr); |
| 185 String16VectorFromString16(lower_unescaped_string, false, nullptr)); | 173 } |
| 186 ScoredHistoryMatches scored_items; | |
| 187 | 174 |
| 175 HistoryIDSet URLIndexPrivateData::GetHistoryIDSet( | |
| 176 const String16Vector& lower_words) { | |
| 188 // Do nothing if we have indexed no words (probably because we've not been | 177 // Do nothing if we have indexed no words (probably because we've not been |
| 189 // initialized yet) or the search string has no words. | 178 // initialized yet) or the search string has no words. |
| 190 if (word_list_.empty() || lower_words.empty()) { | 179 if (word_list_.empty() || lower_words.empty()) { |
| 191 search_term_cache_.clear(); // Invalidate the term cache. | 180 search_term_cache_.clear(); // Invalidate the term cache. |
| 192 return scored_items; | 181 return HistoryIDSet(); |
| 193 } | 182 } |
| 194 | 183 |
| 195 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | 184 return HistoryIDSetFromWords(lower_words); |
| 196 // approach. | 185 } |
| 197 ResetSearchTermCache(); | |
| 198 | 186 |
| 199 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | 187 ScoredHistoryMatches URLIndexPrivateData::GetScoredItemsForSearchString( |
|
Mark P
2016/08/23 22:14:46
Not quite right. The .h file order is fine. Now
Lavar Askew
2016/08/24 19:53:14
Done.
| |
| 200 | 188 const base::string16& search_string, |
| 201 // Trim the candidate pool if it is large. Note that we do not filter out | 189 const HistoryIDSet& history_id_set, |
| 202 // items that do not contain the search terms as proper substrings -- doing | 190 const size_t max_matches, |
| 203 // so is the performance-costly operation we are trying to avoid in order | 191 bookmarks::BookmarkModel* bookmark_model, |
| 204 // to maintain omnibox responsiveness. | 192 TemplateURLService* template_url_service) { |
| 205 const size_t kItemsToScoreLimit = 500; | 193 ScoredHistoryMatches scored_items; |
| 206 pre_filter_item_count_ = history_id_set.size(); | |
| 207 // If we trim the results set we do not want to cache the results for next | |
| 208 // time as the user's ultimately desired result could easily be eliminated | |
| 209 // in this early rough filter. | |
| 210 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
| 211 if (was_trimmed) { | |
| 212 HistoryIDVector history_ids; | |
| 213 std::copy(history_id_set.begin(), history_id_set.end(), | |
| 214 std::back_inserter(history_ids)); | |
| 215 // Trim down the set by sorting by typed-count, visit-count, and last | |
| 216 // visit. | |
| 217 HistoryItemFactorGreater | |
| 218 item_factor_functor(history_info_map_); | |
| 219 std::partial_sort(history_ids.begin(), | |
| 220 history_ids.begin() + kItemsToScoreLimit, | |
| 221 history_ids.end(), | |
| 222 item_factor_functor); | |
| 223 history_id_set.clear(); | |
| 224 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
| 225 std::inserter(history_id_set, history_id_set.end())); | |
| 226 post_filter_item_count_ = history_id_set.size(); | |
| 227 } | |
| 228 | 194 |
| 229 // Pass over all of the candidates filtering out any without a proper | 195 // Pass over all of the candidates filtering out any without a proper |
| 230 // substring match, inserting those which pass in order by score. Note that | 196 // substring match, inserting those which pass in order by score. Note that |
| 231 // in this step we are using the raw search string complete with escaped | 197 // in this step we are using the raw search string complete with escaped |
| 232 // URL elements. When the user has specifically typed something akin to | 198 // URL elements. When the user has specifically typed something akin to |
| 233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | 199 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that |
| 234 // specific substring appears in the URL or page title. | 200 // specific substring appears in the URL or page title. |
| 235 | 201 |
| 236 // We call these 'terms' (as opposed to 'words'; see above) as in this case | 202 // We call these 'terms' (as opposed to 'words'; see above) as in this case |
| 237 // we only want to break up the search string on 'true' whitespace rather than | 203 // we only want to break up the search string on 'true' whitespace rather than |
| 238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | 204 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we |
| 239 // get two 'terms': "colspec=id%20mstone" and "release". | 205 // get two 'terms': "colspec=id%20mstone" and "release". |
| 206 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); | |
| 240 String16Vector lower_raw_terms = base::SplitString( | 207 String16Vector lower_raw_terms = base::SplitString( |
| 241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, | 208 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, |
| 242 base::SPLIT_WANT_NONEMPTY); | 209 base::SPLIT_WANT_NONEMPTY); |
| 210 | |
| 243 if (lower_raw_terms.empty()) { | 211 if (lower_raw_terms.empty()) { |
| 244 // Don't score matches when there are no terms to score against. (It's | 212 // Don't score matches when there are no terms to score against. (It's |
| 245 // possible that the word break iterater that extracts words to search | 213 // possible that the word break iterater that extracts words to search |
| 246 // for in the database allows some whitespace "words" whereas SplitString | 214 // for in the database allows some whitespace "words" whereas SplitString |
| 247 // excludes a long list of whitespace.) One could write a scoring | 215 // excludes a long list of whitespace.) One could write a scoring |
| 248 // function that gives a reasonable order to matches when there | 216 // function that gives a reasonable order to matches when there |
| 249 // are no terms (i.e., all the words are some form of whitespace), | 217 // are no terms (i.e., all the words are some form of whitespace), |
| 250 // but this is such a rare edge case that it's not worth the time. | 218 // but this is such a rare edge case that it's not worth the time. |
| 251 return scored_items; | 219 return scored_items; |
| 252 } | 220 } |
| 253 scored_items = | 221 scored_items = |
| 254 std::for_each( | 222 std::for_each( |
| 255 history_id_set.begin(), history_id_set.end(), | 223 history_id_set.begin(), history_id_set.end(), |
| 256 AddHistoryMatch(bookmark_model, template_url_service, *this, | 224 AddHistoryMatch(bookmark_model, template_url_service, *this, |
| 257 lower_raw_string, lower_raw_terms, | 225 lower_raw_string, lower_raw_terms, |
| 258 base::Time::Now())).ScoredMatches(); | 226 base::Time::Now())).ScoredMatches(); |
| 259 | 227 |
| 260 // Select and sort only the top |max_matches| results. | 228 // Select and sort only the top |max_matches| results. |
| 261 if (scored_items.size() > max_matches) { | 229 if (scored_items.size() > max_matches) { |
| 262 std::partial_sort(scored_items.begin(), | 230 std::partial_sort(scored_items.begin(), |
| 263 scored_items.begin() + | 231 scored_items.begin() + |
| 264 max_matches, | 232 max_matches, |
| 265 scored_items.end(), | 233 scored_items.end(), |
| 266 ScoredHistoryMatch::MatchScoreGreater); | 234 ScoredHistoryMatch::MatchScoreGreater); |
| 267 scored_items.resize(max_matches); | 235 scored_items.resize(max_matches); |
| 268 } else { | 236 } else { |
| 269 std::sort(scored_items.begin(), scored_items.end(), | 237 std::sort(scored_items.begin(), scored_items.end(), |
| 270 ScoredHistoryMatch::MatchScoreGreater); | 238 ScoredHistoryMatch::MatchScoreGreater); |
| 271 } | 239 } |
| 272 post_scoring_item_count_ = scored_items.size(); | |
| 273 | 240 |
| 241 return scored_items; | |
| 242 } | |
| 243 | |
| 244 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | |
| 245 base::string16 search_string, | |
| 246 size_t cursor_position, | |
| 247 size_t max_matches, | |
| 248 bookmarks::BookmarkModel* bookmark_model, | |
| 249 TemplateURLService* template_url_service) { | |
| 250 pre_filter_item_count_ = 0; | |
| 251 post_filter_item_count_ = 0; | |
| 252 post_scoring_item_count_ = 0; | |
| 253 | |
| 254 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | |
| 255 // approach. | |
| 256 ResetSearchTermCache(); | |
| 257 | |
| 258 String16Vector lower_words = ExtractIndividualWordVector(search_string); | |
| 259 HistoryIDSet history_id_set = GetHistoryIDSet(lower_words); | |
| 260 | |
| 261 // If cursor position is set and useful (not at either end of the | |
| 262 // string), allow the search string to be broken at cursor position. | |
| 263 // We do this by pretending there's a space where the cursor is. | |
| 264 if ((cursor_position != base::string16::npos) && | |
| 265 (cursor_position < search_string.length()) && (cursor_position > 0)) { | |
| 266 base::string16 search_string_with_word_break(search_string); | |
| 267 | |
| 268 base::string16 word_break = base::ASCIIToUTF16(" "); | |
| 269 search_string_with_word_break.insert(cursor_position, word_break); | |
| 270 | |
| 271 // Add to history_id_set the ids that are related to the original search | |
| 272 // string, but with the word break. | |
| 273 String16Vector lower_words_with_word_break = | |
| 274 ExtractIndividualWordVector(search_string_with_word_break); | |
| 275 | |
| 276 // Check to see if lower_words and lower_words_with_word_break are the same | |
| 277 // size. If they are then this could indicate that the word vectors | |
| 278 // contain the same words. | |
| 279 if (lower_words.size() != lower_words_with_word_break.size()) { | |
| 280 std::sort(lower_words.begin(), lower_words.end(), Less); | |
| 281 std::sort(lower_words_with_word_break.begin(), | |
| 282 lower_words_with_word_break.end(), Less); | |
| 283 if (lower_words != lower_words_with_word_break) { | |
| 284 HistoryIDSet history_id_set_with_word_break = | |
| 285 GetHistoryIDSet(lower_words_with_word_break); | |
| 286 history_id_set.insert(history_id_set_with_word_break.begin(), | |
| 287 history_id_set_with_word_break.end()); | |
| 288 } | |
| 289 } | |
| 290 } | |
| 291 // Trim the candidate pool if it is large. Note that we do not filter out | |
| 292 // items that do not contain the search terms as proper substrings -- doing | |
| 293 // so is the performance-costly operation we are trying to avoid in order | |
| 294 // to maintain omnibox responsiveness. | |
| 295 pre_filter_item_count_ = history_id_set.size(); | |
| 296 // If we trim the results set we do not want to cache the results for next | |
| 297 // time as the user's ultimately desired result could easily be eliminated | |
| 298 // in this early rough filter. | |
| 299 bool was_trimmed = (pre_filter_item_count_ > max_matches); | |
| 300 if (was_trimmed) { | |
| 301 HistoryIDVector history_ids; | |
| 302 std::copy(history_id_set.begin(), history_id_set.end(), | |
| 303 std::back_inserter(history_ids)); | |
| 304 // Trim down the set by sorting by typed-count, visit-count, and last | |
| 305 // visit. | |
| 306 HistoryItemFactorGreater item_factor_functor(history_info_map_); | |
| 307 std::partial_sort(history_ids.begin(), history_ids.begin() + max_matches, | |
| 308 history_ids.end(), item_factor_functor); | |
| 309 history_id_set.clear(); | |
| 310 std::copy(history_ids.begin(), history_ids.begin() + max_matches, | |
| 311 std::inserter(history_id_set, history_id_set.end())); | |
| 312 post_filter_item_count_ = history_id_set.size(); | |
| 313 } | |
| 314 ScoredHistoryMatches scored_items = | |
| 315 GetScoredItemsForSearchString(search_string, history_id_set, max_matches, | |
| 316 bookmark_model, template_url_service); | |
| 274 if (was_trimmed) { | 317 if (was_trimmed) { |
| 275 search_term_cache_.clear(); // Invalidate the term cache. | 318 search_term_cache_.clear(); // Invalidate the term cache. |
| 276 } else { | 319 } else { |
| 277 // Remove any stale SearchTermCacheItems. | 320 // Remove any stale SearchTermCacheItems. |
| 278 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | 321 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); |
| 279 cache_iter != search_term_cache_.end(); ) { | 322 cache_iter != search_term_cache_.end(); ) { |
| 280 if (!cache_iter->second.used_) | 323 if (!cache_iter->second.used_) { |
| 281 search_term_cache_.erase(cache_iter++); | 324 search_term_cache_.erase(cache_iter++); |
| 282 else | 325 } else { |
| 283 ++cache_iter; | 326 ++cache_iter; |
| 327 } | |
| 284 } | 328 } |
| 285 } | 329 } |
| 286 | |
| 287 return scored_items; | 330 return scored_items; |
| 288 } | 331 } |
| 289 | 332 |
| 290 bool URLIndexPrivateData::UpdateURL( | 333 bool URLIndexPrivateData::UpdateURL( |
| 291 history::HistoryService* history_service, | 334 history::HistoryService* history_service, |
| 292 const history::URLRow& row, | 335 const history::URLRow& row, |
| 293 const std::set<std::string>& scheme_whitelist, | 336 const std::set<std::string>& scheme_whitelist, |
| 294 base::CancelableTaskTracker* tracker) { | 337 base::CancelableTaskTracker* tracker) { |
| 295 // The row may or may not already be in our index. If it is not already | 338 // The row may or may not already be in our index. If it is not already |
| 296 // indexed and it qualifies then it gets indexed. If it is already | 339 // indexed and it qualifies then it gets indexed. If it is already |
| (...skipping 1057 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1354 // First cut: typed count, visit count, recency. | 1397 // First cut: typed count, visit count, recency. |
| 1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | 1398 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
| 1356 // recently visited (within the last 12/24 hours) as highly important. Get | 1399 // recently visited (within the last 12/24 hours) as highly important. Get |
| 1357 // input from mpearson. | 1400 // input from mpearson. |
| 1358 if (r1.typed_count() != r2.typed_count()) | 1401 if (r1.typed_count() != r2.typed_count()) |
| 1359 return (r1.typed_count() > r2.typed_count()); | 1402 return (r1.typed_count() > r2.typed_count()); |
| 1360 if (r1.visit_count() != r2.visit_count()) | 1403 if (r1.visit_count() != r2.visit_count()) |
| 1361 return (r1.visit_count() > r2.visit_count()); | 1404 return (r1.visit_count() > r2.visit_count()); |
| 1362 return (r1.last_visit() > r2.last_visit()); | 1405 return (r1.last_visit() > r2.last_visit()); |
| 1363 } | 1406 } |
| OLD | NEW |