| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/omnibox/browser/url_index_private_data.h" | 5 #include "components/omnibox/browser/url_index_private_data.h" |
| 6 | 6 |
| 7 #include <stdint.h> | 7 #include <stdint.h> |
| 8 | 8 |
| 9 #include <functional> | 9 #include <functional> |
| 10 #include <iterator> | 10 #include <iterator> |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 74 WordStartsMapEntry; | 74 WordStartsMapEntry; |
| 75 | 75 |
| 76 // Algorithm Functions --------------------------------------------------------- | 76 // Algorithm Functions --------------------------------------------------------- |
| 77 | 77 |
| 78 // Comparison function for sorting search terms by descending length. | 78 // Comparison function for sorting search terms by descending length. |
| 79 bool LengthGreater(const base::string16& string_a, | 79 bool LengthGreater(const base::string16& string_a, |
| 80 const base::string16& string_b) { | 80 const base::string16& string_b) { |
| 81 return string_a.length() > string_b.length(); | 81 return string_a.length() > string_b.length(); |
| 82 } | 82 } |
| 83 | 83 |
| 84 |
| 84 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- | 85 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- |
| 85 | 86 |
| 86 // HistoryDBTask used to update the recent visit data for a particular | 87 // HistoryDBTask used to update the recent visit data for a particular |
| 87 // row from the history database. | 88 // row from the history database. |
| 88 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask { | 89 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask { |
| 89 public: | 90 public: |
| 90 explicit UpdateRecentVisitsFromHistoryDBTask( | 91 explicit UpdateRecentVisitsFromHistoryDBTask( |
| 91 URLIndexPrivateData* private_data, | 92 URLIndexPrivateData* private_data, |
| 92 history::URLID url_id); | 93 history::URLID url_id); |
| 93 | 94 |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 150 post_filter_item_count_(0), | 151 post_filter_item_count_(0), |
| 151 post_scoring_item_count_(0) { | 152 post_scoring_item_count_(0) { |
| 152 } | 153 } |
| 153 | 154 |
| 154 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | 155 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( |
| 155 base::string16 search_string, | 156 base::string16 search_string, |
| 156 size_t cursor_position, | 157 size_t cursor_position, |
| 157 size_t max_matches, | 158 size_t max_matches, |
| 158 bookmarks::BookmarkModel* bookmark_model, | 159 bookmarks::BookmarkModel* bookmark_model, |
| 159 TemplateURLService* template_url_service) { | 160 TemplateURLService* template_url_service) { |
| 161 // If cursor position is set and useful (not at either end of the |
| 162 // string), allow the search string to be broken at cursor position. |
| 163 // We do this by pretending there's a space where the cursor is. |
| 164 if ((cursor_position != base::string16::npos) && |
| 165 (cursor_position < search_string.length()) && |
| 166 (cursor_position > 0)) { |
| 167 search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); |
| 168 } |
| 160 pre_filter_item_count_ = 0; | 169 pre_filter_item_count_ = 0; |
| 161 post_filter_item_count_ = 0; | 170 post_filter_item_count_ = 0; |
| 162 post_scoring_item_count_ = 0; | 171 post_scoring_item_count_ = 0; |
| 163 // The search string we receive may contain escaped characters. For reducing | 172 // The search string we receive may contain escaped characters. For reducing |
| 164 // the index we need individual, lower-cased words, ignoring escapings. For | 173 // the index we need individual, lower-cased words, ignoring escapings. For |
| 165 // the final filtering we need whitespace separated substrings possibly | 174 // the final filtering we need whitespace separated substrings possibly |
| 166 // containing escaped characters. | 175 // containing escaped characters. |
| 167 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); | 176 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); |
| 168 base::string16 lower_unescaped_string = | 177 base::string16 lower_unescaped_string = |
| 169 net::UnescapeURLComponent(lower_raw_string, | 178 net::UnescapeURLComponent(lower_raw_string, |
| 170 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | | 179 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | |
| 171 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); | 180 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); |
| 172 | |
| 173 // Extract individual 'words' (as opposed to 'terms'; see below) from the | 181 // Extract individual 'words' (as opposed to 'terms'; see below) from the |
| 174 // search string. When the user types "colspec=ID%20Mstone Release" we get | 182 // search string. When the user types "colspec=ID%20Mstone Release" we get |
| 175 // four 'words': "colspec", "id", "mstone" and "release". | 183 // four 'words': "colspec", "id", "mstone" and "release". |
| 176 String16Vector lower_words( | 184 String16Vector lower_words( |
| 177 String16VectorFromString16(lower_unescaped_string, false, nullptr)); | 185 String16VectorFromString16(lower_unescaped_string, false, nullptr)); |
| 178 ScoredHistoryMatches scored_items; | 186 ScoredHistoryMatches scored_items; |
| 179 | 187 |
| 180 // Do nothing if we have indexed no words (probably because we've not been | 188 // Do nothing if we have indexed no words (probably because we've not been |
| 181 // initialized yet) or the search string has no words. | 189 // initialized yet) or the search string has no words. |
| 182 if (word_list_.empty() || lower_words.empty()) { | 190 if (word_list_.empty() || lower_words.empty()) { |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 225 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | 233 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that |
| 226 // specific substring appears in the URL or page title. | 234 // specific substring appears in the URL or page title. |
| 227 | 235 |
| 228 // We call these 'terms' (as opposed to 'words'; see above) as in this case | 236 // We call these 'terms' (as opposed to 'words'; see above) as in this case |
| 229 // we only want to break up the search string on 'true' whitespace rather than | 237 // we only want to break up the search string on 'true' whitespace rather than |
| 230 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | 238 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we |
| 231 // get two 'terms': "colspec=id%20mstone" and "release". | 239 // get two 'terms': "colspec=id%20mstone" and "release". |
| 232 String16Vector lower_raw_terms = base::SplitString( | 240 String16Vector lower_raw_terms = base::SplitString( |
| 233 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, | 241 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, |
| 234 base::SPLIT_WANT_NONEMPTY); | 242 base::SPLIT_WANT_NONEMPTY); |
| 235 | |
| 236 if (lower_raw_terms.empty()) { | 243 if (lower_raw_terms.empty()) { |
| 237 // Don't score matches when there are no terms to score against. (It's | 244 // Don't score matches when there are no terms to score against. (It's |
| 238 // possible that the word break iterater that extracts words to search | 245 // possible that the word break iterater that extracts words to search |
| 239 // for in the database allows some whitespace "words" whereas SplitString | 246 // for in the database allows some whitespace "words" whereas SplitString |
| 240 // excludes a long list of whitespace.) One could write a scoring | 247 // excludes a long list of whitespace.) One could write a scoring |
| 241 // function that gives a reasonable order to matches when there | 248 // function that gives a reasonable order to matches when there |
| 242 // are no terms (i.e., all the words are some form of whitespace), | 249 // are no terms (i.e., all the words are some form of whitespace), |
| 243 // but this is such a rare edge case that it's not worth the time. | 250 // but this is such a rare edge case that it's not worth the time. |
| 244 return scored_items; | 251 return scored_items; |
| 245 } | 252 } |
| 246 | |
| 247 scored_items = | 253 scored_items = |
| 248 std::for_each( | 254 std::for_each( |
| 249 history_id_set.begin(), history_id_set.end(), | 255 history_id_set.begin(), history_id_set.end(), |
| 250 AddHistoryMatch(bookmark_model, template_url_service, *this, | 256 AddHistoryMatch(bookmark_model, template_url_service, *this, |
| 251 lower_raw_string, lower_raw_terms, base::Time::Now())) | 257 lower_raw_string, lower_raw_terms, |
| 252 .ScoredMatches(); | 258 base::Time::Now())).ScoredMatches(); |
| 253 | 259 |
| 254 // If cursor position is set and useful (not at either end of the string), | |
| 255 // allow the search string to be broken at cursor position. We do this by | |
| 256 // pretending there's a space where the cursor is. | |
| 257 // This phenomena will be referred to as space ghosting. | |
| 258 // Going forward we perform a similar process for obtaining | |
| 259 // scored items as above. In fact all of the variable names are the same | |
| 260 // except that they are prepended with 'transformed_' reflecting that these | |
| 261 // variables are related to space ghosting. | |
| 262 if ((cursor_position != base::string16::npos) && | |
| 263 (cursor_position < search_string.length()) && (cursor_position > 0)) { | |
| 264 // The original search_string broken at cursor position. | |
| 265 base::string16 transformed_search_string(search_string); | |
| 266 transformed_search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); | |
| 267 | |
| 268 // The same as lower_raw_string and defined for the same reasons. | |
| 269 base::string16 transformed_lower_raw_string( | |
| 270 base::i18n::ToLower(transformed_search_string)); | |
| 271 String16Vector transformed_lower_raw_terms = | |
| 272 base::SplitString(transformed_lower_raw_string, base::kWhitespaceUTF16, | |
| 273 base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY); | |
| 274 if (!transformed_lower_raw_terms.empty()) { | |
| 275 base::string16 transormed_lower_unescaped_string = | |
| 276 net::UnescapeURLComponent( | |
| 277 transformed_lower_raw_string, | |
| 278 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS | | |
| 279 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS); | |
| 280 String16Vector transformed_lower_words(String16VectorFromString16( | |
| 281 transormed_lower_unescaped_string, false, nullptr)); | |
| 282 HistoryIDSet transformed_history_id_set = | |
| 283 HistoryIDSetFromWords(transformed_lower_words); | |
| 284 | |
| 285 if (history_id_set != transformed_history_id_set) { | |
| 286 pre_filter_item_count_ = history_id_set.size(); | |
| 287 // Trim the candidate pool if it is large as above. | |
| 288 was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
| 289 if (was_trimmed) { | |
| 290 HistoryIDVector transformed_history_ids; | |
| 291 std::copy(transformed_history_id_set.begin(), | |
| 292 transformed_history_id_set.end(), | |
| 293 std::back_inserter(transformed_history_ids)); | |
| 294 // Trim down the set by sorting by typed-count, visit-count, and last | |
| 295 // visit. | |
| 296 HistoryItemFactorGreater item_factor_functor(history_info_map_); | |
| 297 std::partial_sort( | |
| 298 transformed_history_ids.begin(), | |
| 299 transformed_history_ids.begin() + kItemsToScoreLimit, | |
| 300 transformed_history_ids.end(), item_factor_functor); | |
| 301 transformed_history_id_set.clear(); | |
| 302 std::copy(transformed_history_ids.begin(), | |
| 303 transformed_history_ids.begin() + kItemsToScoreLimit, | |
| 304 std::inserter(transformed_history_id_set, | |
| 305 transformed_history_id_set.end())); | |
| 306 } | |
| 307 history_id_set.insert(transformed_history_id_set.begin(), | |
| 308 transformed_history_id_set.end()); | |
| 309 ScoredHistoryMatches transformed_scored_items = | |
| 310 std::for_each( | |
| 311 history_id_set.begin(), history_id_set.end(), | |
| 312 AddHistoryMatch(bookmark_model, template_url_service, *this, | |
| 313 transformed_lower_raw_string, | |
| 314 transformed_lower_raw_terms, base::Time::Now())) | |
| 315 .ScoredMatches(); | |
| 316 scored_items.insert(scored_items.end(), | |
| 317 transformed_scored_items.begin(), | |
| 318 transformed_scored_items.end()); | |
| 319 } | |
| 320 } | |
| 321 } | |
| 322 // Select and sort only the top |max_matches| results. | 260 // Select and sort only the top |max_matches| results. |
| 323 if (scored_items.size() > max_matches) { | 261 if (scored_items.size() > max_matches) { |
| 324 std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches, | 262 std::partial_sort(scored_items.begin(), |
| 263 scored_items.begin() + |
| 264 max_matches, |
| 325 scored_items.end(), | 265 scored_items.end(), |
| 326 ScoredHistoryMatch::MatchScoreGreater); | 266 ScoredHistoryMatch::MatchScoreGreater); |
| 327 scored_items.resize(max_matches); | 267 scored_items.resize(max_matches); |
| 328 } else { | 268 } else { |
| 329 std::sort(scored_items.begin(), scored_items.end(), | 269 std::sort(scored_items.begin(), scored_items.end(), |
| 330 ScoredHistoryMatch::MatchScoreGreater); | 270 ScoredHistoryMatch::MatchScoreGreater); |
| 331 } | 271 } |
| 332 post_scoring_item_count_ = scored_items.size(); | 272 post_scoring_item_count_ = scored_items.size(); |
| 333 | 273 |
| 334 if (was_trimmed) { | 274 if (was_trimmed) { |
| 335 search_term_cache_.clear(); // Invalidate the term cache. | 275 search_term_cache_.clear(); // Invalidate the term cache. |
| 336 } else { | 276 } else { |
| 337 // Remove any stale SearchTermCacheItems. | 277 // Remove any stale SearchTermCacheItems. |
| (...skipping 966 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1304 return true; | 1244 return true; |
| 1305 } | 1245 } |
| 1306 | 1246 |
| 1307 // static | 1247 // static |
| 1308 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | 1248 bool URLIndexPrivateData::URLSchemeIsWhitelisted( |
| 1309 const GURL& gurl, | 1249 const GURL& gurl, |
| 1310 const std::set<std::string>& whitelist) { | 1250 const std::set<std::string>& whitelist) { |
| 1311 return whitelist.find(gurl.scheme()) != whitelist.end(); | 1251 return whitelist.find(gurl.scheme()) != whitelist.end(); |
| 1312 } | 1252 } |
| 1313 | 1253 |
| 1254 |
| 1314 // SearchTermCacheItem --------------------------------------------------------- | 1255 // SearchTermCacheItem --------------------------------------------------------- |
| 1315 | 1256 |
| 1316 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( | 1257 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( |
| 1317 const WordIDSet& word_id_set, | 1258 const WordIDSet& word_id_set, |
| 1318 const HistoryIDSet& history_id_set) | 1259 const HistoryIDSet& history_id_set) |
| 1319 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) { | 1260 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) { |
| 1320 } | 1261 } |
| 1321 | 1262 |
| 1322 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { | 1263 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { |
| 1323 } | 1264 } |
| (...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1413 // First cut: typed count, visit count, recency. | 1354 // First cut: typed count, visit count, recency. |
| 1414 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | 1355 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
| 1415 // recently visited (within the last 12/24 hours) as highly important. Get | 1356 // recently visited (within the last 12/24 hours) as highly important. Get |
| 1416 // input from mpearson. | 1357 // input from mpearson. |
| 1417 if (r1.typed_count() != r2.typed_count()) | 1358 if (r1.typed_count() != r2.typed_count()) |
| 1418 return (r1.typed_count() > r2.typed_count()); | 1359 return (r1.typed_count() > r2.typed_count()); |
| 1419 if (r1.visit_count() != r2.visit_count()) | 1360 if (r1.visit_count() != r2.visit_count()) |
| 1420 return (r1.visit_count() > r2.visit_count()); | 1361 return (r1.visit_count() > r2.visit_count()); |
| 1421 return (r1.last_visit() > r2.last_visit()); | 1362 return (r1.last_visit() > r2.last_visit()); |
| 1422 } | 1363 } |
| OLD | NEW |