Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/history/url_index_private_data.h" | 5 #include "chrome/browser/history/url_index_private_data.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <functional> | 8 #include <functional> |
| 9 #include <iterator> | 9 #include <iterator> |
| 10 #include <limits> | 10 #include <limits> |
| 11 #include <numeric> | 11 #include <numeric> |
| 12 #include <vector> | 12 #include <vector> |
| 13 | 13 |
| 14 #include "base/basictypes.h" | 14 #include "base/basictypes.h" |
| 15 #include "base/file_util.h" | 15 #include "base/file_util.h" |
| 16 #include "base/i18n/case_conversion.h" | 16 #include "base/i18n/case_conversion.h" |
| 17 #include "base/metrics/histogram.h" | |
| 18 #include "base/string_util.h" | 17 #include "base/string_util.h" |
| 19 #include "base/time.h" | |
| 20 #include "base/utf_string_conversions.h" | 18 #include "base/utf_string_conversions.h" |
| 21 #include "chrome/browser/autocomplete/autocomplete_provider.h" | 19 #include "chrome/browser/autocomplete/autocomplete_provider.h" |
| 22 #include "chrome/browser/autocomplete/url_prefix.h" | |
| 23 #include "chrome/browser/history/history_database.h" | 20 #include "chrome/browser/history/history_database.h" |
| 24 #include "chrome/browser/history/in_memory_url_index.h" | 21 #include "chrome/browser/history/in_memory_url_cache_database.h" |
| 22 #include "chrome/common/chrome_constants.h" | |
| 23 #include "chrome/common/url_constants.h" | |
| 25 #include "content/public/browser/browser_thread.h" | 24 #include "content/public/browser/browser_thread.h" |
| 26 #include "content/public/browser/notification_details.h" | 25 #include "content/public/browser/notification_details.h" |
| 27 #include "content/public/browser/notification_service.h" | 26 #include "content/public/browser/notification_service.h" |
| 28 #include "content/public/browser/notification_source.h" | 27 #include "content/public/browser/notification_source.h" |
| 29 #include "net/base/net_util.h" | 28 #include "net/base/net_util.h" |
| 30 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" | 29 |
| 31 | 30 using content::BrowserThread; |
| 32 using google::protobuf::RepeatedField; | |
| 33 using google::protobuf::RepeatedPtrField; | |
| 34 using in_memory_url_index::InMemoryURLIndexCacheItem; | |
| 35 | 31 |
| 36 namespace history { | 32 namespace history { |
| 37 | 33 |
| 38 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; | 34 // Comparison function for sorting search terms by descending length. |
| 39 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; | 35 bool LengthGreater(const string16& string_a, const string16& string_b) { |
| 40 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; | 36 return string_a.length() > string_b.length(); |
| 41 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; | 37 } |
| 42 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry | 38 |
| 43 CharWordMapEntry; | 39 // Initializes a whitelist of URL schemes. |
| 44 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem | 40 void InitializeSchemeWhitelist(std::set<std::string>* whitelist) { |
| 45 WordIDHistoryMapItem; | 41 DCHECK(whitelist); |
| 46 typedef imui:: | 42 if (!whitelist->empty()) |
| 47 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry | 43 return; // Nothing to do, already initialized. |
| 48 WordIDHistoryMapEntry; | 44 whitelist->insert(std::string(chrome::kAboutScheme)); |
| 49 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; | 45 whitelist->insert(std::string(chrome::kChromeUIScheme)); |
| 50 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry | 46 whitelist->insert(std::string(chrome::kFileScheme)); |
| 51 HistoryInfoMapEntry; | 47 whitelist->insert(std::string(chrome::kFtpScheme)); |
| 52 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem; | 48 whitelist->insert(std::string(chrome::kHttpScheme)); |
| 53 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry | 49 whitelist->insert(std::string(chrome::kHttpsScheme)); |
| 54 WordStartsMapEntry; | 50 whitelist->insert(std::string(chrome::kMailToScheme)); |
| 51 } | |
| 52 | |
| 53 // CacheTransaction ------------------------------------------------------------ | |
| 54 | |
| 55 // Simple automatic helper class encapsulating calls to BeginTransaction/ | |
| 56 // CommitTransaction. | |
| 57 class CacheTransaction { | |
| 58 public: | |
| 59 explicit CacheTransaction(InMemoryURLCacheDatabase* db); | |
| 60 ~CacheTransaction(); | |
| 61 | |
| 62 private: | |
| 63 InMemoryURLCacheDatabase* db_; | |
| 64 | |
| 65 DISALLOW_COPY_AND_ASSIGN(CacheTransaction); | |
| 66 }; | |
| 67 | |
| 68 CacheTransaction::CacheTransaction(InMemoryURLCacheDatabase* db) | |
| 69 : db_(db) { | |
| 70 if (db_) | |
| 71 db_->BeginTransaction(); | |
| 72 } | |
| 73 | |
| 74 CacheTransaction::~CacheTransaction() { | |
| 75 if (db_) | |
| 76 db_->CommitTransaction(); | |
| 77 } | |
| 78 | |
| 79 // URLIndexPrivateData Public (but only to InMemoryURLIndex) Functions --------- | |
| 80 | |
| 81 URLIndexPrivateData::URLIndexPrivateData(const FilePath& history_dir, | |
| 82 const std::string& languages) | |
| 83 : history_dir_(history_dir), | |
| 84 languages_(languages), | |
| 85 cache_enabled_(true), | |
| 86 shutdown_(false), | |
| 87 lock_(), | |
| 88 pre_filter_item_count_(0), | |
| 89 post_filter_item_count_(0), | |
| 90 post_scoring_item_count_(0) { | |
| 91 InitializeSchemeWhitelist(&scheme_whitelist_); | |
| 92 } | |
| 93 | |
| 94 bool URLIndexPrivateData::Init( | |
| 95 base::SequencedWorkerPool::SequenceToken sequence_token) { | |
| 96 // Since init occurs on the DB thread, it is possible that the profile was | |
| 97 // told to shutdown before this task had a chance to kick off. | |
| 98 base::AutoLock lock(lock_); | |
| 99 if (shutdown_) | |
| 100 return false; | |
| 101 cache_db_ = new InMemoryURLCacheDatabase(); | |
| 102 FilePath dir_path; | |
| 103 set_cache_enabled(GetCacheDBPath(&dir_path) && | |
| 104 cache_db_->Init(dir_path, sequence_token)); | |
| 105 return cache_enabled(); | |
| 106 } | |
| 107 | |
| 108 void URLIndexPrivateData::Reset() { | |
| 109 Clear(); | |
| 110 if (cache_enabled()) | |
| 111 cache_db_->Reset(); | |
| 112 } | |
| 113 | |
| 114 bool URLIndexPrivateData::Empty() const { | |
| 115 return history_info_map_.empty(); | |
| 116 } | |
| 117 | |
| 118 URLIndexPrivateData* URLIndexPrivateData::Snapshot() const { | |
| 119 return new URLIndexPrivateData(*this); | |
| 120 } | |
| 121 | |
| 122 void URLIndexPrivateData::Shutdown() { | |
| 123 base::AutoLock lock(lock_); | |
| 124 shutdown_ = true; | |
| 125 if (cache_enabled()) | |
| 126 cache_db_->Shutdown(); | |
| 127 } | |
| 128 | |
| 129 // Helper predicates for ValidateConsistency. | |
| 130 template <typename Pair> | |
| 131 struct SelectFirst : std::unary_function<Pair, typename Pair::first_type> { | |
| 132 const typename Pair::first_type& operator()(const Pair& x) const { | |
| 133 return x.first; | |
| 134 } | |
| 135 }; | |
| 136 | |
| 137 template <typename Pair> | |
| 138 struct SelectSecond : std::unary_function<Pair, typename Pair::second_type> { | |
| 139 const typename Pair::second_type& operator()(const Pair& x) const { | |
| 140 return x.second; | |
| 141 } | |
| 142 }; | |
| 143 | |
| 144 template <typename SourcePair, typename Target> | |
| 145 struct TargetInserter : std::unary_function<SourcePair, void> { | |
| 146 explicit TargetInserter(Target* target) : target_(target) {} | |
| 147 | |
| 148 void operator()(const SourcePair& source) { | |
| 149 target_->insert(source.second.begin(), source.second.end()); | |
| 150 } | |
| 151 | |
| 152 Target* target_; | |
| 153 }; | |
| 154 | |
| 155 bool URLIndexPrivateData::ValidateConsistency() const { | |
| 156 // Scope things so that a large data set doesn't unnecessarily accumulate and | |
| 157 // hang around. | |
| 158 { | |
| 159 // Make a set of WordIDs from word_map_. | |
| 160 WordIDSet word_id_set_a; | |
| 161 std::transform(word_map_.begin(), word_map_.end(), | |
| 162 std::inserter(word_id_set_a, word_id_set_a.begin()), | |
| 163 SelectSecond<WordMap::value_type>()); | |
| 164 | |
| 165 // Compare word_map_'s word set to the words from word_id_history_map_. | |
| 166 { | |
| 167 WordIDSet word_id_set_b; | |
| 168 std::transform(word_id_history_map_.begin(), word_id_history_map_.end(), | |
| 169 std::inserter(word_id_set_b, word_id_set_b.begin()), | |
| 170 SelectFirst<WordIDHistoryMap::value_type>()); | |
| 171 WordIDSet leftovers; | |
| 172 std::set_symmetric_difference( | |
| 173 word_id_set_a.begin(), word_id_set_a.end(), | |
| 174 word_id_set_b.begin(), word_id_set_b.end(), | |
| 175 std::inserter(leftovers, leftovers.begin())); | |
| 176 if (!leftovers.empty()) | |
| 177 return false; | |
| 178 } | |
| 179 | |
| 180 // Compare word_map_'s word set to the words from history_id_word_map_. | |
| 181 { | |
| 182 WordIDSet word_id_set_b; | |
| 183 std::for_each(history_id_word_map_.begin(), history_id_word_map_.end(), | |
| 184 TargetInserter<HistoryIDWordMap::value_type, | |
| 185 WordIDSet>(&word_id_set_b)); | |
| 186 WordIDSet leftovers; | |
| 187 std::set_symmetric_difference( | |
| 188 word_id_set_a.begin(), word_id_set_a.end(), | |
| 189 word_id_set_b.begin(), word_id_set_b.end(), | |
| 190 std::inserter(leftovers, leftovers.begin())); | |
| 191 if (!leftovers.empty()) | |
| 192 return false; | |
| 193 } | |
| 194 | |
| 195 // Compare word_map_'s word set to the words from char_word_map_. | |
| 196 { | |
| 197 WordIDSet word_id_set_b; | |
| 198 std::for_each(char_word_map_.begin(), char_word_map_.end(), | |
| 199 TargetInserter<CharWordIDMap::value_type, | |
| 200 WordIDSet>(&word_id_set_b)); | |
| 201 WordIDSet leftovers; | |
| 202 std::set_symmetric_difference( | |
| 203 word_id_set_a.begin(), word_id_set_a.end(), | |
| 204 word_id_set_b.begin(), word_id_set_b.end(), | |
| 205 std::inserter(leftovers, leftovers.begin())); | |
| 206 if (!leftovers.empty()) | |
| 207 return false; | |
| 208 } | |
| 209 | |
| 210 // Intersect available_words_ with set of WordIDs (created above from | |
| 211 // word_map_) and test for no common WordIDs. | |
| 212 { | |
| 213 WordIDSet leftovers; | |
| 214 std::set_intersection(word_id_set_a.begin(), word_id_set_a.end(), | |
| 215 available_words_.begin(), available_words_.end(), | |
| 216 std::inserter(leftovers, leftovers.begin())); | |
| 217 if (!leftovers.empty()) | |
| 218 return false; | |
| 219 } | |
| 220 } | |
| 221 | |
| 222 { | |
| 223 // Make a set of HistoryIDs from history_info_map_. | |
| 224 HistoryIDSet history_id_set_a; | |
| 225 std::transform(history_info_map_.begin(), history_info_map_.end(), | |
| 226 std::inserter(history_id_set_a, history_id_set_a.begin()), | |
| 227 SelectFirst<HistoryInfoMap::value_type>()); | |
| 228 | |
| 229 // Compare history_info_map_'s set to word_id_history_map_'s HistoryIDs. | |
| 230 { | |
| 231 HistoryIDSet history_id_set_b; | |
| 232 std::transform(history_info_map_.begin(), history_info_map_.end(), | |
| 233 std::inserter(history_id_set_b, history_id_set_b.begin()), | |
| 234 SelectFirst<HistoryInfoMap::value_type>()); | |
| 235 HistoryIDSet leftovers; | |
| 236 std::set_symmetric_difference( | |
| 237 history_id_set_a.begin(), history_id_set_a.end(), | |
| 238 history_id_set_b.begin(), history_id_set_b.end(), | |
| 239 std::inserter(leftovers, leftovers.begin())); | |
| 240 if (!leftovers.empty()) | |
| 241 return false; | |
| 242 } | |
| 243 | |
| 244 // Compare history_info_map_'s set to word_id_history_map_'s HistoryIDs. | |
| 245 { | |
| 246 HistoryIDSet history_id_set_b; | |
| 247 std::for_each(word_id_history_map_.begin(), word_id_history_map_.end(), | |
| 248 TargetInserter<WordIDHistoryMap::value_type, | |
| 249 HistoryIDSet>(&history_id_set_b)); | |
| 250 HistoryIDSet leftovers; | |
| 251 std::set_symmetric_difference( | |
| 252 history_id_set_a.begin(), history_id_set_a.end(), | |
| 253 history_id_set_b.begin(), history_id_set_b.end(), | |
| 254 std::inserter(leftovers, leftovers.begin())); | |
| 255 if (!leftovers.empty()) | |
| 256 return false; | |
| 257 } | |
| 258 | |
| 259 // Compare history_info_map_'s set to word_starts_map_'s HistoryIDs. | |
| 260 { | |
| 261 HistoryIDSet history_id_set_b; | |
| 262 std::transform(word_starts_map_.begin(), word_starts_map_.end(), | |
| 263 std::inserter(history_id_set_b, history_id_set_b.begin()), | |
| 264 SelectFirst<WordStartsMap::value_type>()); | |
| 265 HistoryIDSet leftovers; | |
| 266 std::set_symmetric_difference( | |
| 267 history_id_set_a.begin(), history_id_set_a.end(), | |
| 268 history_id_set_b.begin(), history_id_set_b.end(), | |
| 269 std::inserter(leftovers, leftovers.begin())); | |
| 270 if (!leftovers.empty()) | |
| 271 return false; | |
| 272 } | |
| 273 } | |
| 274 | |
| 275 // Make sets of words from word_list_ and word_map_ and test for equality. | |
| 276 { | |
| 277 String16Set word_list_set; | |
| 278 std::copy(word_list_.begin(), word_list_.end(), | |
| 279 std::inserter(word_list_set, word_list_set.end())); | |
| 280 String16Set word_map_set; | |
| 281 std::transform(word_map_.begin(), word_map_.end(), | |
| 282 std::inserter(word_map_set, word_map_set.begin()), | |
| 283 SelectFirst<WordMap::value_type>()); | |
| 284 if (word_list_set != word_map_set) | |
| 285 return false; | |
| 286 } | |
| 287 | |
| 288 return true; | |
| 289 } | |
| 290 | |
| 291 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | |
| 292 const string16& search_string) { | |
| 293 pre_filter_item_count_ = 0; | |
| 294 post_filter_item_count_ = 0; | |
| 295 post_scoring_item_count_ = 0; | |
| 296 // The search string we receive may contain escaped characters. For reducing | |
| 297 // the index we need individual, lower-cased words, ignoring escapings. For | |
| 298 // the final filtering we need whitespace separated substrings possibly | |
| 299 // containing escaped characters. | |
| 300 string16 lower_raw_string(base::i18n::ToLower(search_string)); | |
| 301 string16 lower_unescaped_string = | |
| 302 net::UnescapeURLComponent(lower_raw_string, | |
| 303 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
| 304 // Extract individual 'words' (as opposed to 'terms'; see below) from the | |
| 305 // search string. When the user types "colspec=ID%20Mstone Release" we get | |
| 306 // four 'words': "colspec", "id", "mstone" and "release". | |
| 307 String16Vector lower_words( | |
| 308 history::String16VectorFromString16(lower_unescaped_string, false, NULL)); | |
| 309 ScoredHistoryMatches scored_items; | |
| 310 | |
| 311 // Do nothing if we have indexed no words (probably because we've not been | |
| 312 // initialized yet) or the search string has no words. | |
| 313 if (word_list_.empty() || lower_words.empty()) { | |
| 314 search_term_cache_.clear(); // Invalidate the term cache. | |
| 315 return scored_items; | |
| 316 } | |
| 317 | |
| 318 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | |
| 319 // approach. | |
| 320 ResetSearchTermCache(); | |
| 321 | |
| 322 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | |
| 323 | |
| 324 // Trim the candidate pool if it is large. Note that we do not filter out | |
| 325 // items that do not contain the search terms as proper substrings -- doing | |
| 326 // so is the performance-costly operation we are trying to avoid in order | |
| 327 // to maintain omnibox responsiveness. | |
| 328 const size_t kItemsToScoreLimit = 500; | |
| 329 pre_filter_item_count_ = history_id_set.size(); | |
| 330 // If we trim the results set we do not want to cache the results for next | |
| 331 // time as the user's ultimately desired result could easily be eliminated | |
| 332 // in this early rough filter. | |
| 333 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
| 334 if (was_trimmed) { | |
| 335 HistoryIDVector history_ids; | |
| 336 std::copy(history_id_set.begin(), history_id_set.end(), | |
| 337 std::back_inserter(history_ids)); | |
| 338 // Trim down the set by sorting by typed-count, visit-count, and last | |
| 339 // visit. | |
| 340 HistoryItemFactorGreater | |
| 341 item_factor_functor(history_info_map_); | |
| 342 std::partial_sort(history_ids.begin(), | |
| 343 history_ids.begin() + kItemsToScoreLimit, | |
| 344 history_ids.end(), | |
| 345 item_factor_functor); | |
| 346 history_id_set.clear(); | |
| 347 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
| 348 std::inserter(history_id_set, history_id_set.end())); | |
| 349 post_filter_item_count_ = history_id_set.size(); | |
| 350 } | |
| 351 | |
| 352 // Pass over all of the candidates filtering out any without a proper | |
| 353 // substring match, inserting those which pass in order by score. Note that | |
| 354 // in this step we are using the raw search string complete with escaped | |
| 355 // URL elements. When the user has specifically typed something akin to | |
| 356 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | |
| 357 // specific substring appears in the URL or page title. | |
| 358 | |
| 359 // We call these 'terms' (as opposed to 'words'; see above) as in this case | |
| 360 // we only want to break up the search string on 'true' whitespace rather than | |
| 361 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | |
| 362 // get two 'terms': "colspec=id%20mstone" and "release". | |
| 363 history::String16Vector lower_raw_terms; | |
| 364 Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms); | |
| 365 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), | |
| 366 AddHistoryMatch(*this, lower_raw_string, | |
| 367 lower_raw_terms, base::Time::Now())).ScoredMatches(); | |
| 368 | |
| 369 // Select and sort only the top kMaxMatches results. | |
| 370 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { | |
| 371 std::partial_sort(scored_items.begin(), | |
| 372 scored_items.begin() + | |
| 373 AutocompleteProvider::kMaxMatches, | |
| 374 scored_items.end(), | |
| 375 ScoredHistoryMatch::MatchScoreGreater); | |
| 376 scored_items.resize(AutocompleteProvider::kMaxMatches); | |
| 377 } else { | |
| 378 std::sort(scored_items.begin(), scored_items.end(), | |
| 379 ScoredHistoryMatch::MatchScoreGreater); | |
| 380 } | |
| 381 post_scoring_item_count_ = scored_items.size(); | |
| 382 | |
| 383 if (was_trimmed) { | |
| 384 search_term_cache_.clear(); // Invalidate the term cache. | |
| 385 } else { | |
| 386 // Remove any stale SearchTermCacheItems. | |
| 387 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
| 388 cache_iter != search_term_cache_.end(); ) { | |
| 389 if (!cache_iter->second.used_) | |
| 390 search_term_cache_.erase(cache_iter++); | |
| 391 else | |
| 392 ++cache_iter; | |
| 393 } | |
| 394 } | |
| 395 | |
| 396 return scored_items; | |
| 397 } | |
| 398 | |
| 399 bool URLIndexPrivateData::UpdateURL(const URLRow& row) { | |
| 400 // The row may or may not already be in our index. If it is not already | |
| 401 // indexed and it qualifies then it gets indexed. If it is already | |
| 402 // indexed and still qualifies then it gets updated, otherwise it | |
| 403 // is deleted from the index. | |
| 404 bool row_was_updated = false; | |
| 405 URLID row_id = row.id(); | |
| 406 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); | |
| 407 if (row_pos == history_info_map_.end()) { | |
| 408 // This new row should be indexed if it qualifies. | |
| 409 URLRow new_row(row); | |
| 410 new_row.set_id(row_id); | |
| 411 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) && | |
| 412 IndexRow(new_row); | |
| 413 } else if (RowQualifiesAsSignificant(row, base::Time())) { | |
| 414 // This indexed row still qualifies and will be re-indexed. | |
| 415 // The url won't have changed but the title, visit count, etc. | |
| 416 // might have changed. | |
| 417 URLRow& row_to_update = row_pos->second; | |
| 418 bool title_updated = row_to_update.title() != row.title(); | |
| 419 if (row_to_update.visit_count() != row.visit_count() || | |
| 420 row_to_update.typed_count() != row.typed_count() || | |
| 421 row_to_update.last_visit() != row.last_visit() || title_updated) { | |
| 422 CacheTransaction cache_transaction(cache_enabled() ? | |
| 423 cache_db_.get() : NULL); | |
| 424 row_to_update.set_visit_count(row.visit_count()); | |
| 425 row_to_update.set_typed_count(row.typed_count()); | |
| 426 row_to_update.set_last_visit(row.last_visit()); | |
| 427 row_to_update.set_hidden(row.hidden()); | |
| 428 // While the URL is guaranteed to remain stable, the title may have | |
| 429 // changed. If so, then update the index with the changed words. | |
| 430 if (title_updated) { | |
| 431 // Clear all words associated with this row and re-index both the | |
| 432 // URL and title. | |
| 433 RemoveRowWordsFromIndex(row_to_update); | |
| 434 row_to_update.set_title(row.title()); | |
| 435 RowWordStarts word_starts; | |
| 436 AddRowWordsToIndex(row_to_update, &word_starts); | |
| 437 word_starts_map_[row_id] = word_starts; | |
| 438 // Replace all word_starts for the row. | |
| 439 if (cache_enabled()) { | |
| 440 cache_db_->RemoveWordStarts(row_id); | |
| 441 cache_db_->AddRowWordStarts(row_id, word_starts); | |
| 442 } | |
| 443 } | |
| 444 if (cache_enabled()) { | |
| 445 cache_db_->RemoveHistoryIDFromURLs(row_id); | |
| 446 cache_db_->AddHistoryToURLs(row_id, row); | |
| 447 } | |
| 448 row_was_updated = true; | |
| 449 } | |
| 450 } else { | |
| 451 // This indexed row no longer qualifies and will be de-indexed by | |
| 452 // clearing all words associated with this row. | |
| 453 RemoveRowFromIndex(row); | |
| 454 row_was_updated = true; | |
| 455 } | |
| 456 if (row_was_updated) | |
| 457 search_term_cache_.clear(); // This invalidates the cache. | |
| 458 return row_was_updated; | |
| 459 } | |
| 460 | |
| 461 // Helper functor for DeleteURL. | |
| 462 class HistoryInfoMapItemHasURL { | |
| 463 public: | |
| 464 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} | |
| 465 | |
| 466 bool operator()(const std::pair<const HistoryID, URLRow>& item) { | |
| 467 return item.second.url() == url_; | |
| 468 } | |
| 469 | |
| 470 private: | |
| 471 const GURL& url_; | |
| 472 }; | |
| 473 | |
| 474 bool URLIndexPrivateData::DeleteURL(const GURL& url) { | |
| 475 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
| 476 BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
| 477 // Find the matching entry in the history_info_map_. | |
| 478 HistoryInfoMap::iterator pos = std::find_if( | |
| 479 history_info_map_.begin(), | |
| 480 history_info_map_.end(), | |
| 481 HistoryInfoMapItemHasURL(url)); | |
| 482 if (pos == history_info_map_.end()) | |
| 483 return false; | |
| 484 RemoveRowFromIndex(pos->second); | |
| 485 search_term_cache_.clear(); // This invalidates the cache. | |
| 486 return true; | |
| 487 } | |
| 488 | |
| 489 bool URLIndexPrivateData::RestoreFromCacheTask() { | |
| 490 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
| 491 !BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
| 492 DCHECK(cache_db_); | |
| 493 if (IsShutdown()) | |
| 494 return false; | |
| 495 Clear(); | |
| 496 if (RestoreFromCache(cache_db_)) | |
| 497 return true; | |
| 498 // If there was no SQLite-based cache then there might be an old, | |
| 499 // deprecated, protobuf-based cache file laying around. Get rid of it. | |
| 500 DeleteOldVersionCacheFile(); | |
| 501 return false; | |
| 502 } | |
| 503 | |
| 504 // static | |
| 505 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory( | |
| 506 HistoryDatabase* history_db, | |
| 507 scoped_refptr<URLIndexPrivateData> old_data) { | |
| 508 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
| 509 !BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
| 510 if (!history_db) | |
| 511 return NULL; | |
| 512 | |
| 513 scoped_refptr<URLIndexPrivateData> rebuilt_data( | |
| 514 new URLIndexPrivateData(*(old_data.get()))); | |
| 515 | |
| 516 // NOTE: We disable the cache database until after the replacement private | |
| 517 // data has been created and then we re-enable the database. Once the private | |
| 518 // data has been swapped in the database is refreshed with the new data. | |
| 519 URLDatabase::URLEnumerator history_enum; | |
| 520 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) | |
| 521 return NULL; | |
| 522 | |
| 523 for (URLRow row; !old_data->IsShutdown() && history_enum.GetNextURL(&row); ) | |
| 524 rebuilt_data->IndexRow(row); | |
| 525 | |
| 526 if (old_data->IsShutdown()) { | |
| 527 rebuilt_data.release(); | |
| 528 } | |
| 529 | |
| 530 return rebuilt_data; | |
| 531 } | |
| 532 | |
| 533 void URLIndexPrivateData::RefreshCacheTask() { | |
| 534 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
| 535 !BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
| 536 base::AutoLock lock(lock_); | |
| 537 if (shutdown_ || !cache_enabled()) | |
| 538 return; | |
| 539 // Should a refresh fail for any reason we consider the database to be | |
| 540 // corrupt and unavailable; no further remedial action is possible. | |
| 541 set_cache_enabled(cache_db_->Reset() && cache_db_->Refresh(*this)); | |
| 542 } | |
| 543 | |
| 544 // static | |
| 545 void URLIndexPrivateData::InitializeSchemeWhitelistForTesting( | |
| 546 std::set<std::string>* whitelist) { | |
| 547 InitializeSchemeWhitelist(whitelist); | |
| 548 } | |
| 55 | 549 |
| 56 // SearchTermCacheItem --------------------------------------------------------- | 550 // SearchTermCacheItem --------------------------------------------------------- |
| 57 | 551 |
| 58 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( | 552 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( |
| 59 const WordIDSet& word_id_set, | 553 const WordIDSet& word_id_set, |
| 60 const HistoryIDSet& history_id_set) | 554 const HistoryIDSet& history_id_set) |
| 61 : word_id_set_(word_id_set), | 555 : word_id_set_(word_id_set), |
| 62 history_id_set_(history_id_set), | 556 history_id_set_(history_id_set), |
| 63 used_(true) {} | 557 used_(true) {} |
| 64 | 558 |
| 65 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() | 559 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() |
| 66 : used_(true) {} | 560 : used_(true) {} |
| 67 | 561 |
| 68 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {} | 562 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {} |
| 69 | 563 |
| 70 // Algorithm Functions --------------------------------------------------------- | 564 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- |
| 71 | 565 |
| 72 // Comparison function for sorting search terms by descending length. | 566 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( |
| 73 bool LengthGreater(const string16& string_a, const string16& string_b) { | 567 const URLIndexPrivateData& private_data, |
| 74 return string_a.length() > string_b.length(); | 568 const string16& lower_string, |
| 569 const String16Vector& lower_terms, | |
| 570 const base::Time now) | |
| 571 : private_data_(private_data), | |
| 572 lower_string_(lower_string), | |
| 573 lower_terms_(lower_terms), | |
| 574 now_(now) {} | |
| 575 | |
| 576 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {} | |
| 577 | |
| 578 void URLIndexPrivateData::AddHistoryMatch::operator()( | |
| 579 const HistoryID history_id) { | |
| 580 HistoryInfoMap::const_iterator hist_pos = | |
| 581 private_data_.history_info_map_.find(history_id); | |
| 582 if (hist_pos != private_data_.history_info_map_.end()) { | |
| 583 const URLRow& hist_item = hist_pos->second; | |
| 584 WordStartsMap::const_iterator starts_pos = | |
| 585 private_data_.word_starts_map_.find(history_id); | |
| 586 DCHECK(starts_pos != private_data_.word_starts_map_.end()); | |
| 587 ScoredHistoryMatch match(hist_item, lower_string_, lower_terms_, | |
| 588 starts_pos->second, now_); | |
| 589 if (match.raw_score > 0) | |
| 590 scored_matches_.push_back(match); | |
| 591 } | |
| 75 } | 592 } |
| 76 | 593 |
| 77 // InMemoryURLIndex's Private Data --------------------------------------------- | 594 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- |
| 78 | 595 |
| 79 URLIndexPrivateData::URLIndexPrivateData() | 596 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( |
| 80 : restored_cache_version_(0), | 597 const HistoryInfoMap& history_info_map) |
| 81 saved_cache_version_(kCurrentCacheFileVersion), | 598 : history_info_map_(history_info_map) { |
| 599 } | |
| 600 | |
| 601 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} | |
| 602 | |
| 603 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()( | |
| 604 const HistoryID h1, | |
| 605 const HistoryID h2) { | |
| 606 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); | |
| 607 if (entry1 == history_info_map_.end()) | |
| 608 return false; | |
| 609 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); | |
| 610 if (entry2 == history_info_map_.end()) | |
| 611 return true; | |
| 612 const URLRow& r1(entry1->second); | |
| 613 const URLRow& r2(entry2->second); | |
| 614 // First cut: typed count, visit count, recency. | |
| 615 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | |
| 616 // recently visited (within the last 12/24 hours) as highly important. Get | |
| 617 // input from mpearson. | |
| 618 if (r1.typed_count() != r2.typed_count()) | |
| 619 return (r1.typed_count() > r2.typed_count()); | |
| 620 if (r1.visit_count() != r2.visit_count()) | |
| 621 return (r1.visit_count() > r2.visit_count()); | |
| 622 return (r1.last_visit() > r2.last_visit()); | |
| 623 } | |
| 624 | |
| 625 // URLIndexPrivateData Private Functions --------------------------------------- | |
| 626 | |
| 627 URLIndexPrivateData::URLIndexPrivateData(const URLIndexPrivateData& old_data) | |
| 628 : history_dir_(old_data.history_dir_), | |
| 629 languages_(old_data.languages_), | |
| 630 cache_enabled_(old_data.cache_enabled_), | |
| 631 shutdown_(old_data.IsShutdown()), | |
| 632 lock_(), | |
| 633 scheme_whitelist_(old_data.scheme_whitelist_), | |
| 82 pre_filter_item_count_(0), | 634 pre_filter_item_count_(0), |
| 83 post_filter_item_count_(0), | 635 post_filter_item_count_(0), |
| 84 post_scoring_item_count_(0) { | 636 post_scoring_item_count_(0) { |
| 637 cache_db_ = old_data.cache_db_; | |
| 638 } | |
| 639 | |
| 640 // Called only by unit tests. | |
| 641 URLIndexPrivateData::URLIndexPrivateData() | |
| 642 : cache_enabled_(true), | |
| 643 shutdown_(false), | |
| 644 lock_(), | |
| 645 pre_filter_item_count_(0), | |
| 646 post_filter_item_count_(0), | |
| 647 post_scoring_item_count_(0) { | |
| 648 InitializeSchemeWhitelist(&scheme_whitelist_); | |
| 85 } | 649 } |
| 86 | 650 |
| 87 URLIndexPrivateData::~URLIndexPrivateData() {} | 651 URLIndexPrivateData::~URLIndexPrivateData() {} |
| 88 | 652 |
| 653 bool URLIndexPrivateData::IsShutdown() const { | |
| 654 base::AutoLock lock(lock_); | |
| 655 return shutdown_; | |
|
sky
2012/08/22 19:42:34
nit: spacing.
| |
| 656 } | |
| 657 | |
| 89 void URLIndexPrivateData::Clear() { | 658 void URLIndexPrivateData::Clear() { |
| 90 word_list_.clear(); | 659 word_list_.clear(); |
| 91 available_words_.clear(); | 660 available_words_.clear(); |
| 92 word_map_.clear(); | 661 word_map_.clear(); |
| 93 char_word_map_.clear(); | 662 char_word_map_.clear(); |
| 94 word_id_history_map_.clear(); | 663 word_id_history_map_.clear(); |
| 95 history_id_word_map_.clear(); | 664 history_id_word_map_.clear(); |
| 96 history_info_map_.clear(); | 665 history_info_map_.clear(); |
| 97 word_starts_map_.clear(); | 666 word_starts_map_.clear(); |
| 98 } | 667 } |
| 99 | 668 |
| 100 bool URLIndexPrivateData::Empty() const { | |
| 101 return history_info_map_.empty(); | |
| 102 } | |
| 103 | |
| 104 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const { | |
| 105 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData; | |
| 106 data_copy->word_list_ = word_list_; | |
| 107 data_copy->available_words_ = available_words_; | |
| 108 data_copy->word_map_ = word_map_; | |
| 109 data_copy->char_word_map_ = char_word_map_; | |
| 110 data_copy->word_id_history_map_ = word_id_history_map_; | |
| 111 data_copy->history_id_word_map_ = history_id_word_map_; | |
| 112 data_copy->history_info_map_ = history_info_map_; | |
| 113 data_copy->word_starts_map_ = word_starts_map_; | |
| 114 return data_copy; | |
| 115 // Not copied: | |
| 116 // search_term_cache_ | |
| 117 // pre_filter_item_count_ | |
| 118 // post_filter_item_count_ | |
| 119 // post_scoring_item_count_ | |
| 120 }; | |
| 121 | |
| 122 // Cache Updating -------------------------------------------------------------- | 669 // Cache Updating -------------------------------------------------------------- |
| 123 | 670 |
| 124 bool URLIndexPrivateData::IndexRow( | 671 bool URLIndexPrivateData::IndexRow(const URLRow& row) { |
| 125 const URLRow& row, | |
| 126 const std::string& languages, | |
| 127 const std::set<std::string>& scheme_whitelist) { | |
| 128 const GURL& gurl(row.url()); | 672 const GURL& gurl(row.url()); |
| 129 | 673 |
| 130 // Index only URLs with a whitelisted scheme. | 674 // Index only URLs with a whitelisted scheme. |
| 131 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist)) | 675 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist_)) |
| 132 return false; | 676 return false; |
| 133 | 677 |
| 134 URLID row_id = row.id(); | |
| 135 // Strip out username and password before saving and indexing. | 678 // Strip out username and password before saving and indexing. |
| 136 string16 url(net::FormatUrl(gurl, languages, | 679 string16 url(net::FormatUrl(gurl, languages_, |
| 137 net::kFormatUrlOmitUsernamePassword, | 680 net::kFormatUrlOmitUsernamePassword, |
| 138 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, | 681 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, |
| 139 NULL, NULL, NULL)); | 682 NULL, NULL, NULL)); |
| 140 | 683 |
| 684 URLID row_id = row.id(); | |
| 141 HistoryID history_id = static_cast<HistoryID>(row_id); | 685 HistoryID history_id = static_cast<HistoryID>(row_id); |
| 686 DCHECK(history_info_map_.find(history_id) == history_info_map_.end()); | |
| 142 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max()); | 687 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max()); |
| 143 | 688 |
| 144 // Add the row for quick lookup in the history info store. | 689 // Add the row for quick lookup in the history info store. |
| 145 URLRow new_row(GURL(url), row_id); | 690 URLRow new_row(GURL(url), row_id); |
| 146 new_row.set_visit_count(row.visit_count()); | 691 new_row.set_visit_count(row.visit_count()); |
| 147 new_row.set_typed_count(row.typed_count()); | 692 new_row.set_typed_count(row.typed_count()); |
| 148 new_row.set_last_visit(row.last_visit()); | 693 new_row.set_last_visit(row.last_visit()); |
| 149 new_row.set_title(row.title()); | 694 new_row.set_title(row.title()); |
| 695 | |
| 696 CacheTransaction cache_transaction(cache_enabled() ? cache_db_.get() : NULL); | |
| 150 history_info_map_[history_id] = new_row; | 697 history_info_map_[history_id] = new_row; |
| 698 if (cache_enabled()) | |
| 699 cache_db_->AddHistoryToURLs(history_id, row); | |
| 151 | 700 |
| 152 // Index the words contained in the URL and title of the row. | 701 // Index the words contained in the URL and title of the row. |
| 153 RowWordStarts word_starts; | 702 RowWordStarts word_starts; |
| 154 AddRowWordsToIndex(new_row, &word_starts, languages); | 703 AddRowWordsToIndex(new_row, &word_starts); |
| 155 word_starts_map_[history_id] = word_starts; | 704 word_starts_map_[history_id] = word_starts; |
| 705 if (cache_enabled()) | |
| 706 cache_db_->AddRowWordStarts(history_id, word_starts); | |
| 156 return true; | 707 return true; |
| 157 } | 708 } |
| 158 | 709 |
| 159 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row, | 710 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row, |
| 160 RowWordStarts* word_starts, | 711 RowWordStarts* word_starts) { |
| 161 const std::string& languages) { | |
| 162 HistoryID history_id = static_cast<HistoryID>(row.id()); | 712 HistoryID history_id = static_cast<HistoryID>(row.id()); |
| 163 // Split URL into individual, unique words then add in the title words. | 713 // Split URL into individual, unique words then add in the title words. |
| 164 const GURL& gurl(row.url()); | 714 const GURL& gurl(row.url()); |
| 165 string16 url(net::FormatUrl(gurl, languages, | 715 string16 url(net::FormatUrl(gurl, languages_, |
| 166 net::kFormatUrlOmitUsernamePassword, | 716 net::kFormatUrlOmitUsernamePassword, |
| 167 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, | 717 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, |
| 168 NULL, NULL, NULL)); | 718 NULL, NULL, NULL)); |
| 169 url = base::i18n::ToLower(url); | 719 url = base::i18n::ToLower(url); |
| 170 String16Set url_words = String16SetFromString16(url, | 720 String16Set url_words = String16SetFromString16(url, |
| 171 word_starts ? &word_starts->url_word_starts_ : NULL); | 721 word_starts ? &word_starts->url_word_starts_ : NULL); |
| 172 String16Set title_words = String16SetFromString16(row.title(), | 722 String16Set title_words = String16SetFromString16(row.title(), |
| 173 word_starts ? &word_starts->title_word_starts_ : NULL); | 723 word_starts ? &word_starts->title_word_starts_ : NULL); |
| 174 String16Set words; | 724 String16Set words; |
| 175 std::set_union(url_words.begin(), url_words.end(), | 725 std::set_union(url_words.begin(), url_words.end(), |
| (...skipping 12 matching lines...) Expand all Loading... | |
| 188 if (word_pos != word_map_.end()) | 738 if (word_pos != word_map_.end()) |
| 189 UpdateWordHistory(word_pos->second, history_id); | 739 UpdateWordHistory(word_pos->second, history_id); |
| 190 else | 740 else |
| 191 AddWordHistory(term, history_id); | 741 AddWordHistory(term, history_id); |
| 192 } | 742 } |
| 193 | 743 |
| 194 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, | 744 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, |
| 195 HistoryID history_id) { | 745 HistoryID history_id) { |
| 196 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); | 746 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); |
| 197 DCHECK(history_pos != word_id_history_map_.end()); | 747 DCHECK(history_pos != word_id_history_map_.end()); |
| 748 // There is no need to record changes to the word_id_history_map_ in the cache | |
| 749 // because it can be rebuilt from the history_id_word_map_'s table. | |
| 198 HistoryIDSet& history_id_set(history_pos->second); | 750 HistoryIDSet& history_id_set(history_pos->second); |
| 199 history_id_set.insert(history_id); | 751 history_id_set.insert(history_id); |
| 200 AddToHistoryIDWordMap(history_id, word_id); | 752 AddToHistoryIDWordMap(history_id, word_id); |
| 753 | |
| 754 // Add word_id/history_id entry to the word_history table | |
| 755 if (cache_enabled()) | |
| 756 cache_db_->AddHistoryToWordHistory(word_id, history_id); | |
| 201 } | 757 } |
| 202 | 758 |
| 203 void URLIndexPrivateData::AddWordHistory(const string16& term, | 759 void URLIndexPrivateData::AddWordHistory(const string16& term, |
| 204 HistoryID history_id) { | 760 HistoryID history_id) { |
| 205 WordID word_id = word_list_.size(); | 761 WordID word_id = word_list_.size(); |
| 206 if (available_words_.empty()) { | 762 if (available_words_.empty()) { |
| 207 word_list_.push_back(term); | 763 word_list_.push_back(term); |
| 208 } else { | 764 } else { |
| 209 word_id = *(available_words_.begin()); | 765 word_id = *(available_words_.begin()); |
| 210 word_list_[word_id] = term; | 766 word_list_[word_id] = term; |
| 211 available_words_.erase(word_id); | 767 available_words_.erase(word_id); |
| 212 } | 768 } |
| 213 word_map_[term] = word_id; | 769 word_map_[term] = word_id; |
| 214 | 770 |
| 771 // Add word_id/term entry to the words table; | |
| 772 if (cache_enabled()) | |
| 773 cache_db_->AddWordToWords(word_id, term); | |
| 774 | |
| 215 HistoryIDSet history_id_set; | 775 HistoryIDSet history_id_set; |
| 216 history_id_set.insert(history_id); | 776 history_id_set.insert(history_id); |
| 217 word_id_history_map_[word_id] = history_id_set; | 777 word_id_history_map_[word_id] = history_id_set; |
| 218 AddToHistoryIDWordMap(history_id, word_id); | 778 AddToHistoryIDWordMap(history_id, word_id); |
| 219 | 779 |
| 780 // Add word_id/history_id entry to the word_history table | |
| 781 if (cache_enabled()) | |
| 782 cache_db_->AddHistoryToWordHistory(word_id, history_id); | |
| 783 | |
| 220 // For each character in the newly added word (i.e. a word that is not | 784 // For each character in the newly added word (i.e. a word that is not |
| 221 // already in the word index), add the word to the character index. | 785 // already in the word index), add the word to the character index. |
| 222 Char16Set characters = Char16SetFromString16(term); | 786 Char16Set characters = Char16SetFromString16(term); |
| 223 for (Char16Set::iterator uni_char_iter = characters.begin(); | 787 for (Char16Set::iterator uni_char_iter = characters.begin(); |
| 224 uni_char_iter != characters.end(); ++uni_char_iter) { | 788 uni_char_iter != characters.end(); ++uni_char_iter) { |
| 225 char16 uni_char = *uni_char_iter; | 789 char16 uni_char = *uni_char_iter; |
| 226 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); | 790 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); |
| 227 if (char_iter != char_word_map_.end()) { | 791 if (char_iter != char_word_map_.end()) { |
| 228 // Update existing entry in the char/word index. | 792 // Update existing entry in the char/word index. |
| 229 WordIDSet& word_id_set(char_iter->second); | 793 WordIDSet& word_id_set(char_iter->second); |
| 230 word_id_set.insert(word_id); | 794 word_id_set.insert(word_id); |
| 231 } else { | 795 } else { |
| 232 // Create a new entry in the char/word index. | 796 // Create a new entry in the char/word index. |
| 233 WordIDSet word_id_set; | 797 WordIDSet word_id_set; |
| 234 word_id_set.insert(word_id); | 798 word_id_set.insert(word_id); |
| 235 char_word_map_[uni_char] = word_id_set; | 799 char_word_map_[uni_char] = word_id_set; |
| 236 } | 800 } |
| 801 // Add uni_char/word_id entry to the char_words database table. | |
| 802 if (cache_enabled()) | |
| 803 cache_db_->AddWordToCharWords(uni_char, word_id); | |
| 237 } | 804 } |
| 238 } | 805 } |
| 239 | 806 |
| 240 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { | 807 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { |
| 808 CacheTransaction cache_transaction(cache_enabled() ? cache_db_.get() : NULL); | |
| 241 RemoveRowWordsFromIndex(row); | 809 RemoveRowWordsFromIndex(row); |
| 242 HistoryID history_id = static_cast<HistoryID>(row.id()); | 810 HistoryID history_id = static_cast<HistoryID>(row.id()); |
| 243 history_info_map_.erase(history_id); | 811 history_info_map_.erase(history_id); |
| 244 word_starts_map_.erase(history_id); | 812 word_starts_map_.erase(history_id); |
| 813 if (cache_enabled()) | |
| 814 cache_db_->RemoveHistoryIDFromURLs(history_id); | |
| 245 } | 815 } |
| 246 | 816 |
| 247 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { | 817 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { |
| 248 // Remove the entries in history_id_word_map_ and word_id_history_map_ for | 818 // Remove the entries in history_id_word_map_ and word_id_history_map_ for |
| 249 // this row. | 819 // this row. |
| 250 HistoryID history_id = static_cast<HistoryID>(row.id()); | 820 HistoryID history_id = static_cast<HistoryID>(row.id()); |
| 251 WordIDSet word_id_set = history_id_word_map_[history_id]; | 821 WordIDSet word_id_set = history_id_word_map_[history_id]; |
| 252 history_id_word_map_.erase(history_id); | 822 history_id_word_map_.erase(history_id); |
| 253 | 823 |
| 254 // Reconcile any changes to word usage. | 824 // Reconcile any changes to word usage. |
| (...skipping 13 matching lines...) Expand all Loading... | |
| 268 char_word_map_[uni_char].erase(word_id); | 838 char_word_map_[uni_char].erase(word_id); |
| 269 if (char_word_map_[uni_char].empty()) | 839 if (char_word_map_[uni_char].empty()) |
| 270 char_word_map_.erase(uni_char); // No longer in use. | 840 char_word_map_.erase(uni_char); // No longer in use. |
| 271 } | 841 } |
| 272 | 842 |
| 273 // Complete the removal of references to the word. | 843 // Complete the removal of references to the word. |
| 274 word_id_history_map_.erase(word_id); | 844 word_id_history_map_.erase(word_id); |
| 275 word_map_.erase(word); | 845 word_map_.erase(word); |
| 276 word_list_[word_id] = string16(); | 846 word_list_[word_id] = string16(); |
| 277 available_words_.insert(word_id); | 847 available_words_.insert(word_id); |
| 848 if (cache_enabled()) | |
| 849 cache_db_->RemoveWordFromWords(word_id); | |
| 278 } | 850 } |
| 851 if (cache_enabled()) | |
| 852 cache_db_->RemoveHistoryIDFromWordHistory(history_id); | |
| 279 } | 853 } |
| 280 | 854 |
| 281 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, | 855 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, |
| 282 WordID word_id) { | 856 WordID word_id) { |
| 283 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); | 857 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); |
| 284 if (iter != history_id_word_map_.end()) { | 858 if (iter != history_id_word_map_.end()) { |
| 285 WordIDSet& word_id_set(iter->second); | 859 WordIDSet& word_id_set(iter->second); |
| 286 word_id_set.insert(word_id); | 860 word_id_set.insert(word_id); |
| 287 } else { | 861 } else { |
| 288 WordIDSet word_id_set; | 862 WordIDSet word_id_set; |
| 289 word_id_set.insert(word_id); | 863 word_id_set.insert(word_id); |
| 290 history_id_word_map_[history_id] = word_id_set; | 864 history_id_word_map_[history_id] = word_id_set; |
| 291 } | 865 } |
| 292 } | 866 } |
| 293 | 867 |
| 294 bool URLIndexPrivateData::UpdateURL( | |
| 295 const URLRow& row, | |
| 296 const std::string& languages, | |
| 297 const std::set<std::string>& scheme_whitelist) { | |
| 298 // The row may or may not already be in our index. If it is not already | |
| 299 // indexed and it qualifies then it gets indexed. If it is already | |
| 300 // indexed and still qualifies then it gets updated, otherwise it | |
| 301 // is deleted from the index. | |
| 302 bool row_was_updated = false; | |
| 303 URLID row_id = row.id(); | |
| 304 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); | |
| 305 if (row_pos == history_info_map_.end()) { | |
| 306 // This new row should be indexed if it qualifies. | |
| 307 URLRow new_row(row); | |
| 308 new_row.set_id(row_id); | |
| 309 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) && | |
| 310 IndexRow(new_row, languages, scheme_whitelist); | |
| 311 } else if (RowQualifiesAsSignificant(row, base::Time())) { | |
| 312 // This indexed row still qualifies and will be re-indexed. | |
| 313 // The url won't have changed but the title, visit count, etc. | |
| 314 // might have changed. | |
| 315 URLRow& row_to_update = row_pos->second; | |
| 316 bool title_updated = row_to_update.title() != row.title(); | |
| 317 if (row_to_update.visit_count() != row.visit_count() || | |
| 318 row_to_update.typed_count() != row.typed_count() || | |
| 319 row_to_update.last_visit() != row.last_visit() || title_updated) { | |
| 320 row_to_update.set_visit_count(row.visit_count()); | |
| 321 row_to_update.set_typed_count(row.typed_count()); | |
| 322 row_to_update.set_last_visit(row.last_visit()); | |
| 323 // While the URL is guaranteed to remain stable, the title may have | |
| 324 // changed. If so, then update the index with the changed words. | |
| 325 if (title_updated) { | |
| 326 // Clear all words associated with this row and re-index both the | |
| 327 // URL and title. | |
| 328 RemoveRowWordsFromIndex(row_to_update); | |
| 329 row_to_update.set_title(row.title()); | |
| 330 RowWordStarts word_starts; | |
| 331 AddRowWordsToIndex(row_to_update, &word_starts, languages); | |
| 332 word_starts_map_[row_id] = word_starts; | |
| 333 } | |
| 334 row_was_updated = true; | |
| 335 } | |
| 336 } else { | |
| 337 // This indexed row no longer qualifies and will be de-indexed by | |
| 338 // clearing all words associated with this row. | |
| 339 RemoveRowFromIndex(row); | |
| 340 row_was_updated = true; | |
| 341 } | |
| 342 if (row_was_updated) | |
| 343 search_term_cache_.clear(); // This invalidates the cache. | |
| 344 return row_was_updated; | |
| 345 } | |
| 346 | |
| 347 // Helper functor for DeleteURL. | |
| 348 class HistoryInfoMapItemHasURL { | |
| 349 public: | |
| 350 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} | |
| 351 | |
| 352 bool operator()(const std::pair<const HistoryID, URLRow>& item) { | |
| 353 return item.second.url() == url_; | |
| 354 } | |
| 355 | |
| 356 private: | |
| 357 const GURL& url_; | |
| 358 }; | |
| 359 | |
| 360 bool URLIndexPrivateData::DeleteURL(const GURL& url) { | |
| 361 // Find the matching entry in the history_info_map_. | |
| 362 HistoryInfoMap::iterator pos = std::find_if( | |
| 363 history_info_map_.begin(), | |
| 364 history_info_map_.end(), | |
| 365 HistoryInfoMapItemHasURL(url)); | |
| 366 if (pos == history_info_map_.end()) | |
| 367 return false; | |
| 368 RemoveRowFromIndex(pos->second); | |
| 369 search_term_cache_.clear(); // This invalidates the cache. | |
| 370 return true; | |
| 371 } | |
| 372 | |
| 373 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- | |
| 374 | |
| 375 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( | |
| 376 const HistoryInfoMap& history_info_map) | |
| 377 : history_info_map_(history_info_map) { | |
| 378 } | |
| 379 | |
| 380 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} | |
| 381 | |
| 382 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()( | |
| 383 const HistoryID h1, | |
| 384 const HistoryID h2) { | |
| 385 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); | |
| 386 if (entry1 == history_info_map_.end()) | |
| 387 return false; | |
| 388 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); | |
| 389 if (entry2 == history_info_map_.end()) | |
| 390 return true; | |
| 391 const URLRow& r1(entry1->second); | |
| 392 const URLRow& r2(entry2->second); | |
| 393 // First cut: typed count, visit count, recency. | |
| 394 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | |
| 395 // recently visited (within the last 12/24 hours) as highly important. Get | |
| 396 // input from mpearson. | |
| 397 if (r1.typed_count() != r2.typed_count()) | |
| 398 return (r1.typed_count() > r2.typed_count()); | |
| 399 if (r1.visit_count() != r2.visit_count()) | |
| 400 return (r1.visit_count() > r2.visit_count()); | |
| 401 return (r1.last_visit() > r2.last_visit()); | |
| 402 } | |
| 403 | |
| 404 // Cache Searching ------------------------------------------------------------- | |
| 405 | |
| 406 // NOTE: This is the main public search function. | |
| 407 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | |
| 408 const string16& search_string) { | |
| 409 pre_filter_item_count_ = 0; | |
| 410 post_filter_item_count_ = 0; | |
| 411 post_scoring_item_count_ = 0; | |
| 412 // The search string we receive may contain escaped characters. For reducing | |
| 413 // the index we need individual, lower-cased words, ignoring escapings. For | |
| 414 // the final filtering we need whitespace separated substrings possibly | |
| 415 // containing escaped characters. | |
| 416 string16 lower_raw_string(base::i18n::ToLower(search_string)); | |
| 417 string16 lower_unescaped_string = | |
| 418 net::UnescapeURLComponent(lower_raw_string, | |
| 419 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
| 420 // Extract individual 'words' (as opposed to 'terms'; see below) from the | |
| 421 // search string. When the user types "colspec=ID%20Mstone Release" we get | |
| 422 // four 'words': "colspec", "id", "mstone" and "release". | |
| 423 String16Vector lower_words( | |
| 424 history::String16VectorFromString16(lower_unescaped_string, false, NULL)); | |
| 425 ScoredHistoryMatches scored_items; | |
| 426 | |
| 427 // Do nothing if we have indexed no words (probably because we've not been | |
| 428 // initialized yet) or the search string has no words. | |
| 429 if (word_list_.empty() || lower_words.empty()) { | |
| 430 search_term_cache_.clear(); // Invalidate the term cache. | |
| 431 return scored_items; | |
| 432 } | |
| 433 | |
| 434 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | |
| 435 // approach. | |
| 436 ResetSearchTermCache(); | |
| 437 | |
| 438 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | |
| 439 | |
| 440 // Trim the candidate pool if it is large. Note that we do not filter out | |
| 441 // items that do not contain the search terms as proper substrings -- doing | |
| 442 // so is the performance-costly operation we are trying to avoid in order | |
| 443 // to maintain omnibox responsiveness. | |
| 444 const size_t kItemsToScoreLimit = 500; | |
| 445 pre_filter_item_count_ = history_id_set.size(); | |
| 446 // If we trim the results set we do not want to cache the results for next | |
| 447 // time as the user's ultimately desired result could easily be eliminated | |
| 448 // in this early rough filter. | |
| 449 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
| 450 if (was_trimmed) { | |
| 451 HistoryIDVector history_ids; | |
| 452 std::copy(history_id_set.begin(), history_id_set.end(), | |
| 453 std::back_inserter(history_ids)); | |
| 454 // Trim down the set by sorting by typed-count, visit-count, and last | |
| 455 // visit. | |
| 456 HistoryItemFactorGreater | |
| 457 item_factor_functor(history_info_map_); | |
| 458 std::partial_sort(history_ids.begin(), | |
| 459 history_ids.begin() + kItemsToScoreLimit, | |
| 460 history_ids.end(), | |
| 461 item_factor_functor); | |
| 462 history_id_set.clear(); | |
| 463 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
| 464 std::inserter(history_id_set, history_id_set.end())); | |
| 465 post_filter_item_count_ = history_id_set.size(); | |
| 466 } | |
| 467 | |
| 468 // Pass over all of the candidates filtering out any without a proper | |
| 469 // substring match, inserting those which pass in order by score. Note that | |
| 470 // in this step we are using the raw search string complete with escaped | |
| 471 // URL elements. When the user has specifically typed something akin to | |
| 472 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | |
| 473 // specific substring appears in the URL or page title. | |
| 474 | |
| 475 // We call these 'terms' (as opposed to 'words'; see above) as in this case | |
| 476 // we only want to break up the search string on 'true' whitespace rather than | |
| 477 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | |
| 478 // get two 'terms': "colspec=id%20mstone" and "release". | |
| 479 history::String16Vector lower_raw_terms; | |
| 480 Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms); | |
| 481 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), | |
| 482 AddHistoryMatch(*this, lower_raw_string, | |
| 483 lower_raw_terms, base::Time::Now())).ScoredMatches(); | |
| 484 | |
| 485 // Select and sort only the top kMaxMatches results. | |
| 486 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { | |
| 487 std::partial_sort(scored_items.begin(), | |
| 488 scored_items.begin() + | |
| 489 AutocompleteProvider::kMaxMatches, | |
| 490 scored_items.end(), | |
| 491 ScoredHistoryMatch::MatchScoreGreater); | |
| 492 scored_items.resize(AutocompleteProvider::kMaxMatches); | |
| 493 } else { | |
| 494 std::sort(scored_items.begin(), scored_items.end(), | |
| 495 ScoredHistoryMatch::MatchScoreGreater); | |
| 496 } | |
| 497 post_scoring_item_count_ = scored_items.size(); | |
| 498 | |
| 499 if (was_trimmed) { | |
| 500 search_term_cache_.clear(); // Invalidate the term cache. | |
| 501 } else { | |
| 502 // Remove any stale SearchTermCacheItems. | |
| 503 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
| 504 cache_iter != search_term_cache_.end(); ) { | |
| 505 if (!cache_iter->second.used_) | |
| 506 search_term_cache_.erase(cache_iter++); | |
| 507 else | |
| 508 ++cache_iter; | |
| 509 } | |
| 510 } | |
| 511 | |
| 512 return scored_items; | |
| 513 } | |
| 514 | |
| 515 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- | |
| 516 | |
| 517 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( | |
| 518 const URLIndexPrivateData& private_data, | |
| 519 const string16& lower_string, | |
| 520 const String16Vector& lower_terms, | |
| 521 const base::Time now) | |
| 522 : private_data_(private_data), | |
| 523 lower_string_(lower_string), | |
| 524 lower_terms_(lower_terms), | |
| 525 now_(now) {} | |
| 526 | |
| 527 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {} | |
| 528 | |
| 529 void URLIndexPrivateData::AddHistoryMatch::operator()( | |
| 530 const HistoryID history_id) { | |
| 531 HistoryInfoMap::const_iterator hist_pos = | |
| 532 private_data_.history_info_map_.find(history_id); | |
| 533 if (hist_pos != private_data_.history_info_map_.end()) { | |
| 534 const URLRow& hist_item = hist_pos->second; | |
| 535 WordStartsMap::const_iterator starts_pos = | |
| 536 private_data_.word_starts_map_.find(history_id); | |
| 537 DCHECK(starts_pos != private_data_.word_starts_map_.end()); | |
| 538 ScoredHistoryMatch match(hist_item, lower_string_, lower_terms_, | |
| 539 starts_pos->second, now_); | |
| 540 if (match.raw_score > 0) | |
| 541 scored_matches_.push_back(match); | |
| 542 } | |
| 543 } | |
| 544 | |
| 545 void URLIndexPrivateData::ResetSearchTermCache() { | 868 void URLIndexPrivateData::ResetSearchTermCache() { |
| 546 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); | 869 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); |
| 547 iter != search_term_cache_.end(); ++iter) | 870 iter != search_term_cache_.end(); ++iter) |
| 548 iter->second.used_ = false; | 871 iter->second.used_ = false; |
| 549 } | 872 } |
| 550 | 873 |
| 551 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( | 874 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( |
| 552 const String16Vector& unsorted_words) { | 875 const String16Vector& unsorted_words) { |
| 553 // Break the terms down into individual terms (words), get the candidate | 876 // Break the terms down into individual terms (words), get the candidate |
| 554 // set for each term, and intersect each to get a final candidate list. | 877 // set for each term, and intersect each to get a final candidate list. |
| (...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 723 std::set_intersection(word_id_set.begin(), word_id_set.end(), | 1046 std::set_intersection(word_id_set.begin(), word_id_set.end(), |
| 724 char_word_id_set.begin(), char_word_id_set.end(), | 1047 char_word_id_set.begin(), char_word_id_set.end(), |
| 725 std::inserter(new_word_id_set, | 1048 std::inserter(new_word_id_set, |
| 726 new_word_id_set.begin())); | 1049 new_word_id_set.begin())); |
| 727 word_id_set.swap(new_word_id_set); | 1050 word_id_set.swap(new_word_id_set); |
| 728 } | 1051 } |
| 729 } | 1052 } |
| 730 return word_id_set; | 1053 return word_id_set; |
| 731 } | 1054 } |
| 732 | 1055 |
| 733 // Cache Saving ---------------------------------------------------------------- | 1056 bool URLIndexPrivateData::RestoreFromCache(InMemoryURLCacheDatabase* cache_db) { |
| 734 | 1057 DCHECK(cache_db); |
| 735 // static | 1058 if (!cache_db->RestorePrivateData(this)) { |
| 736 void URLIndexPrivateData::WritePrivateDataToCacheFileTask( | 1059 cache_db->Reset(); |
| 737 scoped_refptr<URLIndexPrivateData> private_data, | 1060 return false; |
| 738 const FilePath& file_path, | 1061 } |
| 739 scoped_refptr<RefCountedBool> succeeded) { | 1062 return !Empty() && ValidateConsistency(); |
| 740 DCHECK(private_data.get()); | |
| 741 DCHECK(!file_path.empty()); | |
| 742 succeeded->set_value(private_data->SaveToFile(file_path)); | |
| 743 } | 1063 } |
| 744 | 1064 |
| 745 bool URLIndexPrivateData::SaveToFile(const FilePath& file_path) { | 1065 void URLIndexPrivateData::DeleteOldVersionCacheFile() const { |
| 746 base::TimeTicks beginning_time = base::TimeTicks::Now(); | 1066 if (history_dir_.empty()) |
| 747 InMemoryURLIndexCacheItem index_cache; | 1067 return; |
| 748 SavePrivateData(&index_cache); | 1068 FilePath path = history_dir_.Append(chrome::kHQPCacheFilename); |
| 749 std::string data; | 1069 file_util::Delete(path, false); |
| 750 if (!index_cache.SerializeToString(&data)) { | 1070 } |
| 751 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; | 1071 |
| 1072 bool URLIndexPrivateData::GetCacheDBPath(FilePath* file_path) { | |
| 1073 DCHECK(file_path); | |
| 1074 if (history_dir_.empty()) | |
| 752 return false; | 1075 return false; |
| 753 } | 1076 *file_path = history_dir_.Append(chrome::kHQPCacheDBFilename); |
| 754 | |
| 755 int size = data.size(); | |
| 756 if (file_util::WriteFile(file_path, data.c_str(), size) != size) { | |
| 757 LOG(WARNING) << "Failed to write " << file_path.value(); | |
| 758 return false; | |
| 759 } | |
| 760 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", | |
| 761 base::TimeTicks::Now() - beginning_time); | |
| 762 return true; | 1077 return true; |
| 763 } | 1078 } |
| 764 | 1079 |
| 765 void URLIndexPrivateData::SavePrivateData( | 1080 void URLIndexPrivateData::SetCacheDatabaseForTesting( |
| 766 InMemoryURLIndexCacheItem* cache) const { | 1081 InMemoryURLCacheDatabase* test_db) { |
| 767 DCHECK(cache); | 1082 cache_db_ = test_db; |
| 768 cache->set_timestamp(base::Time::Now().ToInternalValue()); | |
| 769 cache->set_version(saved_cache_version_); | |
| 770 // history_item_count_ is no longer used but rather than change the protobuf | |
| 771 // definition use a placeholder. This will go away with the switch to SQLite. | |
| 772 cache->set_history_item_count(0); | |
| 773 SaveWordList(cache); | |
| 774 SaveWordMap(cache); | |
| 775 SaveCharWordMap(cache); | |
| 776 SaveWordIDHistoryMap(cache); | |
| 777 SaveHistoryInfoMap(cache); | |
| 778 SaveWordStartsMap(cache); | |
| 779 } | |
| 780 | |
| 781 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { | |
| 782 if (word_list_.empty()) | |
| 783 return; | |
| 784 WordListItem* list_item = cache->mutable_word_list(); | |
| 785 list_item->set_word_count(word_list_.size()); | |
| 786 for (String16Vector::const_iterator iter = word_list_.begin(); | |
| 787 iter != word_list_.end(); ++iter) | |
| 788 list_item->add_word(UTF16ToUTF8(*iter)); | |
| 789 } | |
| 790 | |
| 791 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { | |
| 792 if (word_map_.empty()) | |
| 793 return; | |
| 794 WordMapItem* map_item = cache->mutable_word_map(); | |
| 795 map_item->set_item_count(word_map_.size()); | |
| 796 for (WordMap::const_iterator iter = word_map_.begin(); | |
| 797 iter != word_map_.end(); ++iter) { | |
| 798 WordMapEntry* map_entry = map_item->add_word_map_entry(); | |
| 799 map_entry->set_word(UTF16ToUTF8(iter->first)); | |
| 800 map_entry->set_word_id(iter->second); | |
| 801 } | |
| 802 } | |
| 803 | |
| 804 void URLIndexPrivateData::SaveCharWordMap( | |
| 805 InMemoryURLIndexCacheItem* cache) const { | |
| 806 if (char_word_map_.empty()) | |
| 807 return; | |
| 808 CharWordMapItem* map_item = cache->mutable_char_word_map(); | |
| 809 map_item->set_item_count(char_word_map_.size()); | |
| 810 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); | |
| 811 iter != char_word_map_.end(); ++iter) { | |
| 812 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); | |
| 813 map_entry->set_char_16(iter->first); | |
| 814 const WordIDSet& word_id_set(iter->second); | |
| 815 map_entry->set_item_count(word_id_set.size()); | |
| 816 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); | |
| 817 set_iter != word_id_set.end(); ++set_iter) | |
| 818 map_entry->add_word_id(*set_iter); | |
| 819 } | |
| 820 } | |
| 821 | |
| 822 void URLIndexPrivateData::SaveWordIDHistoryMap( | |
| 823 InMemoryURLIndexCacheItem* cache) const { | |
| 824 if (word_id_history_map_.empty()) | |
| 825 return; | |
| 826 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); | |
| 827 map_item->set_item_count(word_id_history_map_.size()); | |
| 828 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); | |
| 829 iter != word_id_history_map_.end(); ++iter) { | |
| 830 WordIDHistoryMapEntry* map_entry = | |
| 831 map_item->add_word_id_history_map_entry(); | |
| 832 map_entry->set_word_id(iter->first); | |
| 833 const HistoryIDSet& history_id_set(iter->second); | |
| 834 map_entry->set_item_count(history_id_set.size()); | |
| 835 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); | |
| 836 set_iter != history_id_set.end(); ++set_iter) | |
| 837 map_entry->add_history_id(*set_iter); | |
| 838 } | |
| 839 } | |
| 840 | |
| 841 void URLIndexPrivateData::SaveHistoryInfoMap( | |
| 842 InMemoryURLIndexCacheItem* cache) const { | |
| 843 if (history_info_map_.empty()) | |
| 844 return; | |
| 845 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); | |
| 846 map_item->set_item_count(history_info_map_.size()); | |
| 847 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); | |
| 848 iter != history_info_map_.end(); ++iter) { | |
| 849 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); | |
| 850 map_entry->set_history_id(iter->first); | |
| 851 const URLRow& url_row(iter->second); | |
| 852 // Note: We only save information that contributes to the index so there | |
| 853 // is no need to save search_term_cache_ (not persistent). | |
| 854 map_entry->set_visit_count(url_row.visit_count()); | |
| 855 map_entry->set_typed_count(url_row.typed_count()); | |
| 856 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); | |
| 857 map_entry->set_url(url_row.url().spec()); | |
| 858 map_entry->set_title(UTF16ToUTF8(url_row.title())); | |
| 859 } | |
| 860 } | |
| 861 | |
| 862 void URLIndexPrivateData::SaveWordStartsMap( | |
| 863 InMemoryURLIndexCacheItem* cache) const { | |
| 864 if (word_starts_map_.empty()) | |
| 865 return; | |
| 866 // For unit testing: Enable saving of the cache as an earlier version to | |
| 867 // allow testing of cache file upgrading in ReadFromFile(). | |
| 868 // TODO(mrossetti): Instead of intruding on production code with this kind of | |
| 869 // test harness, save a copy of an older version cache with known results. | |
| 870 // Implement this when switching the caching over to SQLite. | |
| 871 if (saved_cache_version_ < 1) | |
| 872 return; | |
| 873 | |
| 874 WordStartsMapItem* map_item = cache->mutable_word_starts_map(); | |
| 875 map_item->set_item_count(word_starts_map_.size()); | |
| 876 for (WordStartsMap::const_iterator iter = word_starts_map_.begin(); | |
| 877 iter != word_starts_map_.end(); ++iter) { | |
| 878 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry(); | |
| 879 map_entry->set_history_id(iter->first); | |
| 880 const RowWordStarts& word_starts(iter->second); | |
| 881 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin(); | |
| 882 i != word_starts.url_word_starts_.end(); ++i) | |
| 883 map_entry->add_url_word_starts(*i); | |
| 884 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin(); | |
| 885 i != word_starts.title_word_starts_.end(); ++i) | |
| 886 map_entry->add_title_word_starts(*i); | |
| 887 } | |
| 888 } | |
| 889 | |
| 890 // Cache Restoring ------------------------------------------------------------- | |
| 891 | |
| 892 // static | |
| 893 void URLIndexPrivateData::RestoreFromFileTask( | |
| 894 const FilePath& file_path, | |
| 895 scoped_refptr<URLIndexPrivateData> private_data, | |
| 896 const std::string& languages) { | |
| 897 private_data = URLIndexPrivateData::RestoreFromFile(file_path, languages); | |
| 898 } | |
| 899 | |
| 900 // static | |
| 901 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile( | |
| 902 const FilePath& file_path, | |
| 903 const std::string& languages) { | |
| 904 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
| 905 if (!file_util::PathExists(file_path)) | |
| 906 return NULL; | |
| 907 std::string data; | |
| 908 // If there is no cache file then simply give up. This will cause us to | |
| 909 // attempt to rebuild from the history database. | |
| 910 if (!file_util::ReadFileToString(file_path, &data)) | |
| 911 return NULL; | |
| 912 | |
| 913 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData); | |
| 914 InMemoryURLIndexCacheItem index_cache; | |
| 915 if (!index_cache.ParseFromArray(data.c_str(), data.size())) { | |
| 916 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from " | |
| 917 << file_path.value(); | |
| 918 return restored_data; | |
| 919 } | |
| 920 | |
| 921 if (!restored_data->RestorePrivateData(index_cache, languages)) | |
| 922 return NULL; | |
| 923 | |
| 924 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", | |
| 925 base::TimeTicks::Now() - beginning_time); | |
| 926 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", | |
| 927 restored_data->history_id_word_map_.size()); | |
| 928 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); | |
| 929 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", | |
| 930 restored_data->word_map_.size()); | |
| 931 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", | |
| 932 restored_data->char_word_map_.size()); | |
| 933 if (restored_data->Empty()) | |
| 934 return NULL; // 'No data' is the same as a failed reload. | |
| 935 return restored_data; | |
| 936 } | |
| 937 | |
| 938 // static | |
| 939 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory( | |
| 940 HistoryDatabase* history_db, | |
| 941 const std::string& languages, | |
| 942 const std::set<std::string>& scheme_whitelist) { | |
| 943 if (!history_db) | |
| 944 return NULL; | |
| 945 | |
| 946 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
| 947 | |
| 948 scoped_refptr<URLIndexPrivateData> rebuilt_data(new URLIndexPrivateData); | |
| 949 URLDatabase::URLEnumerator history_enum; | |
| 950 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) | |
| 951 return NULL; | |
| 952 for (URLRow row; history_enum.GetNextURL(&row); ) | |
| 953 rebuilt_data->IndexRow(row, languages, scheme_whitelist); | |
| 954 | |
| 955 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", | |
| 956 base::TimeTicks::Now() - beginning_time); | |
| 957 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", | |
| 958 rebuilt_data->history_id_word_map_.size()); | |
| 959 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", | |
| 960 rebuilt_data->word_map_.size()); | |
| 961 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", | |
| 962 rebuilt_data->char_word_map_.size()); | |
| 963 return rebuilt_data; | |
| 964 } | |
| 965 | |
| 966 bool URLIndexPrivateData::RestorePrivateData( | |
| 967 const InMemoryURLIndexCacheItem& cache, | |
| 968 const std::string& languages) { | |
| 969 if (cache.has_version()) | |
| 970 restored_cache_version_ = cache.version(); | |
| 971 return RestoreWordList(cache) && RestoreWordMap(cache) && | |
| 972 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && | |
| 973 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages); | |
| 974 } | |
| 975 | |
| 976 bool URLIndexPrivateData::RestoreWordList( | |
| 977 const InMemoryURLIndexCacheItem& cache) { | |
| 978 if (!cache.has_word_list()) | |
| 979 return false; | |
| 980 const WordListItem& list_item(cache.word_list()); | |
| 981 uint32 expected_item_count = list_item.word_count(); | |
| 982 uint32 actual_item_count = list_item.word_size(); | |
| 983 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 984 return false; | |
| 985 const RepeatedPtrField<std::string>& words(list_item.word()); | |
| 986 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); | |
| 987 iter != words.end(); ++iter) | |
| 988 word_list_.push_back(UTF8ToUTF16(*iter)); | |
| 989 return true; | |
| 990 } | |
| 991 | |
| 992 bool URLIndexPrivateData::RestoreWordMap( | |
| 993 const InMemoryURLIndexCacheItem& cache) { | |
| 994 if (!cache.has_word_map()) | |
| 995 return false; | |
| 996 const WordMapItem& list_item(cache.word_map()); | |
| 997 uint32 expected_item_count = list_item.item_count(); | |
| 998 uint32 actual_item_count = list_item.word_map_entry_size(); | |
| 999 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1000 return false; | |
| 1001 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); | |
| 1002 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); | |
| 1003 iter != entries.end(); ++iter) | |
| 1004 word_map_[UTF8ToUTF16(iter->word())] = iter->word_id(); | |
| 1005 return true; | |
| 1006 } | |
| 1007 | |
| 1008 bool URLIndexPrivateData::RestoreCharWordMap( | |
| 1009 const InMemoryURLIndexCacheItem& cache) { | |
| 1010 if (!cache.has_char_word_map()) | |
| 1011 return false; | |
| 1012 const CharWordMapItem& list_item(cache.char_word_map()); | |
| 1013 uint32 expected_item_count = list_item.item_count(); | |
| 1014 uint32 actual_item_count = list_item.char_word_map_entry_size(); | |
| 1015 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1016 return false; | |
| 1017 const RepeatedPtrField<CharWordMapEntry>& | |
| 1018 entries(list_item.char_word_map_entry()); | |
| 1019 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter = | |
| 1020 entries.begin(); iter != entries.end(); ++iter) { | |
| 1021 expected_item_count = iter->item_count(); | |
| 1022 actual_item_count = iter->word_id_size(); | |
| 1023 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1024 return false; | |
| 1025 char16 uni_char = static_cast<char16>(iter->char_16()); | |
| 1026 WordIDSet word_id_set; | |
| 1027 const RepeatedField<int32>& word_ids(iter->word_id()); | |
| 1028 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); | |
| 1029 jiter != word_ids.end(); ++jiter) | |
| 1030 word_id_set.insert(*jiter); | |
| 1031 char_word_map_[uni_char] = word_id_set; | |
| 1032 } | |
| 1033 return true; | |
| 1034 } | |
| 1035 | |
| 1036 bool URLIndexPrivateData::RestoreWordIDHistoryMap( | |
| 1037 const InMemoryURLIndexCacheItem& cache) { | |
| 1038 if (!cache.has_word_id_history_map()) | |
| 1039 return false; | |
| 1040 const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); | |
| 1041 uint32 expected_item_count = list_item.item_count(); | |
| 1042 uint32 actual_item_count = list_item.word_id_history_map_entry_size(); | |
| 1043 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1044 return false; | |
| 1045 const RepeatedPtrField<WordIDHistoryMapEntry>& | |
| 1046 entries(list_item.word_id_history_map_entry()); | |
| 1047 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = | |
| 1048 entries.begin(); iter != entries.end(); ++iter) { | |
| 1049 expected_item_count = iter->item_count(); | |
| 1050 actual_item_count = iter->history_id_size(); | |
| 1051 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1052 return false; | |
| 1053 WordID word_id = iter->word_id(); | |
| 1054 HistoryIDSet history_id_set; | |
| 1055 const RepeatedField<int64>& history_ids(iter->history_id()); | |
| 1056 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin(); | |
| 1057 jiter != history_ids.end(); ++jiter) { | |
| 1058 history_id_set.insert(*jiter); | |
| 1059 AddToHistoryIDWordMap(*jiter, word_id); | |
| 1060 } | |
| 1061 word_id_history_map_[word_id] = history_id_set; | |
| 1062 } | |
| 1063 return true; | |
| 1064 } | |
| 1065 | |
| 1066 bool URLIndexPrivateData::RestoreHistoryInfoMap( | |
| 1067 const InMemoryURLIndexCacheItem& cache) { | |
| 1068 if (!cache.has_history_info_map()) | |
| 1069 return false; | |
| 1070 const HistoryInfoMapItem& list_item(cache.history_info_map()); | |
| 1071 uint32 expected_item_count = list_item.item_count(); | |
| 1072 uint32 actual_item_count = list_item.history_info_map_entry_size(); | |
| 1073 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1074 return false; | |
| 1075 const RepeatedPtrField<HistoryInfoMapEntry>& | |
| 1076 entries(list_item.history_info_map_entry()); | |
| 1077 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter = | |
| 1078 entries.begin(); iter != entries.end(); ++iter) { | |
| 1079 HistoryID history_id = iter->history_id(); | |
| 1080 GURL url(iter->url()); | |
| 1081 URLRow url_row(url, history_id); | |
| 1082 url_row.set_visit_count(iter->visit_count()); | |
| 1083 url_row.set_typed_count(iter->typed_count()); | |
| 1084 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); | |
| 1085 if (iter->has_title()) { | |
| 1086 string16 title(UTF8ToUTF16(iter->title())); | |
| 1087 url_row.set_title(title); | |
| 1088 } | |
| 1089 history_info_map_[history_id] = url_row; | |
| 1090 } | |
| 1091 return true; | |
| 1092 } | |
| 1093 | |
| 1094 bool URLIndexPrivateData::RestoreWordStartsMap( | |
| 1095 const InMemoryURLIndexCacheItem& cache, | |
| 1096 const std::string& languages) { | |
| 1097 // Note that this function must be called after RestoreHistoryInfoMap() has | |
| 1098 // been run as the word starts may have to be recalculated from the urls and | |
| 1099 // page titles. | |
| 1100 if (cache.has_word_starts_map()) { | |
| 1101 const WordStartsMapItem& list_item(cache.word_starts_map()); | |
| 1102 uint32 expected_item_count = list_item.item_count(); | |
| 1103 uint32 actual_item_count = list_item.word_starts_map_entry_size(); | |
| 1104 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1105 return false; | |
| 1106 const RepeatedPtrField<WordStartsMapEntry>& | |
| 1107 entries(list_item.word_starts_map_entry()); | |
| 1108 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter = | |
| 1109 entries.begin(); iter != entries.end(); ++iter) { | |
| 1110 HistoryID history_id = iter->history_id(); | |
| 1111 RowWordStarts word_starts; | |
| 1112 // Restore the URL word starts. | |
| 1113 const RepeatedField<int32>& url_starts(iter->url_word_starts()); | |
| 1114 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin(); | |
| 1115 jiter != url_starts.end(); ++jiter) | |
| 1116 word_starts.url_word_starts_.push_back(*jiter); | |
| 1117 // Restore the page title word starts. | |
| 1118 const RepeatedField<int32>& title_starts(iter->title_word_starts()); | |
| 1119 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin(); | |
| 1120 jiter != title_starts.end(); ++jiter) | |
| 1121 word_starts.title_word_starts_.push_back(*jiter); | |
| 1122 word_starts_map_[history_id] = word_starts; | |
| 1123 } | |
| 1124 } else { | |
| 1125 // Since the cache did not contain any word starts we must rebuild then from | |
| 1126 // the URL and page titles. | |
| 1127 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); | |
| 1128 iter != history_info_map_.end(); ++iter) { | |
| 1129 RowWordStarts word_starts; | |
| 1130 const URLRow& row(iter->second); | |
| 1131 string16 url(net::FormatUrl(row.url(), languages, | |
| 1132 net::kFormatUrlOmitUsernamePassword, | |
| 1133 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, | |
| 1134 NULL, NULL, NULL)); | |
| 1135 url = base::i18n::ToLower(url); | |
| 1136 String16VectorFromString16(url, false, &word_starts.url_word_starts_); | |
| 1137 String16VectorFromString16( | |
| 1138 row.title(), false, &word_starts.title_word_starts_); | |
| 1139 word_starts_map_[iter->first] = word_starts; | |
| 1140 } | |
| 1141 } | |
| 1142 return true; | |
| 1143 } | 1083 } |
| 1144 | 1084 |
| 1145 // static | 1085 // static |
| 1146 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | 1086 bool URLIndexPrivateData::URLSchemeIsWhitelisted( |
| 1147 const GURL& gurl, | 1087 const GURL& gurl, |
| 1148 const std::set<std::string>& whitelist) { | 1088 const std::set<std::string>& whitelist) { |
| 1149 return whitelist.find(gurl.scheme()) != whitelist.end(); | 1089 return whitelist.find(gurl.scheme()) != whitelist.end(); |
| 1150 } | 1090 } |
| 1151 | 1091 |
| 1152 } // namespace history | 1092 } // namespace history |
| OLD | NEW |