| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/browser/history/url_index_private_data.h" | |
| 6 | |
| 7 #include <functional> | |
| 8 #include <iterator> | |
| 9 #include <limits> | |
| 10 #include <numeric> | |
| 11 #include <string> | |
| 12 #include <vector> | |
| 13 | |
| 14 #include "base/basictypes.h" | |
| 15 #include "base/files/file_util.h" | |
| 16 #include "base/i18n/break_iterator.h" | |
| 17 #include "base/i18n/case_conversion.h" | |
| 18 #include "base/metrics/histogram.h" | |
| 19 #include "base/strings/string_util.h" | |
| 20 #include "base/strings/utf_string_conversions.h" | |
| 21 #include "base/time/time.h" | |
| 22 #include "chrome/browser/history/history_service.h" | |
| 23 #include "chrome/browser/history/in_memory_url_index.h" | |
| 24 #include "components/bookmarks/browser/bookmark_utils.h" | |
| 25 #include "components/history/core/browser/history_database.h" | |
| 26 #include "components/history/core/browser/history_db_task.h" | |
| 27 #include "net/base/net_util.h" | |
| 28 | |
| 29 #if defined(USE_SYSTEM_PROTOBUF) | |
| 30 #include <google/protobuf/repeated_field.h> | |
| 31 #else | |
| 32 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" | |
| 33 #endif | |
| 34 | |
| 35 using google::protobuf::RepeatedField; | |
| 36 using google::protobuf::RepeatedPtrField; | |
| 37 using in_memory_url_index::InMemoryURLIndexCacheItem; | |
| 38 | |
namespace {

// Maximum number of recent visits cached per URL row.  Must stay >=
// ScoredHistoryMatch::kMaxVisitsToScore; the use sites DCHECK this.
// NOTE: the redundant `static` was dropped -- an unnamed namespace already
// gives internal linkage.
const size_t kMaxVisitsToStoreInCache = 10u;

}  // namespace
| 42 | |
| 43 namespace history { | |
| 44 | |
| 45 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; | |
| 46 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; | |
| 47 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; | |
| 48 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; | |
| 49 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry | |
| 50 CharWordMapEntry; | |
| 51 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem | |
| 52 WordIDHistoryMapItem; | |
| 53 typedef imui:: | |
| 54 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry | |
| 55 WordIDHistoryMapEntry; | |
| 56 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; | |
| 57 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry | |
| 58 HistoryInfoMapEntry; | |
| 59 typedef imui:: | |
| 60 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo | |
| 61 HistoryInfoMapEntry_VisitInfo; | |
| 62 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem; | |
| 63 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry | |
| 64 WordStartsMapEntry; | |
| 65 | |
| 66 | |
| 67 // Algorithm Functions --------------------------------------------------------- | |
| 68 | |
| 69 // Comparison function for sorting search terms by descending length. | |
| 70 bool LengthGreater(const base::string16& string_a, | |
| 71 const base::string16& string_b) { | |
| 72 return string_a.length() > string_b.length(); | |
| 73 } | |
| 74 | |
| 75 | |
| 76 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- | |
| 77 | |
// HistoryDBTask used to update the recent visit data for a particular
// row from the history database.  The DB-thread half fetches up to
// kMaxVisitsToStoreInCache recent visits for the row; the main-thread half
// then hands them to the owning URLIndexPrivateData.
class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
 public:
  explicit UpdateRecentVisitsFromHistoryDBTask(
      URLIndexPrivateData* private_data,
      URLID url_id);

  // HistoryDBTask implementation.  RunOnDBThread() runs on the history DB
  // thread; DoneRunOnMainThread() runs afterwards on the main thread.
  bool RunOnDBThread(HistoryBackend* backend,
                     history::HistoryDatabase* db) override;
  void DoneRunOnMainThread() override;

 private:
  ~UpdateRecentVisitsFromHistoryDBTask() override;

  // The URLIndexPrivateData that gets updated after the historyDB
  // task returns.  Not owned.  NOTE(review): assumed to outlive this task --
  // confirm against the scheduling/cancellation code.
  URLIndexPrivateData* private_data_;
  // The ID of the URL to get visits for and then update.
  URLID url_id_;
  // Whether fetching the recent visits for the URL succeeded.
  bool succeeded_;
  // The awaited data that's shown to private_data_ for it to copy and
  // store.
  VisitVector recent_visits_;

  DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
};
| 106 | |
// Records which index to update and which URL's visits to fetch; the actual
// fetch happens later in RunOnDBThread().
UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
    URLIndexPrivateData* private_data,
    URLID url_id)
    : private_data_(private_data),
      url_id_(url_id),
      succeeded_(false) {
}
| 114 | |
| 115 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread( | |
| 116 HistoryBackend* backend, | |
| 117 HistoryDatabase* db) { | |
| 118 // Make sure the private data is going to get as many recent visits as | |
| 119 // ScoredHistoryMatch::GetFrequency() hopes to use. | |
| 120 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore); | |
| 121 succeeded_ = db->GetMostRecentVisitsForURL(url_id_, | |
| 122 kMaxVisitsToStoreInCache, | |
| 123 &recent_visits_); | |
| 124 if (!succeeded_) | |
| 125 recent_visits_.clear(); | |
| 126 return true; // Always claim to be done; do not retry failures. | |
| 127 } | |
| 128 | |
| 129 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() { | |
| 130 if (succeeded_) | |
| 131 private_data_->UpdateRecentVisits(url_id_, recent_visits_); | |
| 132 } | |
| 133 | |
// Empty out-of-line destructor.  |private_data_| is a raw, non-owned pointer
// and is deliberately not freed here.
UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
}
| 136 | |
| 137 | |
| 138 // URLIndexPrivateData --------------------------------------------------------- | |
| 139 | |
// Constructs an empty index.  The saved-cache version is stamped with the
// current file-format version; all filtering/scoring diagnostics start at 0.
URLIndexPrivateData::URLIndexPrivateData()
    : restored_cache_version_(0),
      saved_cache_version_(kCurrentCacheFileVersion),
      pre_filter_item_count_(0),
      post_filter_item_count_(0),
      post_scoring_item_count_(0) {
}
| 147 | |
// Returns up to |max_matches| scored history matches for |search_string|,
// best score first.  |cursor_position| (or base::string16::npos) lets the
// search string be split at the user's cursor; |languages| drives URL
// formatting/word breaking; |builder| constructs each ScoredHistoryMatch.
// Side effects: updates pre/post filter and scoring counters and maintains
// the mark-and-sweep search term cache.
ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    base::string16 search_string,
    size_t cursor_position,
    size_t max_matches,
    const std::string& languages,
    const history::ScoredHistoryMatch::Builder& builder) {
  // If cursor position is set and useful (not at either end of the
  // string), allow the search string to be broken at cursor position.
  // We do this by pretending there's a space where the cursor is.
  if ((cursor_position != base::string16::npos) &&
      (cursor_position < search_string.length()) &&
      (cursor_position > 0)) {
    search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
  }
  pre_filter_item_count_ = 0;
  post_filter_item_count_ = 0;
  post_scoring_item_count_ = 0;
  // The search string we receive may contain escaped characters. For reducing
  // the index we need individual, lower-cased words, ignoring escapings. For
  // the final filtering we need whitespace separated substrings possibly
  // containing escaped characters.
  base::string16 lower_raw_string(base::i18n::ToLower(search_string));
  base::string16 lower_unescaped_string =
      net::UnescapeURLComponent(lower_raw_string,
          net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
  // Extract individual 'words' (as opposed to 'terms'; see below) from the
  // search string. When the user types "colspec=ID%20Mstone Release" we get
  // four 'words': "colspec", "id", "mstone" and "release".
  String16Vector lower_words(
      history::String16VectorFromString16(lower_unescaped_string, false, NULL));
  ScoredHistoryMatches scored_items;

  // Do nothing if we have indexed no words (probably because we've not been
  // initialized yet) or the search string has no words.
  if (word_list_.empty() || lower_words.empty()) {
    search_term_cache_.clear();  // Invalidate the term cache.
    return scored_items;
  }

  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
  // approach: HistoryIDsForTerm() marks entries it touches; unused entries
  // are swept at the end of this function.
  ResetSearchTermCache();

  HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

  // Trim the candidate pool if it is large. Note that we do not filter out
  // items that do not contain the search terms as proper substrings -- doing
  // so is the performance-costly operation we are trying to avoid in order
  // to maintain omnibox responsiveness.
  const size_t kItemsToScoreLimit = 500;
  pre_filter_item_count_ = history_id_set.size();
  // If we trim the results set we do not want to cache the results for next
  // time as the user's ultimately desired result could easily be eliminated
  // in this early rough filter.
  bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
  if (was_trimmed) {
    HistoryIDVector history_ids;
    std::copy(history_id_set.begin(), history_id_set.end(),
              std::back_inserter(history_ids));
    // Trim down the set by sorting by typed-count, visit-count, and last
    // visit.  partial_sort is safe here: was_trimmed guarantees
    // history_ids.size() > kItemsToScoreLimit.
    HistoryItemFactorGreater
        item_factor_functor(history_info_map_);
    std::partial_sort(history_ids.begin(),
                      history_ids.begin() + kItemsToScoreLimit,
                      history_ids.end(),
                      item_factor_functor);
    history_id_set.clear();
    std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
              std::inserter(history_id_set, history_id_set.end()));
    post_filter_item_count_ = history_id_set.size();
  }

  // Pass over all of the candidates filtering out any without a proper
  // substring match, inserting those which pass in order by score. Note that
  // in this step we are using the raw search string complete with escaped
  // URL elements. When the user has specifically typed something akin to
  // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
  // specific substring appears in the URL or page title.

  // We call these 'terms' (as opposed to 'words'; see above) as in this case
  // we only want to break up the search string on 'true' whitespace rather than
  // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
  // get two 'terms': "colspec=id%20mstone" and "release".
  history::String16Vector lower_raw_terms;
  if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
               &lower_raw_terms) == 0) {
    // Don't score matches when there are no terms to score against. (It's
    // possible that the word break iterator that extracts words to search
    // for in the database allows some whitespace "words" whereas Tokenize
    // excludes a long list of whitespace.) One could write a scoring
    // function that gives a reasonable order to matches when there
    // are no terms (i.e., all the words are some form of whitespace),
    // but this is such a rare edge case that it's not worth the time.
    return scored_items;
  }
  scored_items =
      std::for_each(
          history_id_set.begin(), history_id_set.end(),
          AddHistoryMatch(*this, languages, lower_raw_string, lower_raw_terms,
                          base::Time::Now(), builder)).ScoredMatches();

  // Select and sort only the top |max_matches| results.
  if (scored_items.size() > max_matches) {
    std::partial_sort(scored_items.begin(),
                      scored_items.begin() +
                          max_matches,
                      scored_items.end(),
                      ScoredHistoryMatch::MatchScoreGreater);
    scored_items.resize(max_matches);
  } else {
    std::sort(scored_items.begin(), scored_items.end(),
              ScoredHistoryMatch::MatchScoreGreater);
  }
  post_scoring_item_count_ = scored_items.size();

  if (was_trimmed) {
    search_term_cache_.clear();  // Invalidate the term cache.
  } else {
    // Remove any stale SearchTermCacheItems (the sweep phase: entries not
    // marked used_ during this query are dropped).
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ) {
      if (!cache_iter->second.used_)
        search_term_cache_.erase(cache_iter++);  // Post-increment keeps the
                                                 // iterator valid past erase.
      else
        ++cache_iter;
    }
  }

  return scored_items;
}
| 279 | |
// Adds, refreshes, or removes |row| in the index depending on whether it is
// already indexed and whether it (still) qualifies as significant.  Returns
// true if the index was modified; any modification invalidates the search
// term cache.
bool URLIndexPrivateData::UpdateURL(
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  // The row may or may not already be in our index. If it is not already
  // indexed and it qualifies then it gets indexed. If it is already
  // indexed and still qualifies then it gets updated, otherwise it
  // is deleted from the index.
  bool row_was_updated = false;
  URLID row_id = row.id();
  HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
  if (row_pos == history_info_map_.end()) {
    // This new row should be indexed if it qualifies.
    URLRow new_row(row);
    new_row.set_id(row_id);
    // Short-circuit: IndexRow() runs only for qualifying rows.
    row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
                      IndexRow(NULL,
                               history_service,
                               new_row,
                               languages,
                               scheme_whitelist,
                               tracker);
  } else if (RowQualifiesAsSignificant(row, base::Time())) {
    // This indexed row still qualifies and will be re-indexed.
    // The url won't have changed but the title, visit count, etc.
    // might have changed.
    URLRow& row_to_update = row_pos->second.url_row;
    bool title_updated = row_to_update.title() != row.title();
    if (row_to_update.visit_count() != row.visit_count() ||
        row_to_update.typed_count() != row.typed_count() ||
        row_to_update.last_visit() != row.last_visit() || title_updated) {
      row_to_update.set_visit_count(row.visit_count());
      row_to_update.set_typed_count(row.typed_count());
      row_to_update.set_last_visit(row.last_visit());
      // If something appears to have changed, update the recent visits
      // information (asynchronously, via the history DB thread).
      ScheduleUpdateRecentVisits(history_service, row_id, tracker);
      // While the URL is guaranteed to remain stable, the title may have
      // changed. If so, then update the index with the changed words.
      if (title_updated) {
        // Clear all words associated with this row and re-index both the
        // URL and title.
        RemoveRowWordsFromIndex(row_to_update);
        row_to_update.set_title(row.title());
        RowWordStarts word_starts;
        AddRowWordsToIndex(row_to_update, &word_starts, languages);
        word_starts_map_[row_id] = word_starts;
      }
      row_was_updated = true;
    }
  } else {
    // This indexed row no longer qualifies and will be de-indexed by
    // clearing all words associated with this row.
    RemoveRowFromIndex(row);
    row_was_updated = true;
  }
  if (row_was_updated)
    search_term_cache_.clear();  // This invalidates the cache.
  return row_was_updated;
}
| 342 | |
| 343 void URLIndexPrivateData::UpdateRecentVisits( | |
| 344 URLID url_id, | |
| 345 const VisitVector& recent_visits) { | |
| 346 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id); | |
| 347 if (row_pos != history_info_map_.end()) { | |
| 348 VisitInfoVector* visits = &row_pos->second.visits; | |
| 349 visits->clear(); | |
| 350 const size_t size = | |
| 351 std::min(recent_visits.size(), kMaxVisitsToStoreInCache); | |
| 352 visits->reserve(size); | |
| 353 for (size_t i = 0; i < size; i++) { | |
| 354 // Copy from the VisitVector the only fields visits needs. | |
| 355 visits->push_back(std::make_pair(recent_visits[i].visit_time, | |
| 356 recent_visits[i].transition)); | |
| 357 } | |
| 358 } | |
| 359 // Else: Oddly, the URL doesn't seem to exist in the private index. | |
| 360 // Ignore this update. This can happen if, for instance, the user | |
| 361 // removes the URL from URLIndexPrivateData before the historyDB call | |
| 362 // returns. | |
| 363 } | |
| 364 | |
// Kicks off an async fetch of |url_id|'s recent visits on the history DB
// thread; the result is delivered back via UpdateRecentVisits().
void URLIndexPrivateData::ScheduleUpdateRecentVisits(
    HistoryService* history_service,
    URLID url_id,
    base::CancelableTaskTracker* tracker) {
  // The task holds a raw pointer to |this|; |tracker| is the caller-supplied
  // CancelableTaskTracker used to cancel the outstanding request.
  history_service->ScheduleDBTask(
      scoped_ptr<history::HistoryDBTask>(
          new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
}
| 373 | |
| 374 // Helper functor for DeleteURL. | |
| 375 class HistoryInfoMapItemHasURL { | |
| 376 public: | |
| 377 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} | |
| 378 | |
| 379 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) { | |
| 380 return item.second.url_row.url() == url_; | |
| 381 } | |
| 382 | |
| 383 private: | |
| 384 const GURL& url_; | |
| 385 }; | |
| 386 | |
| 387 bool URLIndexPrivateData::DeleteURL(const GURL& url) { | |
| 388 // Find the matching entry in the history_info_map_. | |
| 389 HistoryInfoMap::iterator pos = std::find_if( | |
| 390 history_info_map_.begin(), | |
| 391 history_info_map_.end(), | |
| 392 HistoryInfoMapItemHasURL(url)); | |
| 393 if (pos == history_info_map_.end()) | |
| 394 return false; | |
| 395 RemoveRowFromIndex(pos->second.url_row); | |
| 396 search_term_cache_.clear(); // This invalidates the cache. | |
| 397 return true; | |
| 398 } | |
| 399 | |
// static
// Reloads a previously saved index from |file_path|.  Returns NULL when the
// file is missing, unreadable, fails RestorePrivateData(), or restores to an
// empty index.  NOTE(review): the protobuf parse-failure path returns a
// fresh *non-NULL but empty* object rather than NULL, unlike the other
// failure paths -- confirm callers treat both the same.
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
    const base::FilePath& file_path,
    const std::string& languages) {
  base::TimeTicks beginning_time = base::TimeTicks::Now();
  // If there is no cache file then simply give up. This will cause us to
  // attempt to rebuild from the history database.
  if (!base::PathExists(file_path))
    return NULL;
  std::string data;
  if (!base::ReadFileToString(file_path, &data))
    return NULL;

  scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
  InMemoryURLIndexCacheItem index_cache;
  if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
    LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
                 << file_path.value();
    return restored_data;
  }

  if (!restored_data->RestorePrivateData(index_cache, languages))
    return NULL;

  // Record restore timing and resulting index sizes for UMA.
  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       restored_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             restored_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             restored_data->char_word_map_.size());
  if (restored_data->Empty())
    return NULL;  // 'No data' is the same as a failed reload.
  return restored_data;
}
| 437 | |
// static
// Builds a fresh index by enumerating every "significant" row in
// |history_db| and indexing each one.  Returns NULL when |history_db| is
// NULL or the enumerator cannot be initialized.
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
    HistoryDatabase* history_db,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist) {
  if (!history_db)
    return NULL;

  base::TimeTicks beginning_time = base::TimeTicks::Now();

  scoped_refptr<URLIndexPrivateData>
      rebuilt_data(new URLIndexPrivateData);
  URLDatabase::URLEnumerator history_enum;
  if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
    return NULL;
  rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
  // IndexRow is called with a live |history_db| (and no history_service /
  // tracker), so recent visits are fetched synchronously.
  for (URLRow row; history_enum.GetNextURL(&row); ) {
    rebuilt_data->IndexRow(
        history_db, NULL, row, languages, scheme_whitelist, NULL);
  }

  // Record rebuild timing and resulting index sizes for UMA.
  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       rebuilt_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             rebuilt_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             rebuilt_data->char_word_map_.size());
  return rebuilt_data;
}
| 469 | |
| 470 // static | |
| 471 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask( | |
| 472 scoped_refptr<URLIndexPrivateData> private_data, | |
| 473 const base::FilePath& file_path) { | |
| 474 DCHECK(private_data.get()); | |
| 475 DCHECK(!file_path.empty()); | |
| 476 return private_data->SaveToFile(file_path); | |
| 477 } | |
| 478 | |
// Returns a deep copy of all searchable index structures.  The search term
// cache and the diagnostic counters are intentionally left at their default
// (empty/zero) values in the copy; see the trailing list.
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
  scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
  data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
  data_copy->word_list_ = word_list_;
  data_copy->available_words_ = available_words_;
  data_copy->word_map_ = word_map_;
  data_copy->char_word_map_ = char_word_map_;
  data_copy->word_id_history_map_ = word_id_history_map_;
  data_copy->history_id_word_map_ = history_id_word_map_;
  data_copy->history_info_map_ = history_info_map_;
  data_copy->word_starts_map_ = word_starts_map_;
  return data_copy;
  // Not copied:
  //  search_term_cache_
  //  pre_filter_item_count_
  //  post_filter_item_count_
  //  post_scoring_item_count_
}
| 497 | |
// Returns true when nothing is indexed; history_info_map_ holds one entry
// per indexed row, so its emptiness is the definitive test.
bool URLIndexPrivateData::Empty() const {
  return history_info_map_.empty();
}
| 501 | |
// Drops every index structure, returning the object to an empty state.
// Note: does not touch search_term_cache_ or the pre/post filter counters.
void URLIndexPrivateData::Clear() {
  last_time_rebuilt_from_history_ = base::Time();
  word_list_.clear();
  available_words_.clear();
  word_map_.clear();
  char_word_map_.clear();
  word_id_history_map_.clear();
  history_id_word_map_.clear();
  history_info_map_.clear();
  word_starts_map_.clear();
}
| 513 | |
| 514 URLIndexPrivateData::~URLIndexPrivateData() {} | |
| 515 | |
| 516 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( | |
| 517 const String16Vector& unsorted_words) { | |
| 518 // Break the terms down into individual terms (words), get the candidate | |
| 519 // set for each term, and intersect each to get a final candidate list. | |
| 520 // Note that a single 'term' from the user's perspective might be | |
| 521 // a string like "http://www.somewebsite.com" which, from our perspective, | |
| 522 // is four words: 'http', 'www', 'somewebsite', and 'com'. | |
| 523 HistoryIDSet history_id_set; | |
| 524 String16Vector words(unsorted_words); | |
| 525 // Sort the words into the longest first as such are likely to narrow down | |
| 526 // the results quicker. Also, single character words are the most expensive | |
| 527 // to process so save them for last. | |
| 528 std::sort(words.begin(), words.end(), LengthGreater); | |
| 529 for (String16Vector::iterator iter = words.begin(); iter != words.end(); | |
| 530 ++iter) { | |
| 531 base::string16 uni_word = *iter; | |
| 532 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); | |
| 533 if (term_history_set.empty()) { | |
| 534 history_id_set.clear(); | |
| 535 break; | |
| 536 } | |
| 537 if (iter == words.begin()) { | |
| 538 history_id_set.swap(term_history_set); | |
| 539 } else { | |
| 540 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>( | |
| 541 history_id_set, term_history_set); | |
| 542 history_id_set.swap(new_history_id_set); | |
| 543 } | |
| 544 } | |
| 545 return history_id_set; | |
| 546 } | |
| 547 | |
// Returns the set of history IDs whose indexed words contain |term| as a
// substring.  Multi-character terms consult and update search_term_cache_:
// the longest cached prefix of |term| seeds the word set, which is then
// refined by the term's remaining characters and a final substring filter.
HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    const base::string16& term) {
  if (term.empty())
    return HistoryIDSet();

  // TODO(mrossetti): Consider optimizing for very common terms such as
  // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
  // occurring words in the user's searches.

  size_t term_length = term.length();
  WordIDSet word_id_set;
  if (term_length > 1) {
    // See if this term or a prefix thereof is present in the cache; keep the
    // longest such prefix, since it narrows the candidates the most.
    SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ++cache_iter) {
      if (StartsWith(term, cache_iter->first, false) &&
          (best_prefix == search_term_cache_.end() ||
           cache_iter->first.length() > best_prefix->first.length()))
        best_prefix = cache_iter;
    }

    // If a prefix was found then determine the leftover characters to be used
    // for further refining the results from that prefix.
    Char16Set prefix_chars;
    base::string16 leftovers(term);
    if (best_prefix != search_term_cache_.end()) {
      // If the prefix is an exact match for the term then grab the cached
      // results and we're done.  (used_ marks the entry as live for the
      // mark-and-sweep in HistoryItemsForTerms.)
      size_t prefix_length = best_prefix->first.length();
      if (prefix_length == term_length) {
        best_prefix->second.used_ = true;
        return best_prefix->second.history_id_set_;
      }

      // Otherwise we have a handy starting point.
      // If there are no history results for this prefix then we can bail early
      // as there will be no history results for the full term.
      if (best_prefix->second.history_id_set_.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      word_id_set = best_prefix->second.word_id_set_;
      prefix_chars = Char16SetFromString16(best_prefix->first);
      leftovers = term.substr(prefix_length);
    }

    // Filter for each remaining, unique character in the term.
    Char16Set leftover_chars = Char16SetFromString16(leftovers);
    Char16Set unique_chars =
        base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);

    // Reduce the word set with any leftover, unprocessed characters.
    if (!unique_chars.empty()) {
      WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
      // We might come up empty on the leftovers.
      if (leftover_set.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      // Or there may not have been a prefix from which to start.
      if (prefix_chars.empty()) {
        word_id_set.swap(leftover_set);
      } else {
        WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
            word_id_set, leftover_set);
        word_id_set.swap(new_word_id_set);
      }
    }

    // We must filter the word list because the resulting word set surely
    // contains words which do not have the search term as a proper subset.
    // (Post-increment keeps the iterator valid across erase.)
    for (WordIDSet::iterator word_set_iter = word_id_set.begin();
         word_set_iter != word_id_set.end(); ) {
      if (word_list_[*word_set_iter].find(term) == base::string16::npos)
        word_id_set.erase(word_set_iter++);
      else
        ++word_set_iter;
    }
  } else {
    // Single-character term: char lookup alone is exact, no filtering needed.
    word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
  }

  // If any words resulted then we can compose a set of history IDs by unioning
  // the sets from each word.
  HistoryIDSet history_id_set;
  if (!word_id_set.empty()) {
    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
         word_id_iter != word_id_set.end(); ++word_id_iter) {
      WordID word_id = *word_id_iter;
      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
      if (word_iter != word_id_history_map_.end()) {
        HistoryIDSet& word_history_id_set(word_iter->second);
        history_id_set.insert(word_history_id_set.begin(),
                              word_history_id_set.end());
      }
    }
  }

  // Record a new cache entry for this word if the term is longer than
  // a single character.
  if (term_length > 1)
    search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

  return history_id_set;
}
| 654 | |
| 655 WordIDSet URLIndexPrivateData::WordIDSetForTermChars( | |
| 656 const Char16Set& term_chars) { | |
| 657 WordIDSet word_id_set; | |
| 658 for (Char16Set::const_iterator c_iter = term_chars.begin(); | |
| 659 c_iter != term_chars.end(); ++c_iter) { | |
| 660 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); | |
| 661 if (char_iter == char_word_map_.end()) { | |
| 662 // A character was not found so there are no matching results: bail. | |
| 663 word_id_set.clear(); | |
| 664 break; | |
| 665 } | |
| 666 WordIDSet& char_word_id_set(char_iter->second); | |
| 667 // It is possible for there to no longer be any words associated with | |
| 668 // a particular character. Give up in that case. | |
| 669 if (char_word_id_set.empty()) { | |
| 670 word_id_set.clear(); | |
| 671 break; | |
| 672 } | |
| 673 | |
| 674 if (c_iter == term_chars.begin()) { | |
| 675 // First character results becomes base set of results. | |
| 676 word_id_set = char_word_id_set; | |
| 677 } else { | |
| 678 // Subsequent character results get intersected in. | |
| 679 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( | |
| 680 word_id_set, char_word_id_set); | |
| 681 word_id_set.swap(new_word_id_set); | |
| 682 } | |
| 683 } | |
| 684 return word_id_set; | |
| 685 } | |
| 686 | |
// Adds |row| to the index: records it in history_info_map_, indexes its URL
// and title words, and fills in recent-visit data -- synchronously when
// |history_db| is non-NULL, otherwise asynchronously via |history_service|
// and |tracker|.  Returns false (indexing nothing) for non-whitelisted
// schemes.  Exactly one of |history_db| / |history_service| is expected to
// be provided (DCHECKed on the async path).
bool URLIndexPrivateData::IndexRow(
    HistoryDatabase* history_db,
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  const GURL& gurl(row.url());

  // Index only URLs with a whitelisted scheme.
  if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
    return false;

  URLID row_id = row.id();
  // Strip out username and password before saving and indexing.
  base::string16 url(net::FormatUrl(gurl, languages,
                                    net::kFormatUrlOmitUsernamePassword,
                                    net::UnescapeRule::NONE,
                                    NULL, NULL, NULL));

  HistoryID history_id = static_cast<HistoryID>(row_id);
  DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

  // Add the row for quick lookup in the history info store.  The row is
  // rebuilt around the credential-stripped URL computed above.
  URLRow new_row(GURL(url), row_id);
  new_row.set_visit_count(row.visit_count());
  new_row.set_typed_count(row.typed_count());
  new_row.set_last_visit(row.last_visit());
  new_row.set_title(row.title());
  history_info_map_[history_id].url_row = new_row;

  // Index the words contained in the URL and title of the row.
  RowWordStarts word_starts;
  AddRowWordsToIndex(new_row, &word_starts, languages);
  word_starts_map_[history_id] = word_starts;

  // Update the recent visits information or schedule the update
  // as appropriate.
  if (history_db) {
    // We'd like to check that we're on the history DB thread.
    // However, unittest code actually calls this on the UI thread.
    // So we don't do any thread checks.
    VisitVector recent_visits;
    // Make sure the private data is going to get as many recent visits as
    // ScoredHistoryMatch::GetFrequency() hopes to use.
    DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    if (history_db->GetMostRecentVisitsForURL(row_id,
                                              kMaxVisitsToStoreInCache,
                                              &recent_visits))
      UpdateRecentVisits(row_id, recent_visits);
  } else {
    DCHECK(tracker);
    DCHECK(history_service);
    ScheduleUpdateRecentVisits(history_service, row_id, tracker);
  }

  return true;
}
| 745 | |
void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
                                             RowWordStarts* word_starts,
                                             const std::string& languages) {
  // Indexes every unique word found in |row|'s cleaned-up URL and title,
  // optionally recording the word-start offsets into |word_starts| (which
  // may be NULL if the caller does not need them).
  HistoryID history_id = static_cast<HistoryID>(row.id());
  // Split URL into individual, unique words then add in the title words.
  const GURL& gurl(row.url());
  const base::string16& url =
      bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
  String16Set url_words = String16SetFromString16(url,
      word_starts ? &word_starts->url_word_starts_ : NULL);
  const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
  String16Set title_words = String16SetFromString16(title,
      word_starts ? &word_starts->title_word_starts_ : NULL);
  // Index the union so each word is added exactly once even if it appears
  // in both the URL and the title.
  String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
  for (String16Set::iterator word_iter = words.begin();
       word_iter != words.end(); ++word_iter)
    AddWordToIndex(*word_iter, history_id);

  search_term_cache_.clear();  // Invalidate the term cache.
}
| 766 | |
| 767 void URLIndexPrivateData::AddWordToIndex(const base::string16& term, | |
| 768 HistoryID history_id) { | |
| 769 WordMap::iterator word_pos = word_map_.find(term); | |
| 770 if (word_pos != word_map_.end()) | |
| 771 UpdateWordHistory(word_pos->second, history_id); | |
| 772 else | |
| 773 AddWordHistory(term, history_id); | |
| 774 } | |
| 775 | |
| 776 void URLIndexPrivateData::AddWordHistory(const base::string16& term, | |
| 777 HistoryID history_id) { | |
| 778 WordID word_id = word_list_.size(); | |
| 779 if (available_words_.empty()) { | |
| 780 word_list_.push_back(term); | |
| 781 } else { | |
| 782 word_id = *(available_words_.begin()); | |
| 783 word_list_[word_id] = term; | |
| 784 available_words_.erase(word_id); | |
| 785 } | |
| 786 word_map_[term] = word_id; | |
| 787 | |
| 788 HistoryIDSet history_id_set; | |
| 789 history_id_set.insert(history_id); | |
| 790 word_id_history_map_[word_id] = history_id_set; | |
| 791 AddToHistoryIDWordMap(history_id, word_id); | |
| 792 | |
| 793 // For each character in the newly added word (i.e. a word that is not | |
| 794 // already in the word index), add the word to the character index. | |
| 795 Char16Set characters = Char16SetFromString16(term); | |
| 796 for (Char16Set::iterator uni_char_iter = characters.begin(); | |
| 797 uni_char_iter != characters.end(); ++uni_char_iter) { | |
| 798 base::char16 uni_char = *uni_char_iter; | |
| 799 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); | |
| 800 if (char_iter != char_word_map_.end()) { | |
| 801 // Update existing entry in the char/word index. | |
| 802 WordIDSet& word_id_set(char_iter->second); | |
| 803 word_id_set.insert(word_id); | |
| 804 } else { | |
| 805 // Create a new entry in the char/word index. | |
| 806 WordIDSet word_id_set; | |
| 807 word_id_set.insert(word_id); | |
| 808 char_word_map_[uni_char] = word_id_set; | |
| 809 } | |
| 810 } | |
| 811 } | |
| 812 | |
| 813 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, | |
| 814 HistoryID history_id) { | |
| 815 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); | |
| 816 DCHECK(history_pos != word_id_history_map_.end()); | |
| 817 HistoryIDSet& history_id_set(history_pos->second); | |
| 818 history_id_set.insert(history_id); | |
| 819 AddToHistoryIDWordMap(history_id, word_id); | |
| 820 } | |
| 821 | |
| 822 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, | |
| 823 WordID word_id) { | |
| 824 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); | |
| 825 if (iter != history_id_word_map_.end()) { | |
| 826 WordIDSet& word_id_set(iter->second); | |
| 827 word_id_set.insert(word_id); | |
| 828 } else { | |
| 829 WordIDSet word_id_set; | |
| 830 word_id_set.insert(word_id); | |
| 831 history_id_word_map_[history_id] = word_id_set; | |
| 832 } | |
| 833 } | |
| 834 | |
| 835 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { | |
| 836 RemoveRowWordsFromIndex(row); | |
| 837 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
| 838 history_info_map_.erase(history_id); | |
| 839 word_starts_map_.erase(history_id); | |
| 840 } | |
| 841 | |
| 842 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { | |
| 843 // Remove the entries in history_id_word_map_ and word_id_history_map_ for | |
| 844 // this row. | |
| 845 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
| 846 WordIDSet word_id_set = history_id_word_map_[history_id]; | |
| 847 history_id_word_map_.erase(history_id); | |
| 848 | |
| 849 // Reconcile any changes to word usage. | |
| 850 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); | |
| 851 word_id_iter != word_id_set.end(); ++word_id_iter) { | |
| 852 WordID word_id = *word_id_iter; | |
| 853 word_id_history_map_[word_id].erase(history_id); | |
| 854 if (!word_id_history_map_[word_id].empty()) | |
| 855 continue; // The word is still in use. | |
| 856 | |
| 857 // The word is no longer in use. Reconcile any changes to character usage. | |
| 858 base::string16 word = word_list_[word_id]; | |
| 859 Char16Set characters = Char16SetFromString16(word); | |
| 860 for (Char16Set::iterator uni_char_iter = characters.begin(); | |
| 861 uni_char_iter != characters.end(); ++uni_char_iter) { | |
| 862 base::char16 uni_char = *uni_char_iter; | |
| 863 char_word_map_[uni_char].erase(word_id); | |
| 864 if (char_word_map_[uni_char].empty()) | |
| 865 char_word_map_.erase(uni_char); // No longer in use. | |
| 866 } | |
| 867 | |
| 868 // Complete the removal of references to the word. | |
| 869 word_id_history_map_.erase(word_id); | |
| 870 word_map_.erase(word); | |
| 871 word_list_[word_id] = base::string16(); | |
| 872 available_words_.insert(word_id); | |
| 873 } | |
| 874 } | |
| 875 | |
| 876 void URLIndexPrivateData::ResetSearchTermCache() { | |
| 877 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); | |
| 878 iter != search_term_cache_.end(); ++iter) | |
| 879 iter->second.used_ = false; | |
| 880 } | |
| 881 | |
| 882 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) { | |
| 883 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
| 884 InMemoryURLIndexCacheItem index_cache; | |
| 885 SavePrivateData(&index_cache); | |
| 886 std::string data; | |
| 887 if (!index_cache.SerializeToString(&data)) { | |
| 888 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; | |
| 889 return false; | |
| 890 } | |
| 891 | |
| 892 int size = data.size(); | |
| 893 if (base::WriteFile(file_path, data.c_str(), size) != size) { | |
| 894 LOG(WARNING) << "Failed to write " << file_path.value(); | |
| 895 return false; | |
| 896 } | |
| 897 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", | |
| 898 base::TimeTicks::Now() - beginning_time); | |
| 899 return true; | |
| 900 } | |
| 901 | |
void URLIndexPrivateData::SavePrivateData(
    InMemoryURLIndexCacheItem* cache) const {
  // Serializes every persistent piece of the index into |cache|: the
  // rebuild timestamp, cache version, and each of the index maps.
  DCHECK(cache);
  cache->set_last_rebuild_timestamp(
      last_time_rebuilt_from_history_.ToInternalValue());
  cache->set_version(saved_cache_version_);
  // history_item_count_ is no longer used but rather than change the protobuf
  // definition use a placeholder. This will go away with the switch to SQLite.
  cache->set_history_item_count(0);
  SaveWordList(cache);
  SaveWordMap(cache);
  SaveCharWordMap(cache);
  SaveWordIDHistoryMap(cache);
  SaveHistoryInfoMap(cache);
  SaveWordStartsMap(cache);
}
| 918 | |
| 919 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { | |
| 920 if (word_list_.empty()) | |
| 921 return; | |
| 922 WordListItem* list_item = cache->mutable_word_list(); | |
| 923 list_item->set_word_count(word_list_.size()); | |
| 924 for (String16Vector::const_iterator iter = word_list_.begin(); | |
| 925 iter != word_list_.end(); ++iter) | |
| 926 list_item->add_word(base::UTF16ToUTF8(*iter)); | |
| 927 } | |
| 928 | |
| 929 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { | |
| 930 if (word_map_.empty()) | |
| 931 return; | |
| 932 WordMapItem* map_item = cache->mutable_word_map(); | |
| 933 map_item->set_item_count(word_map_.size()); | |
| 934 for (WordMap::const_iterator iter = word_map_.begin(); | |
| 935 iter != word_map_.end(); ++iter) { | |
| 936 WordMapEntry* map_entry = map_item->add_word_map_entry(); | |
| 937 map_entry->set_word(base::UTF16ToUTF8(iter->first)); | |
| 938 map_entry->set_word_id(iter->second); | |
| 939 } | |
| 940 } | |
| 941 | |
| 942 void URLIndexPrivateData::SaveCharWordMap( | |
| 943 InMemoryURLIndexCacheItem* cache) const { | |
| 944 if (char_word_map_.empty()) | |
| 945 return; | |
| 946 CharWordMapItem* map_item = cache->mutable_char_word_map(); | |
| 947 map_item->set_item_count(char_word_map_.size()); | |
| 948 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); | |
| 949 iter != char_word_map_.end(); ++iter) { | |
| 950 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); | |
| 951 map_entry->set_char_16(iter->first); | |
| 952 const WordIDSet& word_id_set(iter->second); | |
| 953 map_entry->set_item_count(word_id_set.size()); | |
| 954 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); | |
| 955 set_iter != word_id_set.end(); ++set_iter) | |
| 956 map_entry->add_word_id(*set_iter); | |
| 957 } | |
| 958 } | |
| 959 | |
| 960 void URLIndexPrivateData::SaveWordIDHistoryMap( | |
| 961 InMemoryURLIndexCacheItem* cache) const { | |
| 962 if (word_id_history_map_.empty()) | |
| 963 return; | |
| 964 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); | |
| 965 map_item->set_item_count(word_id_history_map_.size()); | |
| 966 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); | |
| 967 iter != word_id_history_map_.end(); ++iter) { | |
| 968 WordIDHistoryMapEntry* map_entry = | |
| 969 map_item->add_word_id_history_map_entry(); | |
| 970 map_entry->set_word_id(iter->first); | |
| 971 const HistoryIDSet& history_id_set(iter->second); | |
| 972 map_entry->set_item_count(history_id_set.size()); | |
| 973 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); | |
| 974 set_iter != history_id_set.end(); ++set_iter) | |
| 975 map_entry->add_history_id(*set_iter); | |
| 976 } | |
| 977 } | |
| 978 | |
void URLIndexPrivateData::SaveHistoryInfoMap(
    InMemoryURLIndexCacheItem* cache) const {
  // Persists each history item's URL row data (visit/typed counts, last
  // visit time, URL, title) along with its cached list of recent visits.
  if (history_info_map_.empty())
    return;
  HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
  map_item->set_item_count(history_info_map_.size());
  for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
       iter != history_info_map_.end(); ++iter) {
    HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
    map_entry->set_history_id(iter->first);
    const URLRow& url_row(iter->second.url_row);
    // Note: We only save information that contributes to the index so there
    // is no need to save search_term_cache_ (not persistent).
    map_entry->set_visit_count(url_row.visit_count());
    map_entry->set_typed_count(url_row.typed_count());
    map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
    map_entry->set_url(url_row.url().spec());
    map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
    const VisitInfoVector& visits(iter->second.visits);
    for (VisitInfoVector::const_iterator visit_iter = visits.begin();
         visit_iter != visits.end(); ++visit_iter) {
      HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
      visit_info->set_visit_time(visit_iter->first.ToInternalValue());
      visit_info->set_transition_type(visit_iter->second);
    }
  }
}
| 1006 | |
void URLIndexPrivateData::SaveWordStartsMap(
    InMemoryURLIndexCacheItem* cache) const {
  // Persists the word-start offsets for each history item's URL and title.
  if (word_starts_map_.empty())
    return;
  // For unit testing: Enable saving of the cache as an earlier version to
  // allow testing of cache file upgrading in ReadFromFile().
  // TODO(mrossetti): Instead of intruding on production code with this kind of
  // test harness, save a copy of an older version cache with known results.
  // Implement this when switching the caching over to SQLite.
  if (saved_cache_version_ < 1)
    return;

  WordStartsMapItem* map_item = cache->mutable_word_starts_map();
  map_item->set_item_count(word_starts_map_.size());
  for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
       iter != word_starts_map_.end(); ++iter) {
    WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
    map_entry->set_history_id(iter->first);
    const RowWordStarts& word_starts(iter->second);
    for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
         i != word_starts.url_word_starts_.end(); ++i)
      map_entry->add_url_word_starts(*i);
    for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
         i != word_starts.title_word_starts_.end(); ++i)
      map_entry->add_title_word_starts(*i);
  }
}
| 1034 | |
bool URLIndexPrivateData::RestorePrivateData(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  // Restores all index data from |cache|. Returning false causes the
  // caller to rebuild the index from the history database instead; this
  // happens when the cache is stale, from an older file format version, or
  // structurally inconsistent.
  last_time_rebuilt_from_history_ =
      base::Time::FromInternalValue(cache.last_rebuild_timestamp());
  const base::TimeDelta rebuilt_ago =
      base::Time::Now() - last_time_rebuilt_from_history_;
  if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
      (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
    // Cache is more than a week old or, somehow, from some time in the future.
    // It's probably a good time to rebuild the index from history to
    // allow synced entries to now appear, expired entries to disappear, etc.
    // Allow one day in the future to make the cache not rebuild on simple
    // system clock changes such as time zone changes.
    return false;
  }
  if (cache.has_version()) {
    if (cache.version() < kCurrentCacheFileVersion) {
      // Don't try to restore an old format cache file. (This will cause
      // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
      // from history.)
      return false;
    }
    restored_cache_version_ = cache.version();
  }
  // All sub-restores must succeed; any failure aborts the whole restore.
  return RestoreWordList(cache) && RestoreWordMap(cache) &&
      RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
      RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
}
| 1064 | |
| 1065 bool URLIndexPrivateData::RestoreWordList( | |
| 1066 const InMemoryURLIndexCacheItem& cache) { | |
| 1067 if (!cache.has_word_list()) | |
| 1068 return false; | |
| 1069 const WordListItem& list_item(cache.word_list()); | |
| 1070 uint32 expected_item_count = list_item.word_count(); | |
| 1071 uint32 actual_item_count = list_item.word_size(); | |
| 1072 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1073 return false; | |
| 1074 const RepeatedPtrField<std::string>& words(list_item.word()); | |
| 1075 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); | |
| 1076 iter != words.end(); ++iter) | |
| 1077 word_list_.push_back(base::UTF8ToUTF16(*iter)); | |
| 1078 return true; | |
| 1079 } | |
| 1080 | |
| 1081 bool URLIndexPrivateData::RestoreWordMap( | |
| 1082 const InMemoryURLIndexCacheItem& cache) { | |
| 1083 if (!cache.has_word_map()) | |
| 1084 return false; | |
| 1085 const WordMapItem& list_item(cache.word_map()); | |
| 1086 uint32 expected_item_count = list_item.item_count(); | |
| 1087 uint32 actual_item_count = list_item.word_map_entry_size(); | |
| 1088 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1089 return false; | |
| 1090 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); | |
| 1091 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); | |
| 1092 iter != entries.end(); ++iter) | |
| 1093 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id(); | |
| 1094 return true; | |
| 1095 } | |
| 1096 | |
bool URLIndexPrivateData::RestoreCharWordMap(
    const InMemoryURLIndexCacheItem& cache) {
  // Rebuilds char_word_map_ from the cache. Both the top-level entry count
  // and each entry's per-character word ID count are validated against the
  // data actually present; any mismatch aborts the restore.
  if (!cache.has_char_word_map())
    return false;
  const CharWordMapItem& list_item(cache.char_word_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.char_word_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<CharWordMapEntry>&
      entries(list_item.char_word_map_entry());
  for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    // Validate this entry's own recorded word ID count.
    expected_item_count = iter->item_count();
    actual_item_count = iter->word_id_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    base::char16 uni_char = static_cast<base::char16>(iter->char_16());
    WordIDSet word_id_set;
    const RepeatedField<int32>& word_ids(iter->word_id());
    for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
         jiter != word_ids.end(); ++jiter)
      word_id_set.insert(*jiter);
    char_word_map_[uni_char] = word_id_set;
  }
  return true;
}
| 1124 | |
bool URLIndexPrivateData::RestoreWordIDHistoryMap(
    const InMemoryURLIndexCacheItem& cache) {
  // Rebuilds word_id_history_map_ from the cache and, as a side effect,
  // rebuilds the reverse history_id_word_map_ via AddToHistoryIDWordMap().
  // Entry counts are validated at both levels; any mismatch aborts.
  if (!cache.has_word_id_history_map())
    return false;
  const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.word_id_history_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<WordIDHistoryMapEntry>&
      entries(list_item.word_id_history_map_entry());
  for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    // Validate this entry's own recorded history ID count.
    expected_item_count = iter->item_count();
    actual_item_count = iter->history_id_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    WordID word_id = iter->word_id();
    HistoryIDSet history_id_set;
    const RepeatedField<int64>& history_ids(iter->history_id());
    for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
         jiter != history_ids.end(); ++jiter) {
      history_id_set.insert(*jiter);
      AddToHistoryIDWordMap(*jiter, word_id);
    }
    word_id_history_map_[word_id] = history_id_set;
  }
  return true;
}
| 1154 | |
bool URLIndexPrivateData::RestoreHistoryInfoMap(
    const InMemoryURLIndexCacheItem& cache) {
  // Rebuilds history_info_map_ (per-item URL row data plus the cached list
  // of recent visits) from the cache, validating the recorded item count.
  if (!cache.has_history_info_map())
    return false;
  const HistoryInfoMapItem& list_item(cache.history_info_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.history_info_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<HistoryInfoMapEntry>&
      entries(list_item.history_info_map_entry());
  for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    HistoryID history_id = iter->history_id();
    GURL url(iter->url());
    URLRow url_row(url, history_id);
    url_row.set_visit_count(iter->visit_count());
    url_row.set_typed_count(iter->typed_count());
    url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
    // The title is optional in the protobuf; only set it when present.
    if (iter->has_title()) {
      base::string16 title(base::UTF8ToUTF16(iter->title()));
      url_row.set_title(title);
    }
    history_info_map_[history_id].url_row = url_row;

    // Restore visits list.
    VisitInfoVector visits;
    visits.reserve(iter->visits_size());
    for (int i = 0; i < iter->visits_size(); ++i) {
      visits.push_back(std::make_pair(
          base::Time::FromInternalValue(iter->visits(i).visit_time()),
          ui::PageTransitionFromInt(iter->visits(i).transition_type())));
    }
    history_info_map_[history_id].visits = visits;
  }
  return true;
}
| 1192 | |
bool URLIndexPrivateData::RestoreWordStartsMap(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  // Restores word_starts_map_ either directly from the cache or, when the
  // cache predates word starts, by recomputing them from the already
  // restored URLs and titles.
  // Note that this function must be called after RestoreHistoryInfoMap() has
  // been run as the word starts may have to be recalculated from the urls and
  // page titles.
  if (cache.has_word_starts_map()) {
    const WordStartsMapItem& list_item(cache.word_starts_map());
    uint32 expected_item_count = list_item.item_count();
    uint32 actual_item_count = list_item.word_starts_map_entry_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    const RepeatedPtrField<WordStartsMapEntry>&
        entries(list_item.word_starts_map_entry());
    for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
         entries.begin(); iter != entries.end(); ++iter) {
      HistoryID history_id = iter->history_id();
      RowWordStarts word_starts;
      // Restore the URL word starts.
      const RepeatedField<int32>& url_starts(iter->url_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
           jiter != url_starts.end(); ++jiter)
        word_starts.url_word_starts_.push_back(*jiter);
      // Restore the page title word starts.
      const RepeatedField<int32>& title_starts(iter->title_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
           jiter != title_starts.end(); ++jiter)
        word_starts.title_word_starts_.push_back(*jiter);
      word_starts_map_[history_id] = word_starts;
    }
  } else {
    // Since the cache did not contain any word starts we must rebuild them
    // from the URL and page titles.
    for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
         iter != history_info_map_.end(); ++iter) {
      RowWordStarts word_starts;
      const URLRow& row(iter->second.url_row);
      const base::string16& url =
          bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
      String16VectorFromString16(url, false, &word_starts.url_word_starts_);
      const base::string16& title =
          bookmarks::CleanUpTitleForMatching(row.title());
      String16VectorFromString16(title, false, &word_starts.title_word_starts_);
      word_starts_map_[iter->first] = word_starts;
    }
  }
  return true;
}
| 1241 | |
| 1242 // static | |
| 1243 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | |
| 1244 const GURL& gurl, | |
| 1245 const std::set<std::string>& whitelist) { | |
| 1246 return whitelist.find(gurl.scheme()) != whitelist.end(); | |
| 1247 } | |
| 1248 | |
| 1249 | |
| 1250 // SearchTermCacheItem --------------------------------------------------------- | |
| 1251 | |
// Caches, for one search term, the matching word IDs and history IDs.
// A new item starts out flagged as used; ResetSearchTermCache() later
// clears the flag.
URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
    const WordIDSet& word_id_set,
    const HistoryIDSet& history_id_set)
    : word_id_set_(word_id_set),
      history_id_set_(history_id_set),
      used_(true) {}

// Creates an empty cache item, also initially flagged as used.
URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
    : used_(true) {}

URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
| 1263 | |
| 1264 | |
| 1265 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- | |
| 1266 | |
URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
    const URLIndexPrivateData& private_data,
    const std::string& languages,
    const base::string16& lower_string,
    const String16Vector& lower_terms,
    const base::Time now,
    const ScoredHistoryMatch::Builder& builder)
    : private_data_(private_data),
      languages_(languages),
      builder_(builder),
      lower_string_(lower_string),
      lower_terms_(lower_terms),
      now_(now) {
  // Calculate offsets for each term. For instance, the offset for
  // ".net" should be 1, indicating that the actual word-part of the term
  // starts at offset 1.
  lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
  for (size_t i = 0; i < lower_terms_.size(); ++i) {
    base::i18n::BreakIterator iter(lower_terms_[i],
                                   base::i18n::BreakIterator::BREAK_WORD);
    // If the iterator doesn't work, assume an offset of 0.
    if (!iter.Init())
      continue;
    // Find the first word start by advancing until a word break that
    // starts an actual word is reached.
    while (iter.Advance() && !iter.IsWord()) {}
    if (iter.IsWord())
      lower_terms_to_word_starts_offsets_[i] = iter.prev();
    // Else: the iterator didn't find a word break. Assume an offset of 0.
  }
}
| 1297 | |
URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}

// Scores the history item identified by |history_id| against the search
// terms and, when it scores positively, appends the resulting match to
// |scored_matches_|. Items absent from history_info_map_ are skipped.
void URLIndexPrivateData::AddHistoryMatch::operator()(
    const HistoryID history_id) {
  HistoryInfoMap::const_iterator hist_pos =
      private_data_.history_info_map_.find(history_id);
  if (hist_pos != private_data_.history_info_map_.end()) {
    const URLRow& hist_item = hist_pos->second.url_row;
    const VisitInfoVector& visits = hist_pos->second.visits;
    WordStartsMap::const_iterator starts_pos =
        private_data_.word_starts_map_.find(history_id);
    // Every indexed history item is expected to have word starts recorded.
    DCHECK(starts_pos != private_data_.word_starts_map_.end());
    ScoredHistoryMatch match = builder_.Build(
        hist_item, visits, languages_, lower_string_, lower_terms_,
        lower_terms_to_word_starts_offsets_, starts_pos->second, now_);
    if (match.raw_score > 0)
      scored_matches_.push_back(match);
  }
}
| 1317 | |
| 1318 | |
| 1319 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- | |
| 1320 | |
URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
    const HistoryInfoMap& history_info_map)
    : history_info_map_(history_info_map) {
}

URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}

// Strict-weak-ordering comparator ranking history items from most to least
// significant: higher typed count first, then higher visit count, then
// more recent last visit. Items missing from |history_info_map_| sort
// toward the end.
bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
    const HistoryID h1,
    const HistoryID h2) {
  HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
  if (entry1 == history_info_map_.end())
    return false;
  HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
  if (entry2 == history_info_map_.end())
    return true;
  const URLRow& r1(entry1->second.url_row);
  const URLRow& r2(entry2->second.url_row);
  // First cut: typed count, visit count, recency.
  // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
  // recently visited (within the last 12/24 hours) as highly important. Get
  // input from mpearson.
  if (r1.typed_count() != r2.typed_count())
    return (r1.typed_count() > r2.typed_count());
  if (r1.visit_count() != r2.visit_count())
    return (r1.visit_count() > r2.visit_count());
  return (r1.last_visit() > r2.last_visit());
}
| 1349 | |
| 1350 } // namespace history | |
| OLD | NEW |