| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/browser/autocomplete/url_index_private_data.h" | |
| 6 | |
| 7 #include <functional> | |
| 8 #include <iterator> | |
| 9 #include <limits> | |
| 10 #include <numeric> | |
| 11 #include <string> | |
| 12 #include <vector> | |
| 13 | |
| 14 #include "base/basictypes.h" | |
| 15 #include "base/files/file_util.h" | |
| 16 #include "base/i18n/break_iterator.h" | |
| 17 #include "base/i18n/case_conversion.h" | |
| 18 #include "base/metrics/histogram.h" | |
| 19 #include "base/strings/string_util.h" | |
| 20 #include "base/strings/utf_string_conversions.h" | |
| 21 #include "base/time/time.h" | |
| 22 #include "chrome/browser/autocomplete/in_memory_url_index.h" | |
| 23 #include "components/bookmarks/browser/bookmark_model.h" | |
| 24 #include "components/bookmarks/browser/bookmark_utils.h" | |
| 25 #include "components/history/core/browser/history_database.h" | |
| 26 #include "components/history/core/browser/history_db_task.h" | |
| 27 #include "components/history/core/browser/history_service.h" | |
| 28 #include "net/base/net_util.h" | |
| 29 | |
| 30 #if defined(USE_SYSTEM_PROTOBUF) | |
| 31 #include <google/protobuf/repeated_field.h> | |
| 32 #else | |
| 33 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" | |
| 34 #endif | |
| 35 | |
| 36 using google::protobuf::RepeatedField; | |
| 37 using google::protobuf::RepeatedPtrField; | |
| 38 using in_memory_url_index::InMemoryURLIndexCacheItem; | |
| 39 | |
| 40 namespace { | |
| 41 static const size_t kMaxVisitsToStoreInCache = 10u; | |
| 42 } | |
| 43 | |
| 44 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordListItem | |
| 45 WordListItem; | |
| 46 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry | |
| 47 WordMapEntry; | |
| 48 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; | |
| 49 typedef in_memory_url_index::InMemoryURLIndexCacheItem_CharWordMapItem | |
| 50 CharWordMapItem; | |
| 51 typedef in_memory_url_index:: | |
| 52 InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry CharWordMapEntry; | |
| 53 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordIDHistoryMapItem | |
| 54 WordIDHistoryMapItem; | |
| 55 typedef in_memory_url_index:: | |
| 56 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry | |
| 57 WordIDHistoryMapEntry; | |
| 58 typedef in_memory_url_index::InMemoryURLIndexCacheItem_HistoryInfoMapItem | |
| 59 HistoryInfoMapItem; | |
| 60 typedef in_memory_url_index:: | |
| 61 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry | |
| 62 HistoryInfoMapEntry; | |
| 63 typedef in_memory_url_index:: | |
| 64 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo | |
| 65 HistoryInfoMapEntry_VisitInfo; | |
| 66 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordStartsMapItem | |
| 67 WordStartsMapItem; | |
| 68 typedef in_memory_url_index:: | |
| 69 InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry | |
| 70 WordStartsMapEntry; | |
| 71 | |
| 72 // Algorithm Functions --------------------------------------------------------- | |
| 73 | |
| 74 // Comparison function for sorting search terms by descending length. | |
| 75 bool LengthGreater(const base::string16& string_a, | |
| 76 const base::string16& string_b) { | |
| 77 return string_a.length() > string_b.length(); | |
| 78 } | |
| 79 | |
| 80 | |
| 81 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- | |
| 82 | |
| 83 // HistoryDBTask used to update the recent visit data for a particular | |
| 84 // row from the history database. | |
| 85 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask { | |
| 86 public: | |
| 87 explicit UpdateRecentVisitsFromHistoryDBTask( | |
| 88 URLIndexPrivateData* private_data, | |
| 89 history::URLID url_id); | |
| 90 | |
| 91 bool RunOnDBThread(history::HistoryBackend* backend, | |
| 92 history::HistoryDatabase* db) override; | |
| 93 void DoneRunOnMainThread() override; | |
| 94 | |
| 95 private: | |
| 96 ~UpdateRecentVisitsFromHistoryDBTask() override; | |
| 97 | |
| 98 // The URLIndexPrivateData that gets updated after the historyDB | |
| 99 // task returns. | |
| 100 URLIndexPrivateData* private_data_; | |
| 101 // The ID of the URL to get visits for and then update. | |
| 102 history::URLID url_id_; | |
| 103 // Whether fetching the recent visits for the URL succeeded. | |
| 104 bool succeeded_; | |
| 105 // The awaited data that's shown to private_data_ for it to copy and | |
| 106 // store. | |
| 107 history::VisitVector recent_visits_; | |
| 108 | |
| 109 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask); | |
| 110 }; | |
| 111 | |
| 112 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask( | |
| 113 URLIndexPrivateData* private_data, | |
| 114 history::URLID url_id) | |
| 115 : private_data_(private_data), url_id_(url_id), succeeded_(false) { | |
| 116 } | |
| 117 | |
| 118 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread( | |
| 119 history::HistoryBackend* backend, | |
| 120 history::HistoryDatabase* db) { | |
| 121 // Make sure the private data is going to get as many recent visits as | |
| 122 // ScoredHistoryMatch::GetFrequency() hopes to use. | |
| 123 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore); | |
| 124 succeeded_ = db->GetMostRecentVisitsForURL(url_id_, | |
| 125 kMaxVisitsToStoreInCache, | |
| 126 &recent_visits_); | |
| 127 if (!succeeded_) | |
| 128 recent_visits_.clear(); | |
| 129 return true; // Always claim to be done; do not retry failures. | |
| 130 } | |
| 131 | |
| 132 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() { | |
| 133 if (succeeded_) | |
| 134 private_data_->UpdateRecentVisits(url_id_, recent_visits_); | |
| 135 } | |
| 136 | |
| 137 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() { | |
| 138 } | |
| 139 | |
| 140 | |
| 141 // URLIndexPrivateData --------------------------------------------------------- | |
| 142 | |
| 143 URLIndexPrivateData::URLIndexPrivateData() | |
| 144 : restored_cache_version_(0), | |
| 145 saved_cache_version_(kCurrentCacheFileVersion), | |
| 146 pre_filter_item_count_(0), | |
| 147 post_filter_item_count_(0), | |
| 148 post_scoring_item_count_(0) { | |
| 149 } | |
| 150 | |
| 151 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | |
| 152 base::string16 search_string, | |
| 153 size_t cursor_position, | |
| 154 size_t max_matches, | |
| 155 const std::string& languages, | |
| 156 bookmarks::BookmarkModel* bookmark_model) { | |
| 157 // If cursor position is set and useful (not at either end of the | |
| 158 // string), allow the search string to be broken at cursor position. | |
| 159 // We do this by pretending there's a space where the cursor is. | |
| 160 if ((cursor_position != base::string16::npos) && | |
| 161 (cursor_position < search_string.length()) && | |
| 162 (cursor_position > 0)) { | |
| 163 search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); | |
| 164 } | |
| 165 pre_filter_item_count_ = 0; | |
| 166 post_filter_item_count_ = 0; | |
| 167 post_scoring_item_count_ = 0; | |
| 168 // The search string we receive may contain escaped characters. For reducing | |
| 169 // the index we need individual, lower-cased words, ignoring escapings. For | |
| 170 // the final filtering we need whitespace separated substrings possibly | |
| 171 // containing escaped characters. | |
| 172 base::string16 lower_raw_string(base::i18n::ToLower(search_string)); | |
| 173 base::string16 lower_unescaped_string = | |
| 174 net::UnescapeURLComponent(lower_raw_string, | |
| 175 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
| 176 // Extract individual 'words' (as opposed to 'terms'; see below) from the | |
| 177 // search string. When the user types "colspec=ID%20Mstone Release" we get | |
| 178 // four 'words': "colspec", "id", "mstone" and "release". | |
| 179 String16Vector lower_words( | |
| 180 String16VectorFromString16(lower_unescaped_string, false, NULL)); | |
| 181 ScoredHistoryMatches scored_items; | |
| 182 | |
| 183 // Do nothing if we have indexed no words (probably because we've not been | |
| 184 // initialized yet) or the search string has no words. | |
| 185 if (word_list_.empty() || lower_words.empty()) { | |
| 186 search_term_cache_.clear(); // Invalidate the term cache. | |
| 187 return scored_items; | |
| 188 } | |
| 189 | |
| 190 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | |
| 191 // approach. | |
| 192 ResetSearchTermCache(); | |
| 193 | |
| 194 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | |
| 195 | |
| 196 // Trim the candidate pool if it is large. Note that we do not filter out | |
| 197 // items that do not contain the search terms as proper substrings -- doing | |
| 198 // so is the performance-costly operation we are trying to avoid in order | |
| 199 // to maintain omnibox responsiveness. | |
| 200 const size_t kItemsToScoreLimit = 500; | |
| 201 pre_filter_item_count_ = history_id_set.size(); | |
| 202 // If we trim the results set we do not want to cache the results for next | |
| 203 // time as the user's ultimately desired result could easily be eliminated | |
| 204 // in this early rough filter. | |
| 205 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
| 206 if (was_trimmed) { | |
| 207 HistoryIDVector history_ids; | |
| 208 std::copy(history_id_set.begin(), history_id_set.end(), | |
| 209 std::back_inserter(history_ids)); | |
| 210 // Trim down the set by sorting by typed-count, visit-count, and last | |
| 211 // visit. | |
| 212 HistoryItemFactorGreater | |
| 213 item_factor_functor(history_info_map_); | |
| 214 std::partial_sort(history_ids.begin(), | |
| 215 history_ids.begin() + kItemsToScoreLimit, | |
| 216 history_ids.end(), | |
| 217 item_factor_functor); | |
| 218 history_id_set.clear(); | |
| 219 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
| 220 std::inserter(history_id_set, history_id_set.end())); | |
| 221 post_filter_item_count_ = history_id_set.size(); | |
| 222 } | |
| 223 | |
| 224 // Pass over all of the candidates filtering out any without a proper | |
| 225 // substring match, inserting those which pass in order by score. Note that | |
| 226 // in this step we are using the raw search string complete with escaped | |
| 227 // URL elements. When the user has specifically typed something akin to | |
| 228 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | |
| 229 // specific substring appears in the URL or page title. | |
| 230 | |
| 231 // We call these 'terms' (as opposed to 'words'; see above) as in this case | |
| 232 // we only want to break up the search string on 'true' whitespace rather than | |
| 233 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | |
| 234 // get two 'terms': "colspec=id%20mstone" and "release". | |
| 235 String16Vector lower_raw_terms; | |
| 236 if (Tokenize(lower_raw_string, base::kWhitespaceUTF16, | |
| 237 &lower_raw_terms) == 0) { | |
| 238 // Don't score matches when there are no terms to score against. (It's | |
| 239 // possible that the word break iterater that extracts words to search | |
| 240 // for in the database allows some whitespace "words" whereas Tokenize | |
| 241 // excludes a long list of whitespace.) One could write a scoring | |
| 242 // function that gives a reasonable order to matches when there | |
| 243 // are no terms (i.e., all the words are some form of whitespace), | |
| 244 // but this is such a rare edge case that it's not worth the time. | |
| 245 return scored_items; | |
| 246 } | |
| 247 scored_items = | |
| 248 std::for_each( | |
| 249 history_id_set.begin(), history_id_set.end(), | |
| 250 AddHistoryMatch(bookmark_model, *this, languages, lower_raw_string, | |
| 251 lower_raw_terms, base::Time::Now())).ScoredMatches(); | |
| 252 | |
| 253 // Select and sort only the top |max_matches| results. | |
| 254 if (scored_items.size() > max_matches) { | |
| 255 std::partial_sort(scored_items.begin(), | |
| 256 scored_items.begin() + | |
| 257 max_matches, | |
| 258 scored_items.end(), | |
| 259 ScoredHistoryMatch::MatchScoreGreater); | |
| 260 scored_items.resize(max_matches); | |
| 261 } else { | |
| 262 std::sort(scored_items.begin(), scored_items.end(), | |
| 263 ScoredHistoryMatch::MatchScoreGreater); | |
| 264 } | |
| 265 post_scoring_item_count_ = scored_items.size(); | |
| 266 | |
| 267 if (was_trimmed) { | |
| 268 search_term_cache_.clear(); // Invalidate the term cache. | |
| 269 } else { | |
| 270 // Remove any stale SearchTermCacheItems. | |
| 271 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
| 272 cache_iter != search_term_cache_.end(); ) { | |
| 273 if (!cache_iter->second.used_) | |
| 274 search_term_cache_.erase(cache_iter++); | |
| 275 else | |
| 276 ++cache_iter; | |
| 277 } | |
| 278 } | |
| 279 | |
| 280 return scored_items; | |
| 281 } | |
| 282 | |
| 283 bool URLIndexPrivateData::UpdateURL( | |
| 284 history::HistoryService* history_service, | |
| 285 const history::URLRow& row, | |
| 286 const std::string& languages, | |
| 287 const std::set<std::string>& scheme_whitelist, | |
| 288 base::CancelableTaskTracker* tracker) { | |
| 289 // The row may or may not already be in our index. If it is not already | |
| 290 // indexed and it qualifies then it gets indexed. If it is already | |
| 291 // indexed and still qualifies then it gets updated, otherwise it | |
| 292 // is deleted from the index. | |
| 293 bool row_was_updated = false; | |
| 294 history::URLID row_id = row.id(); | |
| 295 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); | |
| 296 if (row_pos == history_info_map_.end()) { | |
| 297 // This new row should be indexed if it qualifies. | |
| 298 history::URLRow new_row(row); | |
| 299 new_row.set_id(row_id); | |
| 300 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) && | |
| 301 IndexRow(NULL, | |
| 302 history_service, | |
| 303 new_row, | |
| 304 languages, | |
| 305 scheme_whitelist, | |
| 306 tracker); | |
| 307 } else if (RowQualifiesAsSignificant(row, base::Time())) { | |
| 308 // This indexed row still qualifies and will be re-indexed. | |
| 309 // The url won't have changed but the title, visit count, etc. | |
| 310 // might have changed. | |
| 311 history::URLRow& row_to_update = row_pos->second.url_row; | |
| 312 bool title_updated = row_to_update.title() != row.title(); | |
| 313 if (row_to_update.visit_count() != row.visit_count() || | |
| 314 row_to_update.typed_count() != row.typed_count() || | |
| 315 row_to_update.last_visit() != row.last_visit() || title_updated) { | |
| 316 row_to_update.set_visit_count(row.visit_count()); | |
| 317 row_to_update.set_typed_count(row.typed_count()); | |
| 318 row_to_update.set_last_visit(row.last_visit()); | |
| 319 // If something appears to have changed, update the recent visits | |
| 320 // information. | |
| 321 ScheduleUpdateRecentVisits(history_service, row_id, tracker); | |
| 322 // While the URL is guaranteed to remain stable, the title may have | |
| 323 // changed. If so, then update the index with the changed words. | |
| 324 if (title_updated) { | |
| 325 // Clear all words associated with this row and re-index both the | |
| 326 // URL and title. | |
| 327 RemoveRowWordsFromIndex(row_to_update); | |
| 328 row_to_update.set_title(row.title()); | |
| 329 RowWordStarts word_starts; | |
| 330 AddRowWordsToIndex(row_to_update, &word_starts, languages); | |
| 331 word_starts_map_[row_id] = word_starts; | |
| 332 } | |
| 333 row_was_updated = true; | |
| 334 } | |
| 335 } else { | |
| 336 // This indexed row no longer qualifies and will be de-indexed by | |
| 337 // clearing all words associated with this row. | |
| 338 RemoveRowFromIndex(row); | |
| 339 row_was_updated = true; | |
| 340 } | |
| 341 if (row_was_updated) | |
| 342 search_term_cache_.clear(); // This invalidates the cache. | |
| 343 return row_was_updated; | |
| 344 } | |
| 345 | |
| 346 void URLIndexPrivateData::UpdateRecentVisits( | |
| 347 history::URLID url_id, | |
| 348 const history::VisitVector& recent_visits) { | |
| 349 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id); | |
| 350 if (row_pos != history_info_map_.end()) { | |
| 351 VisitInfoVector* visits = &row_pos->second.visits; | |
| 352 visits->clear(); | |
| 353 const size_t size = | |
| 354 std::min(recent_visits.size(), kMaxVisitsToStoreInCache); | |
| 355 visits->reserve(size); | |
| 356 for (size_t i = 0; i < size; i++) { | |
| 357 // Copy from the history::VisitVector the only fields visits needs. | |
| 358 visits->push_back(std::make_pair(recent_visits[i].visit_time, | |
| 359 recent_visits[i].transition)); | |
| 360 } | |
| 361 } | |
| 362 // Else: Oddly, the URL doesn't seem to exist in the private index. | |
| 363 // Ignore this update. This can happen if, for instance, the user | |
| 364 // removes the URL from URLIndexPrivateData before the historyDB call | |
| 365 // returns. | |
| 366 } | |
| 367 | |
| 368 void URLIndexPrivateData::ScheduleUpdateRecentVisits( | |
| 369 history::HistoryService* history_service, | |
| 370 history::URLID url_id, | |
| 371 base::CancelableTaskTracker* tracker) { | |
| 372 history_service->ScheduleDBTask( | |
| 373 scoped_ptr<history::HistoryDBTask>( | |
| 374 new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker); | |
| 375 } | |
| 376 | |
| 377 // Helper functor for DeleteURL. | |
| 378 class HistoryInfoMapItemHasURL { | |
| 379 public: | |
| 380 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} | |
| 381 | |
| 382 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) { | |
| 383 return item.second.url_row.url() == url_; | |
| 384 } | |
| 385 | |
| 386 private: | |
| 387 const GURL& url_; | |
| 388 }; | |
| 389 | |
| 390 bool URLIndexPrivateData::DeleteURL(const GURL& url) { | |
| 391 // Find the matching entry in the history_info_map_. | |
| 392 HistoryInfoMap::iterator pos = std::find_if( | |
| 393 history_info_map_.begin(), | |
| 394 history_info_map_.end(), | |
| 395 HistoryInfoMapItemHasURL(url)); | |
| 396 if (pos == history_info_map_.end()) | |
| 397 return false; | |
| 398 RemoveRowFromIndex(pos->second.url_row); | |
| 399 search_term_cache_.clear(); // This invalidates the cache. | |
| 400 return true; | |
| 401 } | |
| 402 | |
| 403 // static | |
| 404 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile( | |
| 405 const base::FilePath& file_path, | |
| 406 const std::string& languages) { | |
| 407 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
| 408 if (!base::PathExists(file_path)) | |
| 409 return NULL; | |
| 410 std::string data; | |
| 411 // If there is no cache file then simply give up. This will cause us to | |
| 412 // attempt to rebuild from the history database. | |
| 413 if (!base::ReadFileToString(file_path, &data)) | |
| 414 return NULL; | |
| 415 | |
| 416 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData); | |
| 417 InMemoryURLIndexCacheItem index_cache; | |
| 418 if (!index_cache.ParseFromArray(data.c_str(), data.size())) { | |
| 419 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from " | |
| 420 << file_path.value(); | |
| 421 return restored_data; | |
| 422 } | |
| 423 | |
| 424 if (!restored_data->RestorePrivateData(index_cache, languages)) | |
| 425 return NULL; | |
| 426 | |
| 427 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", | |
| 428 base::TimeTicks::Now() - beginning_time); | |
| 429 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", | |
| 430 restored_data->history_id_word_map_.size()); | |
| 431 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); | |
| 432 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", | |
| 433 restored_data->word_map_.size()); | |
| 434 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", | |
| 435 restored_data->char_word_map_.size()); | |
| 436 if (restored_data->Empty()) | |
| 437 return NULL; // 'No data' is the same as a failed reload. | |
| 438 return restored_data; | |
| 439 } | |
| 440 | |
| 441 // static | |
| 442 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory( | |
| 443 history::HistoryDatabase* history_db, | |
| 444 const std::string& languages, | |
| 445 const std::set<std::string>& scheme_whitelist) { | |
| 446 if (!history_db) | |
| 447 return NULL; | |
| 448 | |
| 449 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
| 450 | |
| 451 scoped_refptr<URLIndexPrivateData> | |
| 452 rebuilt_data(new URLIndexPrivateData); | |
| 453 history::URLDatabase::URLEnumerator history_enum; | |
| 454 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) | |
| 455 return NULL; | |
| 456 rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now(); | |
| 457 for (history::URLRow row; history_enum.GetNextURL(&row);) { | |
| 458 rebuilt_data->IndexRow( | |
| 459 history_db, NULL, row, languages, scheme_whitelist, NULL); | |
| 460 } | |
| 461 | |
| 462 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", | |
| 463 base::TimeTicks::Now() - beginning_time); | |
| 464 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", | |
| 465 rebuilt_data->history_id_word_map_.size()); | |
| 466 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", | |
| 467 rebuilt_data->word_map_.size()); | |
| 468 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", | |
| 469 rebuilt_data->char_word_map_.size()); | |
| 470 return rebuilt_data; | |
| 471 } | |
| 472 | |
| 473 // static | |
| 474 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask( | |
| 475 scoped_refptr<URLIndexPrivateData> private_data, | |
| 476 const base::FilePath& file_path) { | |
| 477 DCHECK(private_data.get()); | |
| 478 DCHECK(!file_path.empty()); | |
| 479 return private_data->SaveToFile(file_path); | |
| 480 } | |
| 481 | |
| 482 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const { | |
| 483 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData; | |
| 484 data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_; | |
| 485 data_copy->word_list_ = word_list_; | |
| 486 data_copy->available_words_ = available_words_; | |
| 487 data_copy->word_map_ = word_map_; | |
| 488 data_copy->char_word_map_ = char_word_map_; | |
| 489 data_copy->word_id_history_map_ = word_id_history_map_; | |
| 490 data_copy->history_id_word_map_ = history_id_word_map_; | |
| 491 data_copy->history_info_map_ = history_info_map_; | |
| 492 data_copy->word_starts_map_ = word_starts_map_; | |
| 493 return data_copy; | |
| 494 // Not copied: | |
| 495 // search_term_cache_ | |
| 496 // pre_filter_item_count_ | |
| 497 // post_filter_item_count_ | |
| 498 // post_scoring_item_count_ | |
| 499 } | |
| 500 | |
| 501 bool URLIndexPrivateData::Empty() const { | |
| 502 return history_info_map_.empty(); | |
| 503 } | |
| 504 | |
| 505 void URLIndexPrivateData::Clear() { | |
| 506 last_time_rebuilt_from_history_ = base::Time(); | |
| 507 word_list_.clear(); | |
| 508 available_words_.clear(); | |
| 509 word_map_.clear(); | |
| 510 char_word_map_.clear(); | |
| 511 word_id_history_map_.clear(); | |
| 512 history_id_word_map_.clear(); | |
| 513 history_info_map_.clear(); | |
| 514 word_starts_map_.clear(); | |
| 515 } | |
| 516 | |
| 517 URLIndexPrivateData::~URLIndexPrivateData() {} | |
| 518 | |
| 519 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( | |
| 520 const String16Vector& unsorted_words) { | |
| 521 // Break the terms down into individual terms (words), get the candidate | |
| 522 // set for each term, and intersect each to get a final candidate list. | |
| 523 // Note that a single 'term' from the user's perspective might be | |
| 524 // a string like "http://www.somewebsite.com" which, from our perspective, | |
| 525 // is four words: 'http', 'www', 'somewebsite', and 'com'. | |
| 526 HistoryIDSet history_id_set; | |
| 527 String16Vector words(unsorted_words); | |
| 528 // Sort the words into the longest first as such are likely to narrow down | |
| 529 // the results quicker. Also, single character words are the most expensive | |
| 530 // to process so save them for last. | |
| 531 std::sort(words.begin(), words.end(), LengthGreater); | |
| 532 for (String16Vector::iterator iter = words.begin(); iter != words.end(); | |
| 533 ++iter) { | |
| 534 base::string16 uni_word = *iter; | |
| 535 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); | |
| 536 if (term_history_set.empty()) { | |
| 537 history_id_set.clear(); | |
| 538 break; | |
| 539 } | |
| 540 if (iter == words.begin()) { | |
| 541 history_id_set.swap(term_history_set); | |
| 542 } else { | |
| 543 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>( | |
| 544 history_id_set, term_history_set); | |
| 545 history_id_set.swap(new_history_id_set); | |
| 546 } | |
| 547 } | |
| 548 return history_id_set; | |
| 549 } | |
| 550 | |
| 551 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm( | |
| 552 const base::string16& term) { | |
| 553 if (term.empty()) | |
| 554 return HistoryIDSet(); | |
| 555 | |
| 556 // TODO(mrossetti): Consider optimizing for very common terms such as | |
| 557 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently | |
| 558 // occuring words in the user's searches. | |
| 559 | |
| 560 size_t term_length = term.length(); | |
| 561 WordIDSet word_id_set; | |
| 562 if (term_length > 1) { | |
| 563 // See if this term or a prefix thereof is present in the cache. | |
| 564 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); | |
| 565 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
| 566 cache_iter != search_term_cache_.end(); ++cache_iter) { | |
| 567 if (base::StartsWith(term, cache_iter->first, false) && | |
| 568 (best_prefix == search_term_cache_.end() || | |
| 569 cache_iter->first.length() > best_prefix->first.length())) | |
| 570 best_prefix = cache_iter; | |
| 571 } | |
| 572 | |
| 573 // If a prefix was found then determine the leftover characters to be used | |
| 574 // for further refining the results from that prefix. | |
| 575 Char16Set prefix_chars; | |
| 576 base::string16 leftovers(term); | |
| 577 if (best_prefix != search_term_cache_.end()) { | |
| 578 // If the prefix is an exact match for the term then grab the cached | |
| 579 // results and we're done. | |
| 580 size_t prefix_length = best_prefix->first.length(); | |
| 581 if (prefix_length == term_length) { | |
| 582 best_prefix->second.used_ = true; | |
| 583 return best_prefix->second.history_id_set_; | |
| 584 } | |
| 585 | |
| 586 // Otherwise we have a handy starting point. | |
| 587 // If there are no history results for this prefix then we can bail early | |
| 588 // as there will be no history results for the full term. | |
| 589 if (best_prefix->second.history_id_set_.empty()) { | |
| 590 search_term_cache_[term] = SearchTermCacheItem(); | |
| 591 return HistoryIDSet(); | |
| 592 } | |
| 593 word_id_set = best_prefix->second.word_id_set_; | |
| 594 prefix_chars = Char16SetFromString16(best_prefix->first); | |
| 595 leftovers = term.substr(prefix_length); | |
| 596 } | |
| 597 | |
| 598 // Filter for each remaining, unique character in the term. | |
| 599 Char16Set leftover_chars = Char16SetFromString16(leftovers); | |
| 600 Char16Set unique_chars = | |
| 601 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars); | |
| 602 | |
| 603 // Reduce the word set with any leftover, unprocessed characters. | |
| 604 if (!unique_chars.empty()) { | |
| 605 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars)); | |
| 606 // We might come up empty on the leftovers. | |
| 607 if (leftover_set.empty()) { | |
| 608 search_term_cache_[term] = SearchTermCacheItem(); | |
| 609 return HistoryIDSet(); | |
| 610 } | |
| 611 // Or there may not have been a prefix from which to start. | |
| 612 if (prefix_chars.empty()) { | |
| 613 word_id_set.swap(leftover_set); | |
| 614 } else { | |
| 615 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( | |
| 616 word_id_set, leftover_set); | |
| 617 word_id_set.swap(new_word_id_set); | |
| 618 } | |
| 619 } | |
| 620 | |
| 621 // We must filter the word list because the resulting word set surely | |
| 622 // contains words which do not have the search term as a proper subset. | |
| 623 for (WordIDSet::iterator word_set_iter = word_id_set.begin(); | |
| 624 word_set_iter != word_id_set.end(); ) { | |
| 625 if (word_list_[*word_set_iter].find(term) == base::string16::npos) | |
| 626 word_id_set.erase(word_set_iter++); | |
| 627 else | |
| 628 ++word_set_iter; | |
| 629 } | |
| 630 } else { | |
| 631 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); | |
| 632 } | |
| 633 | |
| 634 // If any words resulted then we can compose a set of history IDs by unioning | |
| 635 // the sets from each word. | |
| 636 HistoryIDSet history_id_set; | |
| 637 if (!word_id_set.empty()) { | |
| 638 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); | |
| 639 word_id_iter != word_id_set.end(); ++word_id_iter) { | |
| 640 WordID word_id = *word_id_iter; | |
| 641 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); | |
| 642 if (word_iter != word_id_history_map_.end()) { | |
| 643 HistoryIDSet& word_history_id_set(word_iter->second); | |
| 644 history_id_set.insert(word_history_id_set.begin(), | |
| 645 word_history_id_set.end()); | |
| 646 } | |
| 647 } | |
| 648 } | |
| 649 | |
| 650 // Record a new cache entry for this word if the term is longer than | |
| 651 // a single character. | |
| 652 if (term_length > 1) | |
| 653 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); | |
| 654 | |
| 655 return history_id_set; | |
| 656 } | |
| 657 | |
| 658 WordIDSet URLIndexPrivateData::WordIDSetForTermChars( | |
| 659 const Char16Set& term_chars) { | |
| 660 WordIDSet word_id_set; | |
| 661 for (Char16Set::const_iterator c_iter = term_chars.begin(); | |
| 662 c_iter != term_chars.end(); ++c_iter) { | |
| 663 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); | |
| 664 if (char_iter == char_word_map_.end()) { | |
| 665 // A character was not found so there are no matching results: bail. | |
| 666 word_id_set.clear(); | |
| 667 break; | |
| 668 } | |
| 669 WordIDSet& char_word_id_set(char_iter->second); | |
| 670 // It is possible for there to no longer be any words associated with | |
| 671 // a particular character. Give up in that case. | |
| 672 if (char_word_id_set.empty()) { | |
| 673 word_id_set.clear(); | |
| 674 break; | |
| 675 } | |
| 676 | |
| 677 if (c_iter == term_chars.begin()) { | |
| 678 // First character results becomes base set of results. | |
| 679 word_id_set = char_word_id_set; | |
| 680 } else { | |
| 681 // Subsequent character results get intersected in. | |
| 682 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( | |
| 683 word_id_set, char_word_id_set); | |
| 684 word_id_set.swap(new_word_id_set); | |
| 685 } | |
| 686 } | |
| 687 return word_id_set; | |
| 688 } | |
| 689 | |
| 690 bool URLIndexPrivateData::IndexRow( | |
| 691 history::HistoryDatabase* history_db, | |
| 692 history::HistoryService* history_service, | |
| 693 const history::URLRow& row, | |
| 694 const std::string& languages, | |
| 695 const std::set<std::string>& scheme_whitelist, | |
| 696 base::CancelableTaskTracker* tracker) { | |
| 697 const GURL& gurl(row.url()); | |
| 698 | |
| 699 // Index only URLs with a whitelisted scheme. | |
| 700 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist)) | |
| 701 return false; | |
| 702 | |
| 703 history::URLID row_id = row.id(); | |
| 704 // Strip out username and password before saving and indexing. | |
| 705 base::string16 url(net::FormatUrl(gurl, languages, | |
| 706 net::kFormatUrlOmitUsernamePassword, | |
| 707 net::UnescapeRule::NONE, | |
| 708 NULL, NULL, NULL)); | |
| 709 | |
| 710 HistoryID history_id = static_cast<HistoryID>(row_id); | |
| 711 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max()); | |
| 712 | |
| 713 // Add the row for quick lookup in the history info store. | |
| 714 history::URLRow new_row(GURL(url), row_id); | |
| 715 new_row.set_visit_count(row.visit_count()); | |
| 716 new_row.set_typed_count(row.typed_count()); | |
| 717 new_row.set_last_visit(row.last_visit()); | |
| 718 new_row.set_title(row.title()); | |
| 719 history_info_map_[history_id].url_row = new_row; | |
| 720 | |
| 721 // Index the words contained in the URL and title of the row. | |
| 722 RowWordStarts word_starts; | |
| 723 AddRowWordsToIndex(new_row, &word_starts, languages); | |
| 724 word_starts_map_[history_id] = word_starts; | |
| 725 | |
| 726 // Update the recent visits information or schedule the update | |
| 727 // as appropriate. | |
| 728 if (history_db) { | |
| 729 // We'd like to check that we're on the history DB thread. | |
| 730 // However, unittest code actually calls this on the UI thread. | |
| 731 // So we don't do any thread checks. | |
| 732 history::VisitVector recent_visits; | |
| 733 // Make sure the private data is going to get as many recent visits as | |
| 734 // ScoredHistoryMatch::GetFrequency() hopes to use. | |
| 735 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore); | |
| 736 if (history_db->GetMostRecentVisitsForURL(row_id, | |
| 737 kMaxVisitsToStoreInCache, | |
| 738 &recent_visits)) | |
| 739 UpdateRecentVisits(row_id, recent_visits); | |
| 740 } else { | |
| 741 DCHECK(tracker); | |
| 742 DCHECK(history_service); | |
| 743 ScheduleUpdateRecentVisits(history_service, row_id, tracker); | |
| 744 } | |
| 745 | |
| 746 return true; | |
| 747 } | |
| 748 | |
| 749 void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow& row, | |
| 750 RowWordStarts* word_starts, | |
| 751 const std::string& languages) { | |
| 752 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
| 753 // Split URL into individual, unique words then add in the title words. | |
| 754 const GURL& gurl(row.url()); | |
| 755 const base::string16& url = | |
| 756 bookmarks::CleanUpUrlForMatching(gurl, languages, NULL); | |
| 757 String16Set url_words = String16SetFromString16(url, | |
| 758 word_starts ? &word_starts->url_word_starts_ : NULL); | |
| 759 const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title()); | |
| 760 String16Set title_words = String16SetFromString16(title, | |
| 761 word_starts ? &word_starts->title_word_starts_ : NULL); | |
| 762 String16Set words = base::STLSetUnion<String16Set>(url_words, title_words); | |
| 763 for (String16Set::iterator word_iter = words.begin(); | |
| 764 word_iter != words.end(); ++word_iter) | |
| 765 AddWordToIndex(*word_iter, history_id); | |
| 766 | |
| 767 search_term_cache_.clear(); // Invalidate the term cache. | |
| 768 } | |
| 769 | |
| 770 void URLIndexPrivateData::AddWordToIndex(const base::string16& term, | |
| 771 HistoryID history_id) { | |
| 772 WordMap::iterator word_pos = word_map_.find(term); | |
| 773 if (word_pos != word_map_.end()) | |
| 774 UpdateWordHistory(word_pos->second, history_id); | |
| 775 else | |
| 776 AddWordHistory(term, history_id); | |
| 777 } | |
| 778 | |
| 779 void URLIndexPrivateData::AddWordHistory(const base::string16& term, | |
| 780 HistoryID history_id) { | |
| 781 WordID word_id = word_list_.size(); | |
| 782 if (available_words_.empty()) { | |
| 783 word_list_.push_back(term); | |
| 784 } else { | |
| 785 word_id = *(available_words_.begin()); | |
| 786 word_list_[word_id] = term; | |
| 787 available_words_.erase(word_id); | |
| 788 } | |
| 789 word_map_[term] = word_id; | |
| 790 | |
| 791 HistoryIDSet history_id_set; | |
| 792 history_id_set.insert(history_id); | |
| 793 word_id_history_map_[word_id] = history_id_set; | |
| 794 AddToHistoryIDWordMap(history_id, word_id); | |
| 795 | |
| 796 // For each character in the newly added word (i.e. a word that is not | |
| 797 // already in the word index), add the word to the character index. | |
| 798 Char16Set characters = Char16SetFromString16(term); | |
| 799 for (Char16Set::iterator uni_char_iter = characters.begin(); | |
| 800 uni_char_iter != characters.end(); ++uni_char_iter) { | |
| 801 base::char16 uni_char = *uni_char_iter; | |
| 802 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); | |
| 803 if (char_iter != char_word_map_.end()) { | |
| 804 // Update existing entry in the char/word index. | |
| 805 WordIDSet& word_id_set(char_iter->second); | |
| 806 word_id_set.insert(word_id); | |
| 807 } else { | |
| 808 // Create a new entry in the char/word index. | |
| 809 WordIDSet word_id_set; | |
| 810 word_id_set.insert(word_id); | |
| 811 char_word_map_[uni_char] = word_id_set; | |
| 812 } | |
| 813 } | |
| 814 } | |
| 815 | |
| 816 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, | |
| 817 HistoryID history_id) { | |
| 818 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); | |
| 819 DCHECK(history_pos != word_id_history_map_.end()); | |
| 820 HistoryIDSet& history_id_set(history_pos->second); | |
| 821 history_id_set.insert(history_id); | |
| 822 AddToHistoryIDWordMap(history_id, word_id); | |
| 823 } | |
| 824 | |
| 825 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, | |
| 826 WordID word_id) { | |
| 827 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); | |
| 828 if (iter != history_id_word_map_.end()) { | |
| 829 WordIDSet& word_id_set(iter->second); | |
| 830 word_id_set.insert(word_id); | |
| 831 } else { | |
| 832 WordIDSet word_id_set; | |
| 833 word_id_set.insert(word_id); | |
| 834 history_id_word_map_[history_id] = word_id_set; | |
| 835 } | |
| 836 } | |
| 837 | |
| 838 void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow& row) { | |
| 839 RemoveRowWordsFromIndex(row); | |
| 840 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
| 841 history_info_map_.erase(history_id); | |
| 842 word_starts_map_.erase(history_id); | |
| 843 } | |
| 844 | |
| 845 void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow& row) { | |
| 846 // Remove the entries in history_id_word_map_ and word_id_history_map_ for | |
| 847 // this row. | |
| 848 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
| 849 WordIDSet word_id_set = history_id_word_map_[history_id]; | |
| 850 history_id_word_map_.erase(history_id); | |
| 851 | |
| 852 // Reconcile any changes to word usage. | |
| 853 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); | |
| 854 word_id_iter != word_id_set.end(); ++word_id_iter) { | |
| 855 WordID word_id = *word_id_iter; | |
| 856 word_id_history_map_[word_id].erase(history_id); | |
| 857 if (!word_id_history_map_[word_id].empty()) | |
| 858 continue; // The word is still in use. | |
| 859 | |
| 860 // The word is no longer in use. Reconcile any changes to character usage. | |
| 861 base::string16 word = word_list_[word_id]; | |
| 862 Char16Set characters = Char16SetFromString16(word); | |
| 863 for (Char16Set::iterator uni_char_iter = characters.begin(); | |
| 864 uni_char_iter != characters.end(); ++uni_char_iter) { | |
| 865 base::char16 uni_char = *uni_char_iter; | |
| 866 char_word_map_[uni_char].erase(word_id); | |
| 867 if (char_word_map_[uni_char].empty()) | |
| 868 char_word_map_.erase(uni_char); // No longer in use. | |
| 869 } | |
| 870 | |
| 871 // Complete the removal of references to the word. | |
| 872 word_id_history_map_.erase(word_id); | |
| 873 word_map_.erase(word); | |
| 874 word_list_[word_id] = base::string16(); | |
| 875 available_words_.insert(word_id); | |
| 876 } | |
| 877 } | |
| 878 | |
| 879 void URLIndexPrivateData::ResetSearchTermCache() { | |
| 880 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); | |
| 881 iter != search_term_cache_.end(); ++iter) | |
| 882 iter->second.used_ = false; | |
| 883 } | |
| 884 | |
| 885 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) { | |
| 886 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
| 887 InMemoryURLIndexCacheItem index_cache; | |
| 888 SavePrivateData(&index_cache); | |
| 889 std::string data; | |
| 890 if (!index_cache.SerializeToString(&data)) { | |
| 891 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; | |
| 892 return false; | |
| 893 } | |
| 894 | |
| 895 int size = data.size(); | |
| 896 if (base::WriteFile(file_path, data.c_str(), size) != size) { | |
| 897 LOG(WARNING) << "Failed to write " << file_path.value(); | |
| 898 return false; | |
| 899 } | |
| 900 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", | |
| 901 base::TimeTicks::Now() - beginning_time); | |
| 902 return true; | |
| 903 } | |
| 904 | |
| 905 void URLIndexPrivateData::SavePrivateData( | |
| 906 InMemoryURLIndexCacheItem* cache) const { | |
| 907 DCHECK(cache); | |
| 908 cache->set_last_rebuild_timestamp( | |
| 909 last_time_rebuilt_from_history_.ToInternalValue()); | |
| 910 cache->set_version(saved_cache_version_); | |
| 911 // history_item_count_ is no longer used but rather than change the protobuf | |
| 912 // definition use a placeholder. This will go away with the switch to SQLite. | |
| 913 cache->set_history_item_count(0); | |
| 914 SaveWordList(cache); | |
| 915 SaveWordMap(cache); | |
| 916 SaveCharWordMap(cache); | |
| 917 SaveWordIDHistoryMap(cache); | |
| 918 SaveHistoryInfoMap(cache); | |
| 919 SaveWordStartsMap(cache); | |
| 920 } | |
| 921 | |
| 922 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { | |
| 923 if (word_list_.empty()) | |
| 924 return; | |
| 925 WordListItem* list_item = cache->mutable_word_list(); | |
| 926 list_item->set_word_count(word_list_.size()); | |
| 927 for (String16Vector::const_iterator iter = word_list_.begin(); | |
| 928 iter != word_list_.end(); ++iter) | |
| 929 list_item->add_word(base::UTF16ToUTF8(*iter)); | |
| 930 } | |
| 931 | |
| 932 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { | |
| 933 if (word_map_.empty()) | |
| 934 return; | |
| 935 WordMapItem* map_item = cache->mutable_word_map(); | |
| 936 map_item->set_item_count(word_map_.size()); | |
| 937 for (WordMap::const_iterator iter = word_map_.begin(); | |
| 938 iter != word_map_.end(); ++iter) { | |
| 939 WordMapEntry* map_entry = map_item->add_word_map_entry(); | |
| 940 map_entry->set_word(base::UTF16ToUTF8(iter->first)); | |
| 941 map_entry->set_word_id(iter->second); | |
| 942 } | |
| 943 } | |
| 944 | |
| 945 void URLIndexPrivateData::SaveCharWordMap( | |
| 946 InMemoryURLIndexCacheItem* cache) const { | |
| 947 if (char_word_map_.empty()) | |
| 948 return; | |
| 949 CharWordMapItem* map_item = cache->mutable_char_word_map(); | |
| 950 map_item->set_item_count(char_word_map_.size()); | |
| 951 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); | |
| 952 iter != char_word_map_.end(); ++iter) { | |
| 953 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); | |
| 954 map_entry->set_char_16(iter->first); | |
| 955 const WordIDSet& word_id_set(iter->second); | |
| 956 map_entry->set_item_count(word_id_set.size()); | |
| 957 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); | |
| 958 set_iter != word_id_set.end(); ++set_iter) | |
| 959 map_entry->add_word_id(*set_iter); | |
| 960 } | |
| 961 } | |
| 962 | |
| 963 void URLIndexPrivateData::SaveWordIDHistoryMap( | |
| 964 InMemoryURLIndexCacheItem* cache) const { | |
| 965 if (word_id_history_map_.empty()) | |
| 966 return; | |
| 967 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); | |
| 968 map_item->set_item_count(word_id_history_map_.size()); | |
| 969 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); | |
| 970 iter != word_id_history_map_.end(); ++iter) { | |
| 971 WordIDHistoryMapEntry* map_entry = | |
| 972 map_item->add_word_id_history_map_entry(); | |
| 973 map_entry->set_word_id(iter->first); | |
| 974 const HistoryIDSet& history_id_set(iter->second); | |
| 975 map_entry->set_item_count(history_id_set.size()); | |
| 976 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); | |
| 977 set_iter != history_id_set.end(); ++set_iter) | |
| 978 map_entry->add_history_id(*set_iter); | |
| 979 } | |
| 980 } | |
| 981 | |
| 982 void URLIndexPrivateData::SaveHistoryInfoMap( | |
| 983 InMemoryURLIndexCacheItem* cache) const { | |
| 984 if (history_info_map_.empty()) | |
| 985 return; | |
| 986 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); | |
| 987 map_item->set_item_count(history_info_map_.size()); | |
| 988 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); | |
| 989 iter != history_info_map_.end(); ++iter) { | |
| 990 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); | |
| 991 map_entry->set_history_id(iter->first); | |
| 992 const history::URLRow& url_row(iter->second.url_row); | |
| 993 // Note: We only save information that contributes to the index so there | |
| 994 // is no need to save search_term_cache_ (not persistent). | |
| 995 map_entry->set_visit_count(url_row.visit_count()); | |
| 996 map_entry->set_typed_count(url_row.typed_count()); | |
| 997 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); | |
| 998 map_entry->set_url(url_row.url().spec()); | |
| 999 map_entry->set_title(base::UTF16ToUTF8(url_row.title())); | |
| 1000 const VisitInfoVector& visits(iter->second.visits); | |
| 1001 for (VisitInfoVector::const_iterator visit_iter = visits.begin(); | |
| 1002 visit_iter != visits.end(); ++visit_iter) { | |
| 1003 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits(); | |
| 1004 visit_info->set_visit_time(visit_iter->first.ToInternalValue()); | |
| 1005 visit_info->set_transition_type(visit_iter->second); | |
| 1006 } | |
| 1007 } | |
| 1008 } | |
| 1009 | |
| 1010 void URLIndexPrivateData::SaveWordStartsMap( | |
| 1011 InMemoryURLIndexCacheItem* cache) const { | |
| 1012 if (word_starts_map_.empty()) | |
| 1013 return; | |
| 1014 // For unit testing: Enable saving of the cache as an earlier version to | |
| 1015 // allow testing of cache file upgrading in ReadFromFile(). | |
| 1016 // TODO(mrossetti): Instead of intruding on production code with this kind of | |
| 1017 // test harness, save a copy of an older version cache with known results. | |
| 1018 // Implement this when switching the caching over to SQLite. | |
| 1019 if (saved_cache_version_ < 1) | |
| 1020 return; | |
| 1021 | |
| 1022 WordStartsMapItem* map_item = cache->mutable_word_starts_map(); | |
| 1023 map_item->set_item_count(word_starts_map_.size()); | |
| 1024 for (WordStartsMap::const_iterator iter = word_starts_map_.begin(); | |
| 1025 iter != word_starts_map_.end(); ++iter) { | |
| 1026 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry(); | |
| 1027 map_entry->set_history_id(iter->first); | |
| 1028 const RowWordStarts& word_starts(iter->second); | |
| 1029 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin(); | |
| 1030 i != word_starts.url_word_starts_.end(); ++i) | |
| 1031 map_entry->add_url_word_starts(*i); | |
| 1032 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin(); | |
| 1033 i != word_starts.title_word_starts_.end(); ++i) | |
| 1034 map_entry->add_title_word_starts(*i); | |
| 1035 } | |
| 1036 } | |
| 1037 | |
| 1038 bool URLIndexPrivateData::RestorePrivateData( | |
| 1039 const InMemoryURLIndexCacheItem& cache, | |
| 1040 const std::string& languages) { | |
| 1041 last_time_rebuilt_from_history_ = | |
| 1042 base::Time::FromInternalValue(cache.last_rebuild_timestamp()); | |
| 1043 const base::TimeDelta rebuilt_ago = | |
| 1044 base::Time::Now() - last_time_rebuilt_from_history_; | |
| 1045 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) || | |
| 1046 (rebuilt_ago < base::TimeDelta::FromDays(-1))) { | |
| 1047 // Cache is more than a week old or, somehow, from some time in the future. | |
| 1048 // It's probably a good time to rebuild the index from history to | |
| 1049 // allow synced entries to now appear, expired entries to disappear, etc. | |
| 1050 // Allow one day in the future to make the cache not rebuild on simple | |
| 1051 // system clock changes such as time zone changes. | |
| 1052 return false; | |
| 1053 } | |
| 1054 if (cache.has_version()) { | |
| 1055 if (cache.version() < kCurrentCacheFileVersion) { | |
| 1056 // Don't try to restore an old format cache file. (This will cause | |
| 1057 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData | |
| 1058 // from history.) | |
| 1059 return false; | |
| 1060 } | |
| 1061 restored_cache_version_ = cache.version(); | |
| 1062 } | |
| 1063 return RestoreWordList(cache) && RestoreWordMap(cache) && | |
| 1064 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && | |
| 1065 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages); | |
| 1066 } | |
| 1067 | |
| 1068 bool URLIndexPrivateData::RestoreWordList( | |
| 1069 const InMemoryURLIndexCacheItem& cache) { | |
| 1070 if (!cache.has_word_list()) | |
| 1071 return false; | |
| 1072 const WordListItem& list_item(cache.word_list()); | |
| 1073 uint32 expected_item_count = list_item.word_count(); | |
| 1074 uint32 actual_item_count = list_item.word_size(); | |
| 1075 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1076 return false; | |
| 1077 const RepeatedPtrField<std::string>& words(list_item.word()); | |
| 1078 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); | |
| 1079 iter != words.end(); ++iter) | |
| 1080 word_list_.push_back(base::UTF8ToUTF16(*iter)); | |
| 1081 return true; | |
| 1082 } | |
| 1083 | |
| 1084 bool URLIndexPrivateData::RestoreWordMap( | |
| 1085 const InMemoryURLIndexCacheItem& cache) { | |
| 1086 if (!cache.has_word_map()) | |
| 1087 return false; | |
| 1088 const WordMapItem& list_item(cache.word_map()); | |
| 1089 uint32 expected_item_count = list_item.item_count(); | |
| 1090 uint32 actual_item_count = list_item.word_map_entry_size(); | |
| 1091 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1092 return false; | |
| 1093 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); | |
| 1094 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); | |
| 1095 iter != entries.end(); ++iter) | |
| 1096 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id(); | |
| 1097 return true; | |
| 1098 } | |
| 1099 | |
| 1100 bool URLIndexPrivateData::RestoreCharWordMap( | |
| 1101 const InMemoryURLIndexCacheItem& cache) { | |
| 1102 if (!cache.has_char_word_map()) | |
| 1103 return false; | |
| 1104 const CharWordMapItem& list_item(cache.char_word_map()); | |
| 1105 uint32 expected_item_count = list_item.item_count(); | |
| 1106 uint32 actual_item_count = list_item.char_word_map_entry_size(); | |
| 1107 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1108 return false; | |
| 1109 const RepeatedPtrField<CharWordMapEntry>& | |
| 1110 entries(list_item.char_word_map_entry()); | |
| 1111 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter = | |
| 1112 entries.begin(); iter != entries.end(); ++iter) { | |
| 1113 expected_item_count = iter->item_count(); | |
| 1114 actual_item_count = iter->word_id_size(); | |
| 1115 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1116 return false; | |
| 1117 base::char16 uni_char = static_cast<base::char16>(iter->char_16()); | |
| 1118 WordIDSet word_id_set; | |
| 1119 const RepeatedField<int32>& word_ids(iter->word_id()); | |
| 1120 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); | |
| 1121 jiter != word_ids.end(); ++jiter) | |
| 1122 word_id_set.insert(*jiter); | |
| 1123 char_word_map_[uni_char] = word_id_set; | |
| 1124 } | |
| 1125 return true; | |
| 1126 } | |
| 1127 | |
| 1128 bool URLIndexPrivateData::RestoreWordIDHistoryMap( | |
| 1129 const InMemoryURLIndexCacheItem& cache) { | |
| 1130 if (!cache.has_word_id_history_map()) | |
| 1131 return false; | |
| 1132 const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); | |
| 1133 uint32 expected_item_count = list_item.item_count(); | |
| 1134 uint32 actual_item_count = list_item.word_id_history_map_entry_size(); | |
| 1135 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1136 return false; | |
| 1137 const RepeatedPtrField<WordIDHistoryMapEntry>& | |
| 1138 entries(list_item.word_id_history_map_entry()); | |
| 1139 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = | |
| 1140 entries.begin(); iter != entries.end(); ++iter) { | |
| 1141 expected_item_count = iter->item_count(); | |
| 1142 actual_item_count = iter->history_id_size(); | |
| 1143 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1144 return false; | |
| 1145 WordID word_id = iter->word_id(); | |
| 1146 HistoryIDSet history_id_set; | |
| 1147 const RepeatedField<int64>& history_ids(iter->history_id()); | |
| 1148 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin(); | |
| 1149 jiter != history_ids.end(); ++jiter) { | |
| 1150 history_id_set.insert(*jiter); | |
| 1151 AddToHistoryIDWordMap(*jiter, word_id); | |
| 1152 } | |
| 1153 word_id_history_map_[word_id] = history_id_set; | |
| 1154 } | |
| 1155 return true; | |
| 1156 } | |
| 1157 | |
| 1158 bool URLIndexPrivateData::RestoreHistoryInfoMap( | |
| 1159 const InMemoryURLIndexCacheItem& cache) { | |
| 1160 if (!cache.has_history_info_map()) | |
| 1161 return false; | |
| 1162 const HistoryInfoMapItem& list_item(cache.history_info_map()); | |
| 1163 uint32 expected_item_count = list_item.item_count(); | |
| 1164 uint32 actual_item_count = list_item.history_info_map_entry_size(); | |
| 1165 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1166 return false; | |
| 1167 const RepeatedPtrField<HistoryInfoMapEntry>& | |
| 1168 entries(list_item.history_info_map_entry()); | |
| 1169 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter = | |
| 1170 entries.begin(); iter != entries.end(); ++iter) { | |
| 1171 HistoryID history_id = iter->history_id(); | |
| 1172 GURL url(iter->url()); | |
| 1173 history::URLRow url_row(url, history_id); | |
| 1174 url_row.set_visit_count(iter->visit_count()); | |
| 1175 url_row.set_typed_count(iter->typed_count()); | |
| 1176 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); | |
| 1177 if (iter->has_title()) { | |
| 1178 base::string16 title(base::UTF8ToUTF16(iter->title())); | |
| 1179 url_row.set_title(title); | |
| 1180 } | |
| 1181 history_info_map_[history_id].url_row = url_row; | |
| 1182 | |
| 1183 // Restore visits list. | |
| 1184 VisitInfoVector visits; | |
| 1185 visits.reserve(iter->visits_size()); | |
| 1186 for (int i = 0; i < iter->visits_size(); ++i) { | |
| 1187 visits.push_back(std::make_pair( | |
| 1188 base::Time::FromInternalValue(iter->visits(i).visit_time()), | |
| 1189 ui::PageTransitionFromInt(iter->visits(i).transition_type()))); | |
| 1190 } | |
| 1191 history_info_map_[history_id].visits = visits; | |
| 1192 } | |
| 1193 return true; | |
| 1194 } | |
| 1195 | |
| 1196 bool URLIndexPrivateData::RestoreWordStartsMap( | |
| 1197 const InMemoryURLIndexCacheItem& cache, | |
| 1198 const std::string& languages) { | |
| 1199 // Note that this function must be called after RestoreHistoryInfoMap() has | |
| 1200 // been run as the word starts may have to be recalculated from the urls and | |
| 1201 // page titles. | |
| 1202 if (cache.has_word_starts_map()) { | |
| 1203 const WordStartsMapItem& list_item(cache.word_starts_map()); | |
| 1204 uint32 expected_item_count = list_item.item_count(); | |
| 1205 uint32 actual_item_count = list_item.word_starts_map_entry_size(); | |
| 1206 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
| 1207 return false; | |
| 1208 const RepeatedPtrField<WordStartsMapEntry>& | |
| 1209 entries(list_item.word_starts_map_entry()); | |
| 1210 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter = | |
| 1211 entries.begin(); iter != entries.end(); ++iter) { | |
| 1212 HistoryID history_id = iter->history_id(); | |
| 1213 RowWordStarts word_starts; | |
| 1214 // Restore the URL word starts. | |
| 1215 const RepeatedField<int32>& url_starts(iter->url_word_starts()); | |
| 1216 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin(); | |
| 1217 jiter != url_starts.end(); ++jiter) | |
| 1218 word_starts.url_word_starts_.push_back(*jiter); | |
| 1219 // Restore the page title word starts. | |
| 1220 const RepeatedField<int32>& title_starts(iter->title_word_starts()); | |
| 1221 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin(); | |
| 1222 jiter != title_starts.end(); ++jiter) | |
| 1223 word_starts.title_word_starts_.push_back(*jiter); | |
| 1224 word_starts_map_[history_id] = word_starts; | |
| 1225 } | |
| 1226 } else { | |
| 1227 // Since the cache did not contain any word starts we must rebuild then from | |
| 1228 // the URL and page titles. | |
| 1229 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); | |
| 1230 iter != history_info_map_.end(); ++iter) { | |
| 1231 RowWordStarts word_starts; | |
| 1232 const history::URLRow& row(iter->second.url_row); | |
| 1233 const base::string16& url = | |
| 1234 bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL); | |
| 1235 String16VectorFromString16(url, false, &word_starts.url_word_starts_); | |
| 1236 const base::string16& title = | |
| 1237 bookmarks::CleanUpTitleForMatching(row.title()); | |
| 1238 String16VectorFromString16(title, false, &word_starts.title_word_starts_); | |
| 1239 word_starts_map_[iter->first] = word_starts; | |
| 1240 } | |
| 1241 } | |
| 1242 return true; | |
| 1243 } | |
| 1244 | |
| 1245 // static | |
| 1246 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | |
| 1247 const GURL& gurl, | |
| 1248 const std::set<std::string>& whitelist) { | |
| 1249 return whitelist.find(gurl.scheme()) != whitelist.end(); | |
| 1250 } | |
| 1251 | |
| 1252 | |
| 1253 // SearchTermCacheItem --------------------------------------------------------- | |
| 1254 | |
| 1255 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( | |
| 1256 const WordIDSet& word_id_set, | |
| 1257 const HistoryIDSet& history_id_set) | |
| 1258 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) { | |
| 1259 } | |
| 1260 | |
| 1261 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { | |
| 1262 } | |
| 1263 | |
| 1264 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() { | |
| 1265 } | |
| 1266 | |
| 1267 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- | |
| 1268 | |
| 1269 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( | |
| 1270 bookmarks::BookmarkModel* bookmark_model, | |
| 1271 const URLIndexPrivateData& private_data, | |
| 1272 const std::string& languages, | |
| 1273 const base::string16& lower_string, | |
| 1274 const String16Vector& lower_terms, | |
| 1275 const base::Time now) | |
| 1276 : bookmark_model_(bookmark_model), | |
| 1277 private_data_(private_data), | |
| 1278 languages_(languages), | |
| 1279 lower_string_(lower_string), | |
| 1280 lower_terms_(lower_terms), | |
| 1281 now_(now) { | |
| 1282 // Calculate offsets for each term. For instance, the offset for | |
| 1283 // ".net" should be 1, indicating that the actual word-part of the term | |
| 1284 // starts at offset 1. | |
| 1285 lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u); | |
| 1286 for (size_t i = 0; i < lower_terms_.size(); ++i) { | |
| 1287 base::i18n::BreakIterator iter(lower_terms_[i], | |
| 1288 base::i18n::BreakIterator::BREAK_WORD); | |
| 1289 // If the iterator doesn't work, assume an offset of 0. | |
| 1290 if (!iter.Init()) | |
| 1291 continue; | |
| 1292 // Find the first word start. If the iterator didn't find a word break, set | |
| 1293 // an offset of term size. For example, the offset for "://" should be 3, | |
| 1294 // indicating that the word-part is missing. | |
| 1295 while (iter.Advance() && !iter.IsWord()) {} | |
| 1296 | |
| 1297 lower_terms_to_word_starts_offsets_[i] = iter.prev(); | |
| 1298 } | |
| 1299 } | |
| 1300 | |
| 1301 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() { | |
| 1302 } | |
| 1303 | |
| 1304 void URLIndexPrivateData::AddHistoryMatch::operator()( | |
| 1305 const HistoryID history_id) { | |
| 1306 HistoryInfoMap::const_iterator hist_pos = | |
| 1307 private_data_.history_info_map_.find(history_id); | |
| 1308 if (hist_pos != private_data_.history_info_map_.end()) { | |
| 1309 const history::URLRow& hist_item = hist_pos->second.url_row; | |
| 1310 const VisitInfoVector& visits = hist_pos->second.visits; | |
| 1311 WordStartsMap::const_iterator starts_pos = | |
| 1312 private_data_.word_starts_map_.find(history_id); | |
| 1313 DCHECK(starts_pos != private_data_.word_starts_map_.end()); | |
| 1314 ScoredHistoryMatch match( | |
| 1315 hist_item, visits, languages_, lower_string_, lower_terms_, | |
| 1316 lower_terms_to_word_starts_offsets_, starts_pos->second, | |
| 1317 bookmark_model_ && bookmark_model_->IsBookmarked(hist_item.url()), | |
| 1318 now_); | |
| 1319 if (match.raw_score > 0) | |
| 1320 scored_matches_.push_back(match); | |
| 1321 } | |
| 1322 } | |
| 1323 | |
| 1324 | |
| 1325 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- | |
| 1326 | |
| 1327 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( | |
| 1328 const HistoryInfoMap& history_info_map) | |
| 1329 : history_info_map_(history_info_map) { | |
| 1330 } | |
| 1331 | |
| 1332 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() { | |
| 1333 } | |
| 1334 | |
| 1335 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()( | |
| 1336 const HistoryID h1, | |
| 1337 const HistoryID h2) { | |
| 1338 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); | |
| 1339 if (entry1 == history_info_map_.end()) | |
| 1340 return false; | |
| 1341 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); | |
| 1342 if (entry2 == history_info_map_.end()) | |
| 1343 return true; | |
| 1344 const history::URLRow& r1(entry1->second.url_row); | |
| 1345 const history::URLRow& r2(entry2->second.url_row); | |
| 1346 // First cut: typed count, visit count, recency. | |
| 1347 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | |
| 1348 // recently visited (within the last 12/24 hours) as highly important. Get | |
| 1349 // input from mpearson. | |
| 1350 if (r1.typed_count() != r2.typed_count()) | |
| 1351 return (r1.typed_count() > r2.typed_count()); | |
| 1352 if (r1.visit_count() != r2.visit_count()) | |
| 1353 return (r1.visit_count() > r2.visit_count()); | |
| 1354 return (r1.last_visit() > r2.last_visit()); | |
| 1355 } | |
| OLD | NEW |