| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #ifndef CHROME_BROWSER_AUTOCOMPLETE_URL_INDEX_PRIVATE_DATA_H_ | |
| 6 #define CHROME_BROWSER_AUTOCOMPLETE_URL_INDEX_PRIVATE_DATA_H_ | |
| 7 | |
| 8 #include <set> | |
| 9 #include <string> | |
| 10 | |
| 11 #include "base/files/file_path.h" | |
| 12 #include "base/gtest_prod_util.h" | |
| 13 #include "base/memory/ref_counted.h" | |
| 14 #include "chrome/browser/autocomplete/in_memory_url_index_cache.pb.h" | |
| 15 #include "components/history/core/browser/history_service.h" | |
| 16 #include "components/omnibox/in_memory_url_index_types.h" | |
| 17 #include "components/omnibox/scored_history_match.h" | |
| 18 | |
| 19 class HistoryQuickProviderTest; | |
| 20 | |
| 21 namespace bookmarks { | |
| 22 class BookmarkModel; | |
| 23 } | |
| 24 | |
| 25 namespace in_memory_url_index { | |
| 26 class InMemoryURLIndexCacheItem; | |
| 27 } | |
| 28 | |
| 29 namespace history { | |
| 30 class HistoryDatabase; | |
| 31 class InMemoryURLIndex; | |
| 32 class RefCountedBool; | |
| 33 } | |
| 34 | |
| 35 // Current version of the cache file. | |
| 36 static const int kCurrentCacheFileVersion = 5; | |
| 37 | |
| 38 // A structure private to InMemoryURLIndex describing its internal data and | |
| 39 // providing for restoring, rebuilding and updating that internal data. As | |
| 40 // this class is for exclusive use by the InMemoryURLIndex class there should | |
| 41 // be no calls from any other class. | |
| 42 // | |
| 43 // All public member functions are called on the main thread unless otherwise | |
| 44 // annotated. | |
| 45 class URLIndexPrivateData | |
| 46 : public base::RefCountedThreadSafe<URLIndexPrivateData> { | |
| 47 public: | |
| 48 URLIndexPrivateData(); | |
| 49 | |
| 50 // Given a base::string16 in |term_string|, scans the history index and | |
| 51 // returns a vector with all scored, matching history items. The | |
| 52 // |term_string| is broken down into individual terms (words), each of which | |
| 53 // must occur in the candidate history item's URL or page title for the item | |
| 54 // to qualify; however, the terms do not necessarily have to be adjacent. We | |
| 55 // also allow breaking |term_string| at |cursor_position| (if | |
| 56 // set). Once we have a set of candidates, they are filtered to ensure | |
| 57 // that all |term_string| terms, as separated by whitespace and the | |
| 58 // cursor (if set), occur within the candidate's URL or page title. | |
| 59 // Scores are then calculated on no more than |kItemsToScoreLimit| | |
| 60 // candidates, as the scoring of such a large number of candidates may | |
| 61 // cause perceptible typing response delays in the omnibox. This is | |
| 62 // likely to occur for short omnibox terms such as 'h' and 'w' which | |
| 63 // will be found in nearly all history candidates. Results are sorted by | |
| 64 // descending score. The full results set (i.e. beyond the | |
| 65 // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls | |
| 66 // to this function. |languages| is used to help parse/format the URLs in the | |
| 67 // history index. In total, |max_matches| of items will be returned in the | |
| 68 // |ScoredHistoryMatches| vector. | |
| 69 ScoredHistoryMatches HistoryItemsForTerms( | |
| 70 base::string16 term_string, | |
| 71 size_t cursor_position, | |
| 72 size_t max_matches, | |
| 73 const std::string& languages, | |
| 74 bookmarks::BookmarkModel* bookmark_model); | |
| 75 | |
| 76 // Adds the history item in |row| to the index if it does not already already | |
| 77 // exist and it meets the minimum 'quick' criteria. If the row already exists | |
| 78 // in the index then the index will be updated if the row still meets the | |
| 79 // criteria, otherwise the row will be removed from the index. Returns true | |
| 80 // if the index was actually updated. |languages| gives a list of language | |
| 81 // encodings by which the URLs and page titles are broken down into words and | |
| 82 // characters. |scheme_whitelist| is used to filter non-qualifying schemes. | |
| 83 // |history_service| is used to schedule an update to the recent visits | |
| 84 // component of this URL's entry in the index. | |
| 85 bool UpdateURL(history::HistoryService* history_service, | |
| 86 const history::URLRow& row, | |
| 87 const std::string& languages, | |
| 88 const std::set<std::string>& scheme_whitelist, | |
| 89 base::CancelableTaskTracker* tracker); | |
| 90 | |
| 91 // Updates the entry for |url_id| in the index, replacing its | |
| 92 // recent visits information with |recent_visits|. If |url_id| | |
| 93 // is not in the index, does nothing. | |
| 94 void UpdateRecentVisits(history::URLID url_id, | |
| 95 const history::VisitVector& recent_visits); | |
| 96 | |
| 97 // Using |history_service| schedules an update (using the historyDB | |
| 98 // thread) for the recent visits information for |url_id|. Unless | |
| 99 // something unexpectedly goes wrong, UdpateRecentVisits() should | |
| 100 // eventually be called from a callback. | |
| 101 void ScheduleUpdateRecentVisits(history::HistoryService* history_service, | |
| 102 history::URLID url_id, | |
| 103 base::CancelableTaskTracker* tracker); | |
| 104 | |
| 105 // Deletes index data for the history item with the given |url|. | |
| 106 // The item may not have actually been indexed, which is the case if it did | |
| 107 // not previously meet minimum 'quick' criteria. Returns true if the index | |
| 108 // was actually updated. | |
| 109 bool DeleteURL(const GURL& url); | |
| 110 | |
| 111 // Constructs a new object by restoring its contents from the cache file | |
| 112 // at |path|. Returns the new URLIndexPrivateData which on success will | |
| 113 // contain the restored data but upon failure will be empty. |languages| | |
| 114 // is used to break URLs and page titles into words. This function | |
| 115 // should be run on the the file thread. | |
| 116 static scoped_refptr<URLIndexPrivateData> RestoreFromFile( | |
| 117 const base::FilePath& path, | |
| 118 const std::string& languages); | |
| 119 | |
| 120 // Constructs a new object by rebuilding its contents from the history | |
| 121 // database in |history_db|. Returns the new URLIndexPrivateData which on | |
| 122 // success will contain the rebuilt data but upon failure will be empty. | |
| 123 // |languages| gives a list of language encodings by which the URLs and page | |
| 124 // titles are broken down into words and characters. | |
| 125 static scoped_refptr<URLIndexPrivateData> RebuildFromHistory( | |
| 126 history::HistoryDatabase* history_db, | |
| 127 const std::string& languages, | |
| 128 const std::set<std::string>& scheme_whitelist); | |
| 129 | |
| 130 // Writes |private_data| as a cache file to |file_path| and returns success. | |
| 131 static bool WritePrivateDataToCacheFileTask( | |
| 132 scoped_refptr<URLIndexPrivateData> private_data, | |
| 133 const base::FilePath& file_path); | |
| 134 | |
| 135 // Creates a copy of ourself. | |
| 136 scoped_refptr<URLIndexPrivateData> Duplicate() const; | |
| 137 | |
| 138 // Returns true if there is no data in the index. | |
| 139 bool Empty() const; | |
| 140 | |
| 141 // Initializes all index data members in preparation for restoring the index | |
| 142 // from the cache or a complete rebuild from the history database. | |
| 143 void Clear(); | |
| 144 | |
| 145 private: | |
| 146 friend class base::RefCountedThreadSafe<URLIndexPrivateData>; | |
| 147 ~URLIndexPrivateData(); | |
| 148 | |
| 149 friend class AddHistoryMatch; | |
| 150 friend class ::HistoryQuickProviderTest; | |
| 151 friend class InMemoryURLIndexTest; | |
| 152 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, AddHistoryMatch); | |
| 153 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); | |
| 154 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet); | |
| 155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory); | |
| 156 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld); | |
| 157 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring); | |
| 158 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch); | |
| 159 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching); | |
| 160 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs); | |
| 161 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization); | |
| 162 | |
| 163 // Support caching of term results so that we can optimize searches which | |
| 164 // build upon a previous search. Each entry in this map represents one | |
| 165 // search term from the most recent search. For example, if the user had | |
| 166 // typed "google blog trans" and then typed an additional 'l' (at the end, | |
| 167 // of course) then there would be four items in the cache: 'blog', 'google', | |
| 168 // 'trans', and 'transl'. All would be marked as being in use except for the | |
| 169 // 'trans' item; its cached data would have been used when optimizing the | |
| 170 // construction of the search results candidates for 'transl' but then would | |
| 171 // no longer needed. | |
| 172 // | |
| 173 // Items stored in the search term cache. If a search term exactly matches one | |
| 174 // in the cache then we can quickly supply the proper |history_id_set_| (and | |
| 175 // marking the cache item as being |used_|. If we find a prefix for a search | |
| 176 // term in the cache (which is very likely to occur as the user types each | |
| 177 // term into the omnibox) then we can short-circuit the index search for those | |
| 178 // characters in the prefix by returning the |word_id_set|. In that case we do | |
| 179 // not mark the item as being |used_|. | |
| 180 struct SearchTermCacheItem { | |
| 181 SearchTermCacheItem(const WordIDSet& word_id_set, | |
| 182 const HistoryIDSet& history_id_set); | |
| 183 // Creates a cache item for a term which has no results. | |
| 184 SearchTermCacheItem(); | |
| 185 | |
| 186 ~SearchTermCacheItem(); | |
| 187 | |
| 188 WordIDSet word_id_set_; | |
| 189 HistoryIDSet history_id_set_; | |
| 190 bool used_; // True if this item has been used for the current term search. | |
| 191 }; | |
| 192 typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap; | |
| 193 | |
| 194 // A helper class which performs the final filter on each candidate | |
| 195 // history URL match, inserting accepted matches into |scored_matches_|. | |
| 196 class AddHistoryMatch : public std::unary_function<HistoryID, void> { | |
| 197 public: | |
| 198 AddHistoryMatch(bookmarks::BookmarkModel* bookmark_model, | |
| 199 const URLIndexPrivateData& private_data, | |
| 200 const std::string& languages, | |
| 201 const base::string16& lower_string, | |
| 202 const String16Vector& lower_terms, | |
| 203 const base::Time now); | |
| 204 ~AddHistoryMatch(); | |
| 205 | |
| 206 void operator()(const HistoryID history_id); | |
| 207 | |
| 208 ScoredHistoryMatches ScoredMatches() const { return scored_matches_; } | |
| 209 | |
| 210 private: | |
| 211 friend class InMemoryURLIndexTest; | |
| 212 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, AddHistoryMatch); | |
| 213 bookmarks::BookmarkModel* bookmark_model_; | |
| 214 const URLIndexPrivateData& private_data_; | |
| 215 const std::string& languages_; | |
| 216 ScoredHistoryMatches scored_matches_; | |
| 217 const base::string16& lower_string_; | |
| 218 const String16Vector& lower_terms_; | |
| 219 WordStarts lower_terms_to_word_starts_offsets_; | |
| 220 const base::Time now_; | |
| 221 }; | |
| 222 | |
| 223 // A helper predicate class used to filter excess history items when the | |
| 224 // candidate results set is too large. | |
| 225 class HistoryItemFactorGreater | |
| 226 : public std::binary_function<HistoryID, HistoryID, void> { | |
| 227 public: | |
| 228 explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map); | |
| 229 ~HistoryItemFactorGreater(); | |
| 230 | |
| 231 bool operator()(const HistoryID h1, const HistoryID h2); | |
| 232 | |
| 233 private: | |
| 234 const HistoryInfoMap& history_info_map_; | |
| 235 }; | |
| 236 | |
| 237 // URL History indexing support functions. | |
| 238 | |
| 239 // Composes a set of history item IDs by intersecting the set for each word | |
| 240 // in |unsorted_words|. | |
| 241 HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words); | |
| 242 | |
| 243 // Helper function to HistoryIDSetFromWords which composes a set of history | |
| 244 // ids for the given term given in |term|. | |
| 245 HistoryIDSet HistoryIDsForTerm(const base::string16& term); | |
| 246 | |
| 247 // Given a set of Char16s, finds words containing those characters. | |
| 248 WordIDSet WordIDSetForTermChars(const Char16Set& term_chars); | |
| 249 | |
| 250 // Indexes one URL history item as described by |row|. Returns true if the | |
| 251 // row was actually indexed. |languages| gives a list of language encodings by | |
| 252 // which the URLs and page titles are broken down into words and characters. | |
| 253 // |scheme_whitelist| is used to filter non-qualifying schemes. If | |
| 254 // |history_db| is not NULL then this function uses the history database | |
| 255 // synchronously to get the URL's recent visits information. This mode should | |
| 256 // only be used on the historyDB thread. If |history_db| is NULL, then | |
| 257 // this function uses |history_service| to schedule a task on the | |
| 258 // historyDB thread to fetch and update the recent visits | |
| 259 // information. | |
| 260 bool IndexRow(history::HistoryDatabase* history_db, | |
| 261 history::HistoryService* history_service, | |
| 262 const history::URLRow& row, | |
| 263 const std::string& languages, | |
| 264 const std::set<std::string>& scheme_whitelist, | |
| 265 base::CancelableTaskTracker* tracker); | |
| 266 | |
| 267 // Parses and indexes the words in the URL and page title of |row| and | |
| 268 // calculate the word starts in each, saving the starts in |word_starts|. | |
| 269 // |languages| gives a list of language encodings by which the URLs and page | |
| 270 // titles are broken down into words and characters. | |
| 271 void AddRowWordsToIndex(const history::URLRow& row, | |
| 272 RowWordStarts* word_starts, | |
| 273 const std::string& languages); | |
| 274 | |
| 275 // Given a single word in |uni_word|, adds a reference for the containing | |
| 276 // history item identified by |history_id| to the index. | |
| 277 void AddWordToIndex(const base::string16& uni_word, HistoryID history_id); | |
| 278 | |
| 279 // Creates a new entry in the word/history map for |word_id| and add | |
| 280 // |history_id| as the initial element of the word's set. | |
| 281 void AddWordHistory(const base::string16& uni_word, HistoryID history_id); | |
| 282 | |
| 283 // Updates an existing entry in the word/history index by adding the | |
| 284 // |history_id| to set for |word_id| in the word_id_history_map_. | |
| 285 void UpdateWordHistory(WordID word_id, HistoryID history_id); | |
| 286 | |
| 287 // Adds |word_id| to |history_id|'s entry in the history/word map, | |
| 288 // creating a new entry if one does not already exist. | |
| 289 void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id); | |
| 290 | |
| 291 // Removes |row| and all associated words and characters from the index. | |
| 292 void RemoveRowFromIndex(const history::URLRow& row); | |
| 293 | |
| 294 // Removes all words and characters associated with |row| from the index. | |
| 295 void RemoveRowWordsFromIndex(const history::URLRow& row); | |
| 296 | |
| 297 // Clears |used_| for each item in the search term cache. | |
| 298 void ResetSearchTermCache(); | |
| 299 | |
| 300 // Caches the index private data and writes the cache file to the profile | |
| 301 // directory. Called by WritePrivateDataToCacheFileTask. | |
| 302 bool SaveToFile(const base::FilePath& file_path); | |
| 303 | |
| 304 // Encode a data structure into the protobuf |cache|. | |
| 305 void SavePrivateData( | |
| 306 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; | |
| 307 void SaveWordList( | |
| 308 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; | |
| 309 void SaveWordMap(in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; | |
| 310 void SaveCharWordMap( | |
| 311 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; | |
| 312 void SaveWordIDHistoryMap( | |
| 313 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; | |
| 314 void SaveHistoryInfoMap( | |
| 315 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; | |
| 316 void SaveWordStartsMap( | |
| 317 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; | |
| 318 | |
| 319 // Decode a data structure from the protobuf |cache|. Return false if there | |
| 320 // is any kind of failure. |languages| will be used to break URLs and page | |
| 321 // titles into words | |
| 322 bool RestorePrivateData( | |
| 323 const in_memory_url_index::InMemoryURLIndexCacheItem& cache, | |
| 324 const std::string& languages); | |
| 325 bool RestoreWordList( | |
| 326 const in_memory_url_index::InMemoryURLIndexCacheItem& cache); | |
| 327 bool RestoreWordMap( | |
| 328 const in_memory_url_index::InMemoryURLIndexCacheItem& cache); | |
| 329 bool RestoreCharWordMap( | |
| 330 const in_memory_url_index::InMemoryURLIndexCacheItem& cache); | |
| 331 bool RestoreWordIDHistoryMap( | |
| 332 const in_memory_url_index::InMemoryURLIndexCacheItem& cache); | |
| 333 bool RestoreHistoryInfoMap( | |
| 334 const in_memory_url_index::InMemoryURLIndexCacheItem& cache); | |
| 335 bool RestoreWordStartsMap( | |
| 336 const in_memory_url_index::InMemoryURLIndexCacheItem& cache, | |
| 337 const std::string& languages); | |
| 338 | |
| 339 // Determines if |gurl| has a whitelisted scheme and returns true if so. | |
| 340 static bool URLSchemeIsWhitelisted(const GURL& gurl, | |
| 341 const std::set<std::string>& whitelist); | |
| 342 | |
| 343 // Cache of search terms. | |
| 344 SearchTermCacheMap search_term_cache_; | |
| 345 | |
| 346 // Start of data members that are cached ------------------------------------- | |
| 347 | |
| 348 // The version of the cache file most recently used to restore this instance | |
| 349 // of the private data. If the private data was rebuilt from the history | |
| 350 // database this will be 0. | |
| 351 int restored_cache_version_; | |
| 352 | |
| 353 // The last time the data was rebuilt from the history database. | |
| 354 base::Time last_time_rebuilt_from_history_; | |
| 355 | |
| 356 // A list of all of indexed words. The index of a word in this list is the | |
| 357 // ID of the word in the word_map_. It reduces the memory overhead by | |
| 358 // replacing a potentially long and repeated string with a simple index. | |
| 359 String16Vector word_list_; | |
| 360 | |
| 361 // A list of available words slots in |word_list_|. An available word slot | |
| 362 // is the index of a unused word in word_list_ vector, also referred to as | |
| 363 // a WordID. As URL visits are added or modified new words may be added to | |
| 364 // the index, in which case any available words are used, if any, and then | |
| 365 // words are added to the end of the word_list_. When URL visits are | |
| 366 // modified or deleted old words may be removed from the index, in which | |
| 367 // case the slots for those words are added to available_words_ for resuse | |
| 368 // by future URL updates. | |
| 369 WordIDSet available_words_; | |
| 370 | |
| 371 // A one-to-one mapping from the a word string to its slot number (i.e. | |
| 372 // WordID) in the |word_list_|. | |
| 373 WordMap word_map_; | |
| 374 | |
| 375 // A one-to-many mapping from a single character to all WordIDs of words | |
| 376 // containing that character. | |
| 377 CharWordIDMap char_word_map_; | |
| 378 | |
| 379 // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as | |
| 380 // used in the history database) of history items in which the word occurs. | |
| 381 WordIDHistoryMap word_id_history_map_; | |
| 382 | |
| 383 // A one-to-many mapping from a HistoryID to all WordIDs of words that occur | |
| 384 // in the URL and/or page title of the history item referenced by that | |
| 385 // HistoryID. | |
| 386 HistoryIDWordMap history_id_word_map_; | |
| 387 | |
| 388 // A one-to-one mapping from HistoryID to the history item data governing | |
| 389 // index inclusion and relevance scoring. | |
| 390 HistoryInfoMap history_info_map_; | |
| 391 | |
| 392 // A one-to-one mapping from HistoryID to the word starts detected in each | |
| 393 // item's URL and page title. | |
| 394 WordStartsMap word_starts_map_; | |
| 395 | |
| 396 // End of data members that are cached --------------------------------------- | |
| 397 | |
| 398 // For unit testing only. Specifies the version of the cache file to be saved. | |
| 399 // Used only for testing upgrading of an older version of the cache upon | |
| 400 // restore. | |
| 401 int saved_cache_version_; | |
| 402 | |
| 403 // Used for unit testing only. Records the number of candidate history items | |
| 404 // at three stages in the index searching process. | |
| 405 size_t pre_filter_item_count_; // After word index is queried. | |
| 406 size_t post_filter_item_count_; // After trimming large result set. | |
| 407 size_t post_scoring_item_count_; // After performing final filter/scoring. | |
| 408 }; | |
| 409 | |
| 410 #endif // CHROME_BROWSER_AUTOCOMPLETE_URL_INDEX_PRIVATE_DATA_H_ | |
| OLD | NEW |