OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #ifndef CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_ | |
6 #define CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_ | |
7 | |
8 #include <set> | |
9 #include <string> | |
10 | |
11 #include "base/files/file_path.h" | |
12 #include "base/gtest_prod_util.h" | |
13 #include "base/memory/ref_counted.h" | |
14 #include "chrome/browser/history/history_service.h" | |
15 #include "components/history/core/browser/in_memory_url_index_cache.pb.h" | |
16 #include "components/history/core/browser/in_memory_url_index_types.h" | |
17 #include "components/history/core/browser/scored_history_match.h" | |
18 | |
19 class HistoryQuickProviderTest; | |
20 | |
21 namespace in_memory_url_index { | |
22 class InMemoryURLIndexCacheItem; | |
23 } | |
24 | |
25 namespace history { | |
26 | |
27 namespace imui = in_memory_url_index; | |
28 | |
29 class HistoryDatabase; | |
30 class InMemoryURLIndex; | |
31 class RefCountedBool; | |
32 | |
33 // Current version of the cache file. | |
34 static const int kCurrentCacheFileVersion = 5; | |
35 | |
36 // A structure private to InMemoryURLIndex describing its internal data and | |
37 // providing for restoring, rebuilding and updating that internal data. As | |
38 // this class is for exclusive use by the InMemoryURLIndex class there should | |
39 // be no calls from any other class. | |
40 // | |
41 // All public member functions are called on the main thread unless otherwise | |
42 // annotated. | |
43 class URLIndexPrivateData | |
44 : public base::RefCountedThreadSafe<URLIndexPrivateData> { | |
45 public: | |
46 URLIndexPrivateData(); | |
47 | |
48 // Given a base::string16 in |term_string|, scans the history index and | |
49 // returns a vector with all scored, matching history items. The | |
50 // |term_string| is broken down into individual terms (words), each of which | |
51 // must occur in the candidate history item's URL or page title for the item | |
52 // to qualify; however, the terms do not necessarily have to be adjacent. We | |
53 // also allow breaking |term_string| at |cursor_position| (if | |
54 // set). Once we have a set of candidates, they are filtered to ensure | |
55 // that all |term_string| terms, as separated by whitespace and the | |
56 // cursor (if set), occur within the candidate's URL or page title. | |
57 // Scores are then calculated on no more than |kItemsToScoreLimit| | |
58 // candidates, as the scoring of such a large number of candidates may | |
59 // cause perceptible typing response delays in the omnibox. This is | |
60 // likely to occur for short omnibox terms such as 'h' and 'w' which | |
61 // will be found in nearly all history candidates. Results are sorted by | |
62 // descending score. The full results set (i.e. beyond the | |
63 // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls | |
64 // to this function. |languages| is used to help parse/format the URLs in the | |
65 // history index. In total, |max_matches| of items will be returned in the | |
66 // |ScoredHistoryMatches| vector. | |
67 ScoredHistoryMatches HistoryItemsForTerms( | |
68 base::string16 term_string, | |
69 size_t cursor_position, | |
70 size_t max_matches, | |
71 const std::string& languages, | |
72 const ScoredHistoryMatch::Builder& builder); | |
73 | |
74 // Adds the history item in |row| to the index if it does not already already | |
75 // exist and it meets the minimum 'quick' criteria. If the row already exists | |
76 // in the index then the index will be updated if the row still meets the | |
77 // criteria, otherwise the row will be removed from the index. Returns true | |
78 // if the index was actually updated. |languages| gives a list of language | |
79 // encodings by which the URLs and page titles are broken down into words and | |
80 // characters. |scheme_whitelist| is used to filter non-qualifying schemes. | |
81 // |history_service| is used to schedule an update to the recent visits | |
82 // component of this URL's entry in the index. | |
83 bool UpdateURL(HistoryService* history_service, | |
84 const URLRow& row, | |
85 const std::string& languages, | |
86 const std::set<std::string>& scheme_whitelist, | |
87 base::CancelableTaskTracker* tracker); | |
88 | |
89 // Updates the entry for |url_id| in the index, replacing its | |
90 // recent visits information with |recent_visits|. If |url_id| | |
91 // is not in the index, does nothing. | |
92 void UpdateRecentVisits(URLID url_id, | |
93 const VisitVector& recent_visits); | |
94 | |
95 // Using |history_service| schedules an update (using the historyDB | |
96 // thread) for the recent visits information for |url_id|. Unless | |
97 // something unexpectedly goes wrong, UdpateRecentVisits() should | |
98 // eventually be called from a callback. | |
99 void ScheduleUpdateRecentVisits(HistoryService* history_service, | |
100 URLID url_id, | |
101 base::CancelableTaskTracker* tracker); | |
102 | |
103 // Deletes index data for the history item with the given |url|. | |
104 // The item may not have actually been indexed, which is the case if it did | |
105 // not previously meet minimum 'quick' criteria. Returns true if the index | |
106 // was actually updated. | |
107 bool DeleteURL(const GURL& url); | |
108 | |
109 // Constructs a new object by restoring its contents from the cache file | |
110 // at |path|. Returns the new URLIndexPrivateData which on success will | |
111 // contain the restored data but upon failure will be empty. |languages| | |
112 // is used to break URLs and page titles into words. This function | |
113 // should be run on the the file thread. | |
114 static scoped_refptr<URLIndexPrivateData> RestoreFromFile( | |
115 const base::FilePath& path, | |
116 const std::string& languages); | |
117 | |
118 // Constructs a new object by rebuilding its contents from the history | |
119 // database in |history_db|. Returns the new URLIndexPrivateData which on | |
120 // success will contain the rebuilt data but upon failure will be empty. | |
121 // |languages| gives a list of language encodings by which the URLs and page | |
122 // titles are broken down into words and characters. | |
123 static scoped_refptr<URLIndexPrivateData> RebuildFromHistory( | |
124 HistoryDatabase* history_db, | |
125 const std::string& languages, | |
126 const std::set<std::string>& scheme_whitelist); | |
127 | |
128 // Writes |private_data| as a cache file to |file_path| and returns success. | |
129 static bool WritePrivateDataToCacheFileTask( | |
130 scoped_refptr<URLIndexPrivateData> private_data, | |
131 const base::FilePath& file_path); | |
132 | |
133 // Creates a copy of ourself. | |
134 scoped_refptr<URLIndexPrivateData> Duplicate() const; | |
135 | |
136 // Returns true if there is no data in the index. | |
137 bool Empty() const; | |
138 | |
139 // Initializes all index data members in preparation for restoring the index | |
140 // from the cache or a complete rebuild from the history database. | |
141 void Clear(); | |
142 | |
143 private: | |
144 friend class base::RefCountedThreadSafe<URLIndexPrivateData>; | |
145 ~URLIndexPrivateData(); | |
146 | |
147 friend class AddHistoryMatch; | |
148 friend class ::HistoryQuickProviderTest; | |
149 friend class InMemoryURLIndexTest; | |
150 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); | |
151 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet); | |
152 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory); | |
153 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld); | |
154 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring); | |
155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch); | |
156 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching); | |
157 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs); | |
158 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization); | |
159 | |
160 // Support caching of term results so that we can optimize searches which | |
161 // build upon a previous search. Each entry in this map represents one | |
162 // search term from the most recent search. For example, if the user had | |
163 // typed "google blog trans" and then typed an additional 'l' (at the end, | |
164 // of course) then there would be four items in the cache: 'blog', 'google', | |
165 // 'trans', and 'transl'. All would be marked as being in use except for the | |
166 // 'trans' item; its cached data would have been used when optimizing the | |
167 // construction of the search results candidates for 'transl' but then would | |
168 // no longer needed. | |
169 // | |
170 // Items stored in the search term cache. If a search term exactly matches one | |
171 // in the cache then we can quickly supply the proper |history_id_set_| (and | |
172 // marking the cache item as being |used_|. If we find a prefix for a search | |
173 // term in the cache (which is very likely to occur as the user types each | |
174 // term into the omnibox) then we can short-circuit the index search for those | |
175 // characters in the prefix by returning the |word_id_set|. In that case we do | |
176 // not mark the item as being |used_|. | |
177 struct SearchTermCacheItem { | |
178 SearchTermCacheItem(const WordIDSet& word_id_set, | |
179 const HistoryIDSet& history_id_set); | |
180 // Creates a cache item for a term which has no results. | |
181 SearchTermCacheItem(); | |
182 | |
183 ~SearchTermCacheItem(); | |
184 | |
185 WordIDSet word_id_set_; | |
186 HistoryIDSet history_id_set_; | |
187 bool used_; // True if this item has been used for the current term search. | |
188 }; | |
189 typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap; | |
190 | |
191 // A helper class which performs the final filter on each candidate | |
192 // history URL match, inserting accepted matches into |scored_matches_|. | |
193 class AddHistoryMatch : public std::unary_function<HistoryID, void> { | |
194 public: | |
195 AddHistoryMatch(const URLIndexPrivateData& private_data, | |
196 const std::string& languages, | |
197 const base::string16& lower_string, | |
198 const String16Vector& lower_terms, | |
199 const base::Time now, | |
200 const ScoredHistoryMatch::Builder& builder); | |
201 ~AddHistoryMatch(); | |
202 | |
203 void operator()(const HistoryID history_id); | |
204 | |
205 ScoredHistoryMatches ScoredMatches() const { return scored_matches_; } | |
206 | |
207 private: | |
208 const URLIndexPrivateData& private_data_; | |
209 const std::string& languages_; | |
210 const ScoredHistoryMatch::Builder& builder_; | |
211 ScoredHistoryMatches scored_matches_; | |
212 const base::string16& lower_string_; | |
213 const String16Vector& lower_terms_; | |
214 WordStarts lower_terms_to_word_starts_offsets_; | |
215 const base::Time now_; | |
216 }; | |
217 | |
218 // A helper predicate class used to filter excess history items when the | |
219 // candidate results set is too large. | |
220 class HistoryItemFactorGreater | |
221 : public std::binary_function<HistoryID, HistoryID, void> { | |
222 public: | |
223 explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map); | |
224 ~HistoryItemFactorGreater(); | |
225 | |
226 bool operator()(const HistoryID h1, const HistoryID h2); | |
227 | |
228 private: | |
229 const history::HistoryInfoMap& history_info_map_; | |
230 }; | |
231 | |
232 // URL History indexing support functions. | |
233 | |
234 // Composes a set of history item IDs by intersecting the set for each word | |
235 // in |unsorted_words|. | |
236 HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words); | |
237 | |
238 // Helper function to HistoryIDSetFromWords which composes a set of history | |
239 // ids for the given term given in |term|. | |
240 HistoryIDSet HistoryIDsForTerm(const base::string16& term); | |
241 | |
242 // Given a set of Char16s, finds words containing those characters. | |
243 WordIDSet WordIDSetForTermChars(const Char16Set& term_chars); | |
244 | |
245 // Indexes one URL history item as described by |row|. Returns true if the | |
246 // row was actually indexed. |languages| gives a list of language encodings by | |
247 // which the URLs and page titles are broken down into words and characters. | |
248 // |scheme_whitelist| is used to filter non-qualifying schemes. If | |
249 // |history_db| is not NULL then this function uses the history database | |
250 // synchronously to get the URL's recent visits information. This mode should | |
251 // only be used on the historyDB thread. If |history_db| is NULL, then | |
252 // this function uses |history_service| to schedule a task on the | |
253 // historyDB thread to fetch and update the recent visits | |
254 // information. | |
255 bool IndexRow(HistoryDatabase* history_db, | |
256 HistoryService* history_service, | |
257 const URLRow& row, | |
258 const std::string& languages, | |
259 const std::set<std::string>& scheme_whitelist, | |
260 base::CancelableTaskTracker* tracker); | |
261 | |
262 // Parses and indexes the words in the URL and page title of |row| and | |
263 // calculate the word starts in each, saving the starts in |word_starts|. | |
264 // |languages| gives a list of language encodings by which the URLs and page | |
265 // titles are broken down into words and characters. | |
266 void AddRowWordsToIndex(const URLRow& row, | |
267 RowWordStarts* word_starts, | |
268 const std::string& languages); | |
269 | |
270 // Given a single word in |uni_word|, adds a reference for the containing | |
271 // history item identified by |history_id| to the index. | |
272 void AddWordToIndex(const base::string16& uni_word, HistoryID history_id); | |
273 | |
274 // Creates a new entry in the word/history map for |word_id| and add | |
275 // |history_id| as the initial element of the word's set. | |
276 void AddWordHistory(const base::string16& uni_word, HistoryID history_id); | |
277 | |
278 // Updates an existing entry in the word/history index by adding the | |
279 // |history_id| to set for |word_id| in the word_id_history_map_. | |
280 void UpdateWordHistory(WordID word_id, HistoryID history_id); | |
281 | |
282 // Adds |word_id| to |history_id|'s entry in the history/word map, | |
283 // creating a new entry if one does not already exist. | |
284 void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id); | |
285 | |
286 // Removes |row| and all associated words and characters from the index. | |
287 void RemoveRowFromIndex(const URLRow& row); | |
288 | |
289 // Removes all words and characters associated with |row| from the index. | |
290 void RemoveRowWordsFromIndex(const URLRow& row); | |
291 | |
292 // Clears |used_| for each item in the search term cache. | |
293 void ResetSearchTermCache(); | |
294 | |
295 // Caches the index private data and writes the cache file to the profile | |
296 // directory. Called by WritePrivateDataToCacheFileTask. | |
297 bool SaveToFile(const base::FilePath& file_path); | |
298 | |
299 // Encode a data structure into the protobuf |cache|. | |
300 void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const; | |
301 void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const; | |
302 void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const; | |
303 void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const; | |
304 void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const; | |
305 void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const; | |
306 void SaveWordStartsMap(imui::InMemoryURLIndexCacheItem* cache) const; | |
307 | |
308 // Decode a data structure from the protobuf |cache|. Return false if there | |
309 // is any kind of failure. |languages| will be used to break URLs and page | |
310 // titles into words | |
311 bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache, | |
312 const std::string& languages); | |
313 bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache); | |
314 bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache); | |
315 bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache); | |
316 bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache); | |
317 bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache); | |
318 bool RestoreWordStartsMap(const imui::InMemoryURLIndexCacheItem& cache, | |
319 const std::string& languages); | |
320 | |
321 // Determines if |gurl| has a whitelisted scheme and returns true if so. | |
322 static bool URLSchemeIsWhitelisted(const GURL& gurl, | |
323 const std::set<std::string>& whitelist); | |
324 | |
325 // Cache of search terms. | |
326 SearchTermCacheMap search_term_cache_; | |
327 | |
328 // Start of data members that are cached ------------------------------------- | |
329 | |
330 // The version of the cache file most recently used to restore this instance | |
331 // of the private data. If the private data was rebuilt from the history | |
332 // database this will be 0. | |
333 int restored_cache_version_; | |
334 | |
335 // The last time the data was rebuilt from the history database. | |
336 base::Time last_time_rebuilt_from_history_; | |
337 | |
338 // A list of all of indexed words. The index of a word in this list is the | |
339 // ID of the word in the word_map_. It reduces the memory overhead by | |
340 // replacing a potentially long and repeated string with a simple index. | |
341 String16Vector word_list_; | |
342 | |
343 // A list of available words slots in |word_list_|. An available word slot | |
344 // is the index of a unused word in word_list_ vector, also referred to as | |
345 // a WordID. As URL visits are added or modified new words may be added to | |
346 // the index, in which case any available words are used, if any, and then | |
347 // words are added to the end of the word_list_. When URL visits are | |
348 // modified or deleted old words may be removed from the index, in which | |
349 // case the slots for those words are added to available_words_ for resuse | |
350 // by future URL updates. | |
351 WordIDSet available_words_; | |
352 | |
353 // A one-to-one mapping from the a word string to its slot number (i.e. | |
354 // WordID) in the |word_list_|. | |
355 WordMap word_map_; | |
356 | |
357 // A one-to-many mapping from a single character to all WordIDs of words | |
358 // containing that character. | |
359 CharWordIDMap char_word_map_; | |
360 | |
361 // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as | |
362 // used in the history database) of history items in which the word occurs. | |
363 WordIDHistoryMap word_id_history_map_; | |
364 | |
365 // A one-to-many mapping from a HistoryID to all WordIDs of words that occur | |
366 // in the URL and/or page title of the history item referenced by that | |
367 // HistoryID. | |
368 HistoryIDWordMap history_id_word_map_; | |
369 | |
370 // A one-to-one mapping from HistoryID to the history item data governing | |
371 // index inclusion and relevance scoring. | |
372 HistoryInfoMap history_info_map_; | |
373 | |
374 // A one-to-one mapping from HistoryID to the word starts detected in each | |
375 // item's URL and page title. | |
376 WordStartsMap word_starts_map_; | |
377 | |
378 // End of data members that are cached --------------------------------------- | |
379 | |
380 // For unit testing only. Specifies the version of the cache file to be saved. | |
381 // Used only for testing upgrading of an older version of the cache upon | |
382 // restore. | |
383 int saved_cache_version_; | |
384 | |
385 // Used for unit testing only. Records the number of candidate history items | |
386 // at three stages in the index searching process. | |
387 size_t pre_filter_item_count_; // After word index is queried. | |
388 size_t post_filter_item_count_; // After trimming large result set. | |
389 size_t post_scoring_item_count_; // After performing final filter/scoring. | |
390 }; | |
391 | |
392 } // namespace history | |
393 | |
394 #endif // CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_ | |
OLD | NEW |