Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(10)

Side by Side Diff: chrome/browser/history/url_index_private_data.h

Issue 963823003: Move InMemoryURLIndex into chrome/browser/autocomplete (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@shortcut-database
Patch Set: Fixing win_chromium_x64_rel_ng build Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
6 #define CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
7
8 #include <set>
9 #include <string>
10
11 #include "base/files/file_path.h"
12 #include "base/gtest_prod_util.h"
13 #include "base/memory/ref_counted.h"
14 #include "chrome/browser/history/history_service.h"
15 #include "components/history/core/browser/in_memory_url_index_cache.pb.h"
16 #include "components/history/core/browser/in_memory_url_index_types.h"
17 #include "components/history/core/browser/scored_history_match.h"
18
19 class HistoryQuickProviderTest;
20
21 namespace in_memory_url_index {
22 class InMemoryURLIndexCacheItem;
23 }
24
25 namespace history {
26
27 namespace imui = in_memory_url_index;
28
29 class HistoryDatabase;
30 class InMemoryURLIndex;
31 class RefCountedBool;
32
33 // Current version of the cache file.
34 static const int kCurrentCacheFileVersion = 5;
35
36 // A structure private to InMemoryURLIndex describing its internal data and
37 // providing for restoring, rebuilding and updating that internal data. As
38 // this class is for exclusive use by the InMemoryURLIndex class there should
39 // be no calls from any other class.
40 //
41 // All public member functions are called on the main thread unless otherwise
42 // annotated.
43 class URLIndexPrivateData
44 : public base::RefCountedThreadSafe<URLIndexPrivateData> {
45 public:
46 URLIndexPrivateData();
47
48 // Given a base::string16 in |term_string|, scans the history index and
49 // returns a vector with all scored, matching history items. The
50 // |term_string| is broken down into individual terms (words), each of which
51 // must occur in the candidate history item's URL or page title for the item
52 // to qualify; however, the terms do not necessarily have to be adjacent. We
53 // also allow breaking |term_string| at |cursor_position| (if
54 // set). Once we have a set of candidates, they are filtered to ensure
55 // that all |term_string| terms, as separated by whitespace and the
56 // cursor (if set), occur within the candidate's URL or page title.
57 // Scores are then calculated on no more than |kItemsToScoreLimit|
58 // candidates, as the scoring of such a large number of candidates may
59 // cause perceptible typing response delays in the omnibox. This is
60 // likely to occur for short omnibox terms such as 'h' and 'w' which
61 // will be found in nearly all history candidates. Results are sorted by
62 // descending score. The full results set (i.e. beyond the
63 // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls
64 // to this function. |languages| is used to help parse/format the URLs in the
65 // history index. In total, |max_matches| of items will be returned in the
66 // |ScoredHistoryMatches| vector.
67 ScoredHistoryMatches HistoryItemsForTerms(
68 base::string16 term_string,
69 size_t cursor_position,
70 size_t max_matches,
71 const std::string& languages,
72 const ScoredHistoryMatch::Builder& builder);
73
74 // Adds the history item in |row| to the index if it does not already already
75 // exist and it meets the minimum 'quick' criteria. If the row already exists
76 // in the index then the index will be updated if the row still meets the
77 // criteria, otherwise the row will be removed from the index. Returns true
78 // if the index was actually updated. |languages| gives a list of language
79 // encodings by which the URLs and page titles are broken down into words and
80 // characters. |scheme_whitelist| is used to filter non-qualifying schemes.
81 // |history_service| is used to schedule an update to the recent visits
82 // component of this URL's entry in the index.
83 bool UpdateURL(HistoryService* history_service,
84 const URLRow& row,
85 const std::string& languages,
86 const std::set<std::string>& scheme_whitelist,
87 base::CancelableTaskTracker* tracker);
88
89 // Updates the entry for |url_id| in the index, replacing its
90 // recent visits information with |recent_visits|. If |url_id|
91 // is not in the index, does nothing.
92 void UpdateRecentVisits(URLID url_id,
93 const VisitVector& recent_visits);
94
95 // Using |history_service| schedules an update (using the historyDB
96 // thread) for the recent visits information for |url_id|. Unless
97 // something unexpectedly goes wrong, UdpateRecentVisits() should
98 // eventually be called from a callback.
99 void ScheduleUpdateRecentVisits(HistoryService* history_service,
100 URLID url_id,
101 base::CancelableTaskTracker* tracker);
102
103 // Deletes index data for the history item with the given |url|.
104 // The item may not have actually been indexed, which is the case if it did
105 // not previously meet minimum 'quick' criteria. Returns true if the index
106 // was actually updated.
107 bool DeleteURL(const GURL& url);
108
109 // Constructs a new object by restoring its contents from the cache file
110 // at |path|. Returns the new URLIndexPrivateData which on success will
111 // contain the restored data but upon failure will be empty. |languages|
112 // is used to break URLs and page titles into words. This function
113 // should be run on the the file thread.
114 static scoped_refptr<URLIndexPrivateData> RestoreFromFile(
115 const base::FilePath& path,
116 const std::string& languages);
117
118 // Constructs a new object by rebuilding its contents from the history
119 // database in |history_db|. Returns the new URLIndexPrivateData which on
120 // success will contain the rebuilt data but upon failure will be empty.
121 // |languages| gives a list of language encodings by which the URLs and page
122 // titles are broken down into words and characters.
123 static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(
124 HistoryDatabase* history_db,
125 const std::string& languages,
126 const std::set<std::string>& scheme_whitelist);
127
128 // Writes |private_data| as a cache file to |file_path| and returns success.
129 static bool WritePrivateDataToCacheFileTask(
130 scoped_refptr<URLIndexPrivateData> private_data,
131 const base::FilePath& file_path);
132
133 // Creates a copy of ourself.
134 scoped_refptr<URLIndexPrivateData> Duplicate() const;
135
136 // Returns true if there is no data in the index.
137 bool Empty() const;
138
139 // Initializes all index data members in preparation for restoring the index
140 // from the cache or a complete rebuild from the history database.
141 void Clear();
142
143 private:
144 friend class base::RefCountedThreadSafe<URLIndexPrivateData>;
145 ~URLIndexPrivateData();
146
147 friend class AddHistoryMatch;
148 friend class ::HistoryQuickProviderTest;
149 friend class InMemoryURLIndexTest;
150 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
151 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet);
152 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory);
153 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld);
154 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
156 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
157 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs);
158 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);
159
160 // Support caching of term results so that we can optimize searches which
161 // build upon a previous search. Each entry in this map represents one
162 // search term from the most recent search. For example, if the user had
163 // typed "google blog trans" and then typed an additional 'l' (at the end,
164 // of course) then there would be four items in the cache: 'blog', 'google',
165 // 'trans', and 'transl'. All would be marked as being in use except for the
166 // 'trans' item; its cached data would have been used when optimizing the
167 // construction of the search results candidates for 'transl' but then would
168 // no longer needed.
169 //
170 // Items stored in the search term cache. If a search term exactly matches one
171 // in the cache then we can quickly supply the proper |history_id_set_| (and
172 // marking the cache item as being |used_|. If we find a prefix for a search
173 // term in the cache (which is very likely to occur as the user types each
174 // term into the omnibox) then we can short-circuit the index search for those
175 // characters in the prefix by returning the |word_id_set|. In that case we do
176 // not mark the item as being |used_|.
177 struct SearchTermCacheItem {
178 SearchTermCacheItem(const WordIDSet& word_id_set,
179 const HistoryIDSet& history_id_set);
180 // Creates a cache item for a term which has no results.
181 SearchTermCacheItem();
182
183 ~SearchTermCacheItem();
184
185 WordIDSet word_id_set_;
186 HistoryIDSet history_id_set_;
187 bool used_; // True if this item has been used for the current term search.
188 };
189 typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap;
190
191 // A helper class which performs the final filter on each candidate
192 // history URL match, inserting accepted matches into |scored_matches_|.
193 class AddHistoryMatch : public std::unary_function<HistoryID, void> {
194 public:
195 AddHistoryMatch(const URLIndexPrivateData& private_data,
196 const std::string& languages,
197 const base::string16& lower_string,
198 const String16Vector& lower_terms,
199 const base::Time now,
200 const ScoredHistoryMatch::Builder& builder);
201 ~AddHistoryMatch();
202
203 void operator()(const HistoryID history_id);
204
205 ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }
206
207 private:
208 const URLIndexPrivateData& private_data_;
209 const std::string& languages_;
210 const ScoredHistoryMatch::Builder& builder_;
211 ScoredHistoryMatches scored_matches_;
212 const base::string16& lower_string_;
213 const String16Vector& lower_terms_;
214 WordStarts lower_terms_to_word_starts_offsets_;
215 const base::Time now_;
216 };
217
218 // A helper predicate class used to filter excess history items when the
219 // candidate results set is too large.
220 class HistoryItemFactorGreater
221 : public std::binary_function<HistoryID, HistoryID, void> {
222 public:
223 explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map);
224 ~HistoryItemFactorGreater();
225
226 bool operator()(const HistoryID h1, const HistoryID h2);
227
228 private:
229 const history::HistoryInfoMap& history_info_map_;
230 };
231
232 // URL History indexing support functions.
233
234 // Composes a set of history item IDs by intersecting the set for each word
235 // in |unsorted_words|.
236 HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words);
237
238 // Helper function to HistoryIDSetFromWords which composes a set of history
239 // ids for the given term given in |term|.
240 HistoryIDSet HistoryIDsForTerm(const base::string16& term);
241
242 // Given a set of Char16s, finds words containing those characters.
243 WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);
244
245 // Indexes one URL history item as described by |row|. Returns true if the
246 // row was actually indexed. |languages| gives a list of language encodings by
247 // which the URLs and page titles are broken down into words and characters.
248 // |scheme_whitelist| is used to filter non-qualifying schemes. If
249 // |history_db| is not NULL then this function uses the history database
250 // synchronously to get the URL's recent visits information. This mode should
251 // only be used on the historyDB thread. If |history_db| is NULL, then
252 // this function uses |history_service| to schedule a task on the
253 // historyDB thread to fetch and update the recent visits
254 // information.
255 bool IndexRow(HistoryDatabase* history_db,
256 HistoryService* history_service,
257 const URLRow& row,
258 const std::string& languages,
259 const std::set<std::string>& scheme_whitelist,
260 base::CancelableTaskTracker* tracker);
261
262 // Parses and indexes the words in the URL and page title of |row| and
263 // calculate the word starts in each, saving the starts in |word_starts|.
264 // |languages| gives a list of language encodings by which the URLs and page
265 // titles are broken down into words and characters.
266 void AddRowWordsToIndex(const URLRow& row,
267 RowWordStarts* word_starts,
268 const std::string& languages);
269
270 // Given a single word in |uni_word|, adds a reference for the containing
271 // history item identified by |history_id| to the index.
272 void AddWordToIndex(const base::string16& uni_word, HistoryID history_id);
273
274 // Creates a new entry in the word/history map for |word_id| and add
275 // |history_id| as the initial element of the word's set.
276 void AddWordHistory(const base::string16& uni_word, HistoryID history_id);
277
278 // Updates an existing entry in the word/history index by adding the
279 // |history_id| to set for |word_id| in the word_id_history_map_.
280 void UpdateWordHistory(WordID word_id, HistoryID history_id);
281
282 // Adds |word_id| to |history_id|'s entry in the history/word map,
283 // creating a new entry if one does not already exist.
284 void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id);
285
286 // Removes |row| and all associated words and characters from the index.
287 void RemoveRowFromIndex(const URLRow& row);
288
289 // Removes all words and characters associated with |row| from the index.
290 void RemoveRowWordsFromIndex(const URLRow& row);
291
292 // Clears |used_| for each item in the search term cache.
293 void ResetSearchTermCache();
294
295 // Caches the index private data and writes the cache file to the profile
296 // directory. Called by WritePrivateDataToCacheFileTask.
297 bool SaveToFile(const base::FilePath& file_path);
298
299 // Encode a data structure into the protobuf |cache|.
300 void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const;
301 void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const;
302 void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
303 void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
304 void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const;
305 void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const;
306 void SaveWordStartsMap(imui::InMemoryURLIndexCacheItem* cache) const;
307
308 // Decode a data structure from the protobuf |cache|. Return false if there
309 // is any kind of failure. |languages| will be used to break URLs and page
310 // titles into words
311 bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache,
312 const std::string& languages);
313 bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache);
314 bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache);
315 bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache);
316 bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache);
317 bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache);
318 bool RestoreWordStartsMap(const imui::InMemoryURLIndexCacheItem& cache,
319 const std::string& languages);
320
321 // Determines if |gurl| has a whitelisted scheme and returns true if so.
322 static bool URLSchemeIsWhitelisted(const GURL& gurl,
323 const std::set<std::string>& whitelist);
324
325 // Cache of search terms.
326 SearchTermCacheMap search_term_cache_;
327
328 // Start of data members that are cached -------------------------------------
329
330 // The version of the cache file most recently used to restore this instance
331 // of the private data. If the private data was rebuilt from the history
332 // database this will be 0.
333 int restored_cache_version_;
334
335 // The last time the data was rebuilt from the history database.
336 base::Time last_time_rebuilt_from_history_;
337
338 // A list of all of indexed words. The index of a word in this list is the
339 // ID of the word in the word_map_. It reduces the memory overhead by
340 // replacing a potentially long and repeated string with a simple index.
341 String16Vector word_list_;
342
343 // A list of available words slots in |word_list_|. An available word slot
344 // is the index of a unused word in word_list_ vector, also referred to as
345 // a WordID. As URL visits are added or modified new words may be added to
346 // the index, in which case any available words are used, if any, and then
347 // words are added to the end of the word_list_. When URL visits are
348 // modified or deleted old words may be removed from the index, in which
349 // case the slots for those words are added to available_words_ for resuse
350 // by future URL updates.
351 WordIDSet available_words_;
352
353 // A one-to-one mapping from the a word string to its slot number (i.e.
354 // WordID) in the |word_list_|.
355 WordMap word_map_;
356
357 // A one-to-many mapping from a single character to all WordIDs of words
358 // containing that character.
359 CharWordIDMap char_word_map_;
360
361 // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as
362 // used in the history database) of history items in which the word occurs.
363 WordIDHistoryMap word_id_history_map_;
364
365 // A one-to-many mapping from a HistoryID to all WordIDs of words that occur
366 // in the URL and/or page title of the history item referenced by that
367 // HistoryID.
368 HistoryIDWordMap history_id_word_map_;
369
370 // A one-to-one mapping from HistoryID to the history item data governing
371 // index inclusion and relevance scoring.
372 HistoryInfoMap history_info_map_;
373
374 // A one-to-one mapping from HistoryID to the word starts detected in each
375 // item's URL and page title.
376 WordStartsMap word_starts_map_;
377
378 // End of data members that are cached ---------------------------------------
379
380 // For unit testing only. Specifies the version of the cache file to be saved.
381 // Used only for testing upgrading of an older version of the cache upon
382 // restore.
383 int saved_cache_version_;
384
385 // Used for unit testing only. Records the number of candidate history items
386 // at three stages in the index searching process.
387 size_t pre_filter_item_count_; // After word index is queried.
388 size_t post_filter_item_count_; // After trimming large result set.
389 size_t post_scoring_item_count_; // After performing final filter/scoring.
390 };
391
392 } // namespace history
393
394 #endif // CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
OLDNEW
« no previous file with comments | « chrome/browser/history/in_memory_url_index_unittest.cc ('k') | chrome/browser/history/url_index_private_data.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698