chrome/browser/autocomplete/url_index_private_data.h - Issue 1182113009: Componentize HistoryQuickProvider and its supporting cast

Side by Side Diff: chrome/browser/autocomplete/url_index_private_data.h

Issue 1182113009: Componentize HistoryQuickProvider and its supporting cast (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Rebase Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #ifndef CHROME_BROWSER_AUTOCOMPLETE_URL_INDEX_PRIVATE_DATA_H_

6 #define CHROME_BROWSER_AUTOCOMPLETE_URL_INDEX_PRIVATE_DATA_H_

7

8 #include <set>

9 #include <string>

10

11 #include "base/files/file_path.h"

12 #include "base/gtest_prod_util.h"

13 #include "base/memory/ref_counted.h"

14 #include "chrome/browser/autocomplete/in_memory_url_index_cache.pb.h"

15 #include "components/history/core/browser/history_service.h"

16 #include "components/omnibox/in_memory_url_index_types.h"

17 #include "components/omnibox/scored_history_match.h"

18

19 class HistoryQuickProviderTest;

20

21 namespace bookmarks {

22 class BookmarkModel;

23 }

24

25 namespace in_memory_url_index {

26 class InMemoryURLIndexCacheItem;

27 }

28

29 namespace history {

30 class HistoryDatabase;

31 class InMemoryURLIndex;

32 class RefCountedBool;

33 }

34

35 // Current version of the cache file.

36 static const int kCurrentCacheFileVersion = 5;

37

38 // A structure private to InMemoryURLIndex describing its internal data and

39 // providing for restoring, rebuilding and updating that internal data. As

40 // this class is for exclusive use by the InMemoryURLIndex class there should

41 // be no calls from any other class.

42 //

43 // All public member functions are called on the main thread unless otherwise

44 // annotated.

45 class URLIndexPrivateData

46 : public base::RefCountedThreadSafe<URLIndexPrivateData> {

47 public:

48 URLIndexPrivateData();

49

50 // Given a base::string16 in \|term_string\|, scans the history index and

51 // returns a vector with all scored, matching history items. The

52 // \|term_string\| is broken down into individual terms (words), each of which

53 // must occur in the candidate history item's URL or page title for the item

54 // to qualify; however, the terms do not necessarily have to be adjacent. We

55 // also allow breaking \|term_string\| at \|cursor_position\| (if

56 // set). Once we have a set of candidates, they are filtered to ensure

57 // that all \|term_string\| terms, as separated by whitespace and the

58 // cursor (if set), occur within the candidate's URL or page title.

59 // Scores are then calculated on no more than \|kItemsToScoreLimit\|

60 // candidates, as the scoring of such a large number of candidates may

61 // cause perceptible typing response delays in the omnibox. This is

62 // likely to occur for short omnibox terms such as 'h' and 'w' which

63 // will be found in nearly all history candidates. Results are sorted by

64 // descending score. The full results set (i.e. beyond the

65 // \|kItemsToScoreLimit\| limit) will be retained and used for subsequent calls

66 // to this function. \|languages\| is used to help parse/format the URLs in the

67 // history index. In total, \|max_matches\| of items will be returned in the

68 // \|ScoredHistoryMatches\| vector.

69 ScoredHistoryMatches HistoryItemsForTerms(

70 base::string16 term_string,

71 size_t cursor_position,

72 size_t max_matches,

73 const std::string& languages,

74 bookmarks::BookmarkModel* bookmark_model);

75

76 // Adds the history item in \|row\| to the index if it does not already already

77 // exist and it meets the minimum 'quick' criteria. If the row already exists

78 // in the index then the index will be updated if the row still meets the

79 // criteria, otherwise the row will be removed from the index. Returns true

80 // if the index was actually updated. \|languages\| gives a list of language

81 // encodings by which the URLs and page titles are broken down into words and

82 // characters. \|scheme_whitelist\| is used to filter non-qualifying schemes.

83 // \|history_service\| is used to schedule an update to the recent visits

84 // component of this URL's entry in the index.

85 bool UpdateURL(history::HistoryService* history_service,

86 const history::URLRow& row,

87 const std::string& languages,

88 const std::set<std::string>& scheme_whitelist,

89 base::CancelableTaskTracker* tracker);

90

91 // Updates the entry for \|url_id\| in the index, replacing its

92 // recent visits information with \|recent_visits\|. If \|url_id\|

93 // is not in the index, does nothing.

94 void UpdateRecentVisits(history::URLID url_id,

95 const history::VisitVector& recent_visits);

96

97 // Using \|history_service\| schedules an update (using the historyDB

98 // thread) for the recent visits information for \|url_id\|. Unless

99 // something unexpectedly goes wrong, UdpateRecentVisits() should

100 // eventually be called from a callback.

101 void ScheduleUpdateRecentVisits(history::HistoryService* history_service,

102 history::URLID url_id,

103 base::CancelableTaskTracker* tracker);

104

105 // Deletes index data for the history item with the given \|url\|.

106 // The item may not have actually been indexed, which is the case if it did

107 // not previously meet minimum 'quick' criteria. Returns true if the index

108 // was actually updated.

109 bool DeleteURL(const GURL& url);

110

111 // Constructs a new object by restoring its contents from the cache file

112 // at \|path\|. Returns the new URLIndexPrivateData which on success will

113 // contain the restored data but upon failure will be empty. \|languages\|

114 // is used to break URLs and page titles into words. This function

115 // should be run on the the file thread.

116 static scoped_refptr<URLIndexPrivateData> RestoreFromFile(

117 const base::FilePath& path,

118 const std::string& languages);

119

120 // Constructs a new object by rebuilding its contents from the history

121 // database in \|history_db\|. Returns the new URLIndexPrivateData which on

122 // success will contain the rebuilt data but upon failure will be empty.

123 // \|languages\| gives a list of language encodings by which the URLs and page

124 // titles are broken down into words and characters.

125 static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(

126 history::HistoryDatabase* history_db,

127 const std::string& languages,

128 const std::set<std::string>& scheme_whitelist);

129

130 // Writes \|private_data\| as a cache file to \|file_path\| and returns success.

131 static bool WritePrivateDataToCacheFileTask(

132 scoped_refptr<URLIndexPrivateData> private_data,

133 const base::FilePath& file_path);

134

135 // Creates a copy of ourself.

136 scoped_refptr<URLIndexPrivateData> Duplicate() const;

137

138 // Returns true if there is no data in the index.

139 bool Empty() const;

140

141 // Initializes all index data members in preparation for restoring the index

142 // from the cache or a complete rebuild from the history database.

143 void Clear();

144

145 private:

146 friend class base::RefCountedThreadSafe<URLIndexPrivateData>;

147 ~URLIndexPrivateData();

148

149 friend class AddHistoryMatch;

150 friend class ::HistoryQuickProviderTest;

151 friend class InMemoryURLIndexTest;

152 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, AddHistoryMatch);

153 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);

154 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet);

155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory);

156 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld);

157 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);

158 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);

159 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);

160 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs);

161 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);

162

163 // Support caching of term results so that we can optimize searches which

164 // build upon a previous search. Each entry in this map represents one

165 // search term from the most recent search. For example, if the user had

166 // typed "google blog trans" and then typed an additional 'l' (at the end,

167 // of course) then there would be four items in the cache: 'blog', 'google',

168 // 'trans', and 'transl'. All would be marked as being in use except for the

169 // 'trans' item; its cached data would have been used when optimizing the

170 // construction of the search results candidates for 'transl' but then would

171 // no longer needed.

172 //

173 // Items stored in the search term cache. If a search term exactly matches one

174 // in the cache then we can quickly supply the proper \|history_id_set_\| (and

175 // marking the cache item as being \|used_\|. If we find a prefix for a search

176 // term in the cache (which is very likely to occur as the user types each

177 // term into the omnibox) then we can short-circuit the index search for those

178 // characters in the prefix by returning the \|word_id_set\|. In that case we do

179 // not mark the item as being \|used_\|.

180 struct SearchTermCacheItem {

181 SearchTermCacheItem(const WordIDSet& word_id_set,

182 const HistoryIDSet& history_id_set);

183 // Creates a cache item for a term which has no results.

184 SearchTermCacheItem();

185

186 ~SearchTermCacheItem();

187

188 WordIDSet word_id_set_;

189 HistoryIDSet history_id_set_;

190 bool used_; // True if this item has been used for the current term search.

191 };

192 typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap;

193

194 // A helper class which performs the final filter on each candidate

195 // history URL match, inserting accepted matches into \|scored_matches_\|.

196 class AddHistoryMatch : public std::unary_function<HistoryID, void> {

197 public:

198 AddHistoryMatch(bookmarks::BookmarkModel* bookmark_model,

199 const URLIndexPrivateData& private_data,

200 const std::string& languages,

201 const base::string16& lower_string,

202 const String16Vector& lower_terms,

203 const base::Time now);

204 ~AddHistoryMatch();

205

206 void operator()(const HistoryID history_id);

207

208 ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }

209

210 private:

211 friend class InMemoryURLIndexTest;

212 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, AddHistoryMatch);

213 bookmarks::BookmarkModel* bookmark_model_;

214 const URLIndexPrivateData& private_data_;

215 const std::string& languages_;

216 ScoredHistoryMatches scored_matches_;

217 const base::string16& lower_string_;

218 const String16Vector& lower_terms_;

219 WordStarts lower_terms_to_word_starts_offsets_;

220 const base::Time now_;

221 };

222

223 // A helper predicate class used to filter excess history items when the

224 // candidate results set is too large.

225 class HistoryItemFactorGreater

226 : public std::binary_function<HistoryID, HistoryID, void> {

227 public:

228 explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map);

229 ~HistoryItemFactorGreater();

230

231 bool operator()(const HistoryID h1, const HistoryID h2);

232

233 private:

234 const HistoryInfoMap& history_info_map_;

235 };

236

237 // URL History indexing support functions.

238

239 // Composes a set of history item IDs by intersecting the set for each word

240 // in \|unsorted_words\|.

241 HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words);

242

243 // Helper function to HistoryIDSetFromWords which composes a set of history

244 // ids for the given term given in \|term\|.

245 HistoryIDSet HistoryIDsForTerm(const base::string16& term);

246

247 // Given a set of Char16s, finds words containing those characters.

248 WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);

249

250 // Indexes one URL history item as described by \|row\|. Returns true if the

251 // row was actually indexed. \|languages\| gives a list of language encodings by

252 // which the URLs and page titles are broken down into words and characters.

253 // \|scheme_whitelist\| is used to filter non-qualifying schemes. If

254 // \|history_db\| is not NULL then this function uses the history database

255 // synchronously to get the URL's recent visits information. This mode should

256 // only be used on the historyDB thread. If \|history_db\| is NULL, then

257 // this function uses \|history_service\| to schedule a task on the

258 // historyDB thread to fetch and update the recent visits

259 // information.

260 bool IndexRow(history::HistoryDatabase* history_db,

261 history::HistoryService* history_service,

262 const history::URLRow& row,

263 const std::string& languages,

264 const std::set<std::string>& scheme_whitelist,

265 base::CancelableTaskTracker* tracker);

266

267 // Parses and indexes the words in the URL and page title of \|row\| and

268 // calculate the word starts in each, saving the starts in \|word_starts\|.

269 // \|languages\| gives a list of language encodings by which the URLs and page

270 // titles are broken down into words and characters.

271 void AddRowWordsToIndex(const history::URLRow& row,

272 RowWordStarts* word_starts,

273 const std::string& languages);

274

275 // Given a single word in \|uni_word\|, adds a reference for the containing

276 // history item identified by \|history_id\| to the index.

277 void AddWordToIndex(const base::string16& uni_word, HistoryID history_id);

278

279 // Creates a new entry in the word/history map for \|word_id\| and add

280 // \|history_id\| as the initial element of the word's set.

281 void AddWordHistory(const base::string16& uni_word, HistoryID history_id);

282

283 // Updates an existing entry in the word/history index by adding the

284 // \|history_id\| to set for \|word_id\| in the word_id_history_map_.

285 void UpdateWordHistory(WordID word_id, HistoryID history_id);

286

287 // Adds \|word_id\| to \|history_id\|'s entry in the history/word map,

288 // creating a new entry if one does not already exist.

289 void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id);

290

291 // Removes \|row\| and all associated words and characters from the index.

292 void RemoveRowFromIndex(const history::URLRow& row);

293

294 // Removes all words and characters associated with \|row\| from the index.

295 void RemoveRowWordsFromIndex(const history::URLRow& row);

296

297 // Clears \|used_\| for each item in the search term cache.

298 void ResetSearchTermCache();

299

300 // Caches the index private data and writes the cache file to the profile

301 // directory. Called by WritePrivateDataToCacheFileTask.

302 bool SaveToFile(const base::FilePath& file_path);

303

304 // Encode a data structure into the protobuf \|cache\|.

305 void SavePrivateData(

306 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;

307 void SaveWordList(

308 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;

309 void SaveWordMap(in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;

310 void SaveCharWordMap(

311 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;

312 void SaveWordIDHistoryMap(

313 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;

314 void SaveHistoryInfoMap(

315 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;

316 void SaveWordStartsMap(

317 in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;

318

319 // Decode a data structure from the protobuf \|cache\|. Return false if there

320 // is any kind of failure. \|languages\| will be used to break URLs and page

321 // titles into words

322 bool RestorePrivateData(

323 const in_memory_url_index::InMemoryURLIndexCacheItem& cache,

324 const std::string& languages);

325 bool RestoreWordList(

326 const in_memory_url_index::InMemoryURLIndexCacheItem& cache);

327 bool RestoreWordMap(

328 const in_memory_url_index::InMemoryURLIndexCacheItem& cache);

329 bool RestoreCharWordMap(

330 const in_memory_url_index::InMemoryURLIndexCacheItem& cache);

331 bool RestoreWordIDHistoryMap(

332 const in_memory_url_index::InMemoryURLIndexCacheItem& cache);

333 bool RestoreHistoryInfoMap(

334 const in_memory_url_index::InMemoryURLIndexCacheItem& cache);

335 bool RestoreWordStartsMap(

336 const in_memory_url_index::InMemoryURLIndexCacheItem& cache,

337 const std::string& languages);

338

339 // Determines if \|gurl\| has a whitelisted scheme and returns true if so.

340 static bool URLSchemeIsWhitelisted(const GURL& gurl,

341 const std::set<std::string>& whitelist);

342

343 // Cache of search terms.

344 SearchTermCacheMap search_term_cache_;

345

346 // Start of data members that are cached -------------------------------------

347

348 // The version of the cache file most recently used to restore this instance

349 // of the private data. If the private data was rebuilt from the history

350 // database this will be 0.

351 int restored_cache_version_;

352

353 // The last time the data was rebuilt from the history database.

354 base::Time last_time_rebuilt_from_history_;

355

356 // A list of all of indexed words. The index of a word in this list is the

357 // ID of the word in the word_map_. It reduces the memory overhead by

358 // replacing a potentially long and repeated string with a simple index.

359 String16Vector word_list_;

360

361 // A list of available words slots in \|word_list_\|. An available word slot

362 // is the index of a unused word in word_list_ vector, also referred to as

363 // a WordID. As URL visits are added or modified new words may be added to

364 // the index, in which case any available words are used, if any, and then

365 // words are added to the end of the word_list_. When URL visits are

366 // modified or deleted old words may be removed from the index, in which

367 // case the slots for those words are added to available_words_ for resuse

368 // by future URL updates.

369 WordIDSet available_words_;

370

371 // A one-to-one mapping from the a word string to its slot number (i.e.

372 // WordID) in the \|word_list_\|.

373 WordMap word_map_;

374

375 // A one-to-many mapping from a single character to all WordIDs of words

376 // containing that character.

377 CharWordIDMap char_word_map_;

378

379 // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as

380 // used in the history database) of history items in which the word occurs.

381 WordIDHistoryMap word_id_history_map_;

382

383 // A one-to-many mapping from a HistoryID to all WordIDs of words that occur

384 // in the URL and/or page title of the history item referenced by that

385 // HistoryID.

386 HistoryIDWordMap history_id_word_map_;

387

388 // A one-to-one mapping from HistoryID to the history item data governing

389 // index inclusion and relevance scoring.

390 HistoryInfoMap history_info_map_;

391

392 // A one-to-one mapping from HistoryID to the word starts detected in each

393 // item's URL and page title.

394 WordStartsMap word_starts_map_;

395

396 // End of data members that are cached ---------------------------------------

397

398 // For unit testing only. Specifies the version of the cache file to be saved.

399 // Used only for testing upgrading of an older version of the cache upon

400 // restore.

401 int saved_cache_version_;

402

403 // Used for unit testing only. Records the number of candidate history items

404 // at three stages in the index searching process.

405 size_t pre_filter_item_count_; // After word index is queried.

406 size_t post_filter_item_count_; // After trimming large result set.

407 size_t post_scoring_item_count_; // After performing final filter/scoring.

408 };

409

410 #endif // CHROME_BROWSER_AUTOCOMPLETE_URL_INDEX_PRIVATE_DATA_H_

OLD	NEW