chrome/browser/history/url_index_private_data.h - Issue 963823003: Move InMemoryURLIndex into chrome/browser/autocomplete

Side by Side Diff: chrome/browser/history/url_index_private_data.h

Issue 963823003: Move InMemoryURLIndex into chrome/browser/autocomplete (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@shortcut-database

Patch Set: Fixing win_chromium_x64_rel_ng build Created 5 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #ifndef CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_

6 #define CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_

7

8 #include <set>

9 #include <string>

10

11 #include "base/files/file_path.h"

12 #include "base/gtest_prod_util.h"

13 #include "base/memory/ref_counted.h"

14 #include "chrome/browser/history/history_service.h"

15 #include "components/history/core/browser/in_memory_url_index_cache.pb.h"

16 #include "components/history/core/browser/in_memory_url_index_types.h"

17 #include "components/history/core/browser/scored_history_match.h"

18

19 class HistoryQuickProviderTest;

20

21 namespace in_memory_url_index {

22 class InMemoryURLIndexCacheItem;

23 }

24

25 namespace history {

26

27 namespace imui = in_memory_url_index;

28

29 class HistoryDatabase;

30 class InMemoryURLIndex;

31 class RefCountedBool;

32

33 // Current version of the cache file.

34 static const int kCurrentCacheFileVersion = 5;

35

36 // A structure private to InMemoryURLIndex describing its internal data and

37 // providing for restoring, rebuilding and updating that internal data. As

38 // this class is for exclusive use by the InMemoryURLIndex class there should

39 // be no calls from any other class.

40 //

41 // All public member functions are called on the main thread unless otherwise

42 // annotated.

43 class URLIndexPrivateData

44 : public base::RefCountedThreadSafe<URLIndexPrivateData> {

45 public:

46 URLIndexPrivateData();

47

48 // Given a base::string16 in \|term_string\|, scans the history index and

49 // returns a vector with all scored, matching history items. The

50 // \|term_string\| is broken down into individual terms (words), each of which

51 // must occur in the candidate history item's URL or page title for the item

52 // to qualify; however, the terms do not necessarily have to be adjacent. We

53 // also allow breaking \|term_string\| at \|cursor_position\| (if

54 // set). Once we have a set of candidates, they are filtered to ensure

55 // that all \|term_string\| terms, as separated by whitespace and the

56 // cursor (if set), occur within the candidate's URL or page title.

57 // Scores are then calculated on no more than \|kItemsToScoreLimit\|

58 // candidates, as the scoring of such a large number of candidates may

59 // cause perceptible typing response delays in the omnibox. This is

60 // likely to occur for short omnibox terms such as 'h' and 'w' which

61 // will be found in nearly all history candidates. Results are sorted by

62 // descending score. The full results set (i.e. beyond the

63 // \|kItemsToScoreLimit\| limit) will be retained and used for subsequent calls

64 // to this function. \|languages\| is used to help parse/format the URLs in the

65 // history index. In total, \|max_matches\| of items will be returned in the

66 // \|ScoredHistoryMatches\| vector.

67 ScoredHistoryMatches HistoryItemsForTerms(

68 base::string16 term_string,

69 size_t cursor_position,

70 size_t max_matches,

71 const std::string& languages,

72 const ScoredHistoryMatch::Builder& builder);

73

74 // Adds the history item in \|row\| to the index if it does not already already

75 // exist and it meets the minimum 'quick' criteria. If the row already exists

76 // in the index then the index will be updated if the row still meets the

77 // criteria, otherwise the row will be removed from the index. Returns true

78 // if the index was actually updated. \|languages\| gives a list of language

79 // encodings by which the URLs and page titles are broken down into words and

80 // characters. \|scheme_whitelist\| is used to filter non-qualifying schemes.

81 // \|history_service\| is used to schedule an update to the recent visits

82 // component of this URL's entry in the index.

83 bool UpdateURL(HistoryService* history_service,

84 const URLRow& row,

85 const std::string& languages,

86 const std::set<std::string>& scheme_whitelist,

87 base::CancelableTaskTracker* tracker);

88

89 // Updates the entry for \|url_id\| in the index, replacing its

90 // recent visits information with \|recent_visits\|. If \|url_id\|

91 // is not in the index, does nothing.

92 void UpdateRecentVisits(URLID url_id,

93 const VisitVector& recent_visits);

94

95 // Using \|history_service\| schedules an update (using the historyDB

96 // thread) for the recent visits information for \|url_id\|. Unless

97 // something unexpectedly goes wrong, UdpateRecentVisits() should

98 // eventually be called from a callback.

99 void ScheduleUpdateRecentVisits(HistoryService* history_service,

100 URLID url_id,

101 base::CancelableTaskTracker* tracker);

102

103 // Deletes index data for the history item with the given \|url\|.

104 // The item may not have actually been indexed, which is the case if it did

105 // not previously meet minimum 'quick' criteria. Returns true if the index

106 // was actually updated.

107 bool DeleteURL(const GURL& url);

108

109 // Constructs a new object by restoring its contents from the cache file

110 // at \|path\|. Returns the new URLIndexPrivateData which on success will

111 // contain the restored data but upon failure will be empty. \|languages\|

112 // is used to break URLs and page titles into words. This function

113 // should be run on the the file thread.

114 static scoped_refptr<URLIndexPrivateData> RestoreFromFile(

115 const base::FilePath& path,

116 const std::string& languages);

117

118 // Constructs a new object by rebuilding its contents from the history

119 // database in \|history_db\|. Returns the new URLIndexPrivateData which on

120 // success will contain the rebuilt data but upon failure will be empty.

121 // \|languages\| gives a list of language encodings by which the URLs and page

122 // titles are broken down into words and characters.

123 static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(

124 HistoryDatabase* history_db,

125 const std::string& languages,

126 const std::set<std::string>& scheme_whitelist);

127

128 // Writes \|private_data\| as a cache file to \|file_path\| and returns success.

129 static bool WritePrivateDataToCacheFileTask(

130 scoped_refptr<URLIndexPrivateData> private_data,

131 const base::FilePath& file_path);

132

133 // Creates a copy of ourself.

134 scoped_refptr<URLIndexPrivateData> Duplicate() const;

135

136 // Returns true if there is no data in the index.

137 bool Empty() const;

138

139 // Initializes all index data members in preparation for restoring the index

140 // from the cache or a complete rebuild from the history database.

141 void Clear();

142

143 private:

144 friend class base::RefCountedThreadSafe<URLIndexPrivateData>;

145 ~URLIndexPrivateData();

146

147 friend class AddHistoryMatch;

148 friend class ::HistoryQuickProviderTest;

149 friend class InMemoryURLIndexTest;

150 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);

151 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet);

152 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory);

153 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld);

154 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);

155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);

156 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);

157 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs);

158 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);

159

160 // Support caching of term results so that we can optimize searches which

161 // build upon a previous search. Each entry in this map represents one

162 // search term from the most recent search. For example, if the user had

163 // typed "google blog trans" and then typed an additional 'l' (at the end,

164 // of course) then there would be four items in the cache: 'blog', 'google',

165 // 'trans', and 'transl'. All would be marked as being in use except for the

166 // 'trans' item; its cached data would have been used when optimizing the

167 // construction of the search results candidates for 'transl' but then would

168 // no longer needed.

169 //

170 // Items stored in the search term cache. If a search term exactly matches one

171 // in the cache then we can quickly supply the proper \|history_id_set_\| (and

172 // marking the cache item as being \|used_\|. If we find a prefix for a search

173 // term in the cache (which is very likely to occur as the user types each

174 // term into the omnibox) then we can short-circuit the index search for those

175 // characters in the prefix by returning the \|word_id_set\|. In that case we do

176 // not mark the item as being \|used_\|.

177 struct SearchTermCacheItem {

178 SearchTermCacheItem(const WordIDSet& word_id_set,

179 const HistoryIDSet& history_id_set);

180 // Creates a cache item for a term which has no results.

181 SearchTermCacheItem();

182

183 ~SearchTermCacheItem();

184

185 WordIDSet word_id_set_;

186 HistoryIDSet history_id_set_;

187 bool used_; // True if this item has been used for the current term search.

188 };

189 typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap;

190

191 // A helper class which performs the final filter on each candidate

192 // history URL match, inserting accepted matches into \|scored_matches_\|.

193 class AddHistoryMatch : public std::unary_function<HistoryID, void> {

194 public:

195 AddHistoryMatch(const URLIndexPrivateData& private_data,

196 const std::string& languages,

197 const base::string16& lower_string,

198 const String16Vector& lower_terms,

199 const base::Time now,

200 const ScoredHistoryMatch::Builder& builder);

201 ~AddHistoryMatch();

202

203 void operator()(const HistoryID history_id);

204

205 ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }

206

207 private:

208 const URLIndexPrivateData& private_data_;

209 const std::string& languages_;

210 const ScoredHistoryMatch::Builder& builder_;

211 ScoredHistoryMatches scored_matches_;

212 const base::string16& lower_string_;

213 const String16Vector& lower_terms_;

214 WordStarts lower_terms_to_word_starts_offsets_;

215 const base::Time now_;

216 };

217

218 // A helper predicate class used to filter excess history items when the

219 // candidate results set is too large.

220 class HistoryItemFactorGreater

221 : public std::binary_function<HistoryID, HistoryID, void> {

222 public:

223 explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map);

224 ~HistoryItemFactorGreater();

225

226 bool operator()(const HistoryID h1, const HistoryID h2);

227

228 private:

229 const history::HistoryInfoMap& history_info_map_;

230 };

231

232 // URL History indexing support functions.

233

234 // Composes a set of history item IDs by intersecting the set for each word

235 // in \|unsorted_words\|.

236 HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words);

237

238 // Helper function to HistoryIDSetFromWords which composes a set of history

239 // ids for the given term given in \|term\|.

240 HistoryIDSet HistoryIDsForTerm(const base::string16& term);

241

242 // Given a set of Char16s, finds words containing those characters.

243 WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);

244

245 // Indexes one URL history item as described by \|row\|. Returns true if the

246 // row was actually indexed. \|languages\| gives a list of language encodings by

247 // which the URLs and page titles are broken down into words and characters.

248 // \|scheme_whitelist\| is used to filter non-qualifying schemes. If

249 // \|history_db\| is not NULL then this function uses the history database

250 // synchronously to get the URL's recent visits information. This mode should

251 // only be used on the historyDB thread. If \|history_db\| is NULL, then

252 // this function uses \|history_service\| to schedule a task on the

253 // historyDB thread to fetch and update the recent visits

254 // information.

255 bool IndexRow(HistoryDatabase* history_db,

256 HistoryService* history_service,

257 const URLRow& row,

258 const std::string& languages,

259 const std::set<std::string>& scheme_whitelist,

260 base::CancelableTaskTracker* tracker);

261

262 // Parses and indexes the words in the URL and page title of \|row\| and

263 // calculate the word starts in each, saving the starts in \|word_starts\|.

264 // \|languages\| gives a list of language encodings by which the URLs and page

265 // titles are broken down into words and characters.

266 void AddRowWordsToIndex(const URLRow& row,

267 RowWordStarts* word_starts,

268 const std::string& languages);

269

270 // Given a single word in \|uni_word\|, adds a reference for the containing

271 // history item identified by \|history_id\| to the index.

272 void AddWordToIndex(const base::string16& uni_word, HistoryID history_id);

273

274 // Creates a new entry in the word/history map for \|word_id\| and add

275 // \|history_id\| as the initial element of the word's set.

276 void AddWordHistory(const base::string16& uni_word, HistoryID history_id);

277

278 // Updates an existing entry in the word/history index by adding the

279 // \|history_id\| to set for \|word_id\| in the word_id_history_map_.

280 void UpdateWordHistory(WordID word_id, HistoryID history_id);

281

282 // Adds \|word_id\| to \|history_id\|'s entry in the history/word map,

283 // creating a new entry if one does not already exist.

284 void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id);

285

286 // Removes \|row\| and all associated words and characters from the index.

287 void RemoveRowFromIndex(const URLRow& row);

288

289 // Removes all words and characters associated with \|row\| from the index.

290 void RemoveRowWordsFromIndex(const URLRow& row);

291

292 // Clears \|used_\| for each item in the search term cache.

293 void ResetSearchTermCache();

294

295 // Caches the index private data and writes the cache file to the profile

296 // directory. Called by WritePrivateDataToCacheFileTask.

297 bool SaveToFile(const base::FilePath& file_path);

298

299 // Encode a data structure into the protobuf \|cache\|.

300 void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const;

301 void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const;

302 void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const;

303 void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const;

304 void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const;

305 void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const;

306 void SaveWordStartsMap(imui::InMemoryURLIndexCacheItem* cache) const;

307

308 // Decode a data structure from the protobuf \|cache\|. Return false if there

309 // is any kind of failure. \|languages\| will be used to break URLs and page

310 // titles into words

311 bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache,

312 const std::string& languages);

313 bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache);

314 bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache);

315 bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache);

316 bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache);

317 bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache);

318 bool RestoreWordStartsMap(const imui::InMemoryURLIndexCacheItem& cache,

319 const std::string& languages);

320

321 // Determines if \|gurl\| has a whitelisted scheme and returns true if so.

322 static bool URLSchemeIsWhitelisted(const GURL& gurl,

323 const std::set<std::string>& whitelist);

324

325 // Cache of search terms.

326 SearchTermCacheMap search_term_cache_;

327

328 // Start of data members that are cached -------------------------------------

329

330 // The version of the cache file most recently used to restore this instance

331 // of the private data. If the private data was rebuilt from the history

332 // database this will be 0.

333 int restored_cache_version_;

334

335 // The last time the data was rebuilt from the history database.

336 base::Time last_time_rebuilt_from_history_;

337

338 // A list of all of indexed words. The index of a word in this list is the

339 // ID of the word in the word_map_. It reduces the memory overhead by

340 // replacing a potentially long and repeated string with a simple index.

341 String16Vector word_list_;

342

343 // A list of available words slots in \|word_list_\|. An available word slot

344 // is the index of a unused word in word_list_ vector, also referred to as

345 // a WordID. As URL visits are added or modified new words may be added to

346 // the index, in which case any available words are used, if any, and then

347 // words are added to the end of the word_list_. When URL visits are

348 // modified or deleted old words may be removed from the index, in which

349 // case the slots for those words are added to available_words_ for resuse

350 // by future URL updates.

351 WordIDSet available_words_;

352

353 // A one-to-one mapping from the a word string to its slot number (i.e.

354 // WordID) in the \|word_list_\|.

355 WordMap word_map_;

356

357 // A one-to-many mapping from a single character to all WordIDs of words

358 // containing that character.

359 CharWordIDMap char_word_map_;

360

361 // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as

362 // used in the history database) of history items in which the word occurs.

363 WordIDHistoryMap word_id_history_map_;

364

365 // A one-to-many mapping from a HistoryID to all WordIDs of words that occur

366 // in the URL and/or page title of the history item referenced by that

367 // HistoryID.

368 HistoryIDWordMap history_id_word_map_;

369

370 // A one-to-one mapping from HistoryID to the history item data governing

371 // index inclusion and relevance scoring.

372 HistoryInfoMap history_info_map_;

373

374 // A one-to-one mapping from HistoryID to the word starts detected in each

375 // item's URL and page title.

376 WordStartsMap word_starts_map_;

377

378 // End of data members that are cached ---------------------------------------

379

380 // For unit testing only. Specifies the version of the cache file to be saved.

381 // Used only for testing upgrading of an older version of the cache upon

382 // restore.

383 int saved_cache_version_;

384

385 // Used for unit testing only. Records the number of candidate history items

386 // at three stages in the index searching process.

387 size_t pre_filter_item_count_; // After word index is queried.

388 size_t post_filter_item_count_; // After trimming large result set.

389 size_t post_scoring_item_count_; // After performing final filter/scoring.

390 };

391

392 } // namespace history

393

394 #endif // CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_

OLD	NEW

« no previous file with comments | « chrome/browser/history/in_memory_url_index_unittest.cc ('k') | chrome/browser/history/url_index_private_data.cc » ('j') | no next file with comments »