chrome/browser/history/in_memory_url_index.cc - Issue 8359019: Create Private Data for InMemoryURLIndex (in Preparation for SQLite Cache)

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 8359019: Create Private Data for InMemoryURLIndex (in Preparation for SQLite Cache) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 9 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« chrome/browser/history/history_types.h ('K') | « chrome/browser/history/in_memory_url_index.h ('k') | chrome/browser/history/in_memory_url_index_types.h » ('j') | chrome/browser/history/in_memory_url_index_types.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/history/in_memory_url_index.h"	5 #include "chrome/browser/history/in_memory_url_index.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <functional>	8 #include <functional>

9 #include <iterator>	9 #include <iterator>

10 #include <limits>	10 #include <limits>

11 #include <numeric>	11 #include <numeric>

12	12

13 #include "base/file_util.h"	13 #include "base/file_util.h"

14 #include "base/i18n/break_iterator.h"

15 #include "base/i18n/case_conversion.h"	14 #include "base/i18n/case_conversion.h"

16 #include "base/metrics/histogram.h"	15 #include "base/metrics/histogram.h"

17 #include "base/string_util.h"	16 #include "base/string_util.h"

18 #include "base/threading/thread_restrictions.h"	17 #include "base/threading/thread_restrictions.h"

19 #include "base/time.h"	18 #include "base/time.h"

20 #include "base/utf_string_conversions.h"	19 #include "base/utf_string_conversions.h"

21 #include "chrome/browser/autocomplete/autocomplete.h"	20 #include "chrome/browser/autocomplete/autocomplete.h"

22 #include "chrome/browser/autocomplete/history_provider_util.h"	21 #include "chrome/browser/autocomplete/history_provider_util.h"

23 #include "chrome/browser/history/url_database.h"	22 #include "chrome/browser/history/url_database.h"

24 #include "chrome/browser/profiles/profile.h"	23 #include "chrome/browser/profiles/profile.h"

(...skipping 27 matching lines...) Expand all Loading...
52 HistoryInfoMapEntry;	51 HistoryInfoMapEntry;

53	52

54 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;	53 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;

55	54

56 // Score ranges used to get a 'base' score for each of the scoring factors	55 // Score ranges used to get a 'base' score for each of the scoring factors

57 // (such as recency of last visit, times visited, times the URL was typed,	56 // (such as recency of last visit, times visited, times the URL was typed,

58 // and the quality of the string match). There is a matching value range for	57 // and the quality of the string match). There is a matching value range for

59 // each of these scores for each factor.	58 // each of these scores for each factor.

60 const int kScoreRank[] = { 1425, 1200, 900, 400 };	59 const int kScoreRank[] = { 1425, 1200, 900, 400 };

61	60

62 ScoredHistoryMatch::ScoredHistoryMatch()

63 : raw_score(0),

64 can_inline(false) {}

65

66 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info)

67 : HistoryMatch(url_info, 0, false, false),

68 raw_score(0),

69 can_inline(false) {}

70

71 ScoredHistoryMatch::~ScoredHistoryMatch() {}

72

73 // Comparison function for sorting ScoredMatches by their scores.

74 bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1,

75 const ScoredHistoryMatch& m2) {

76 return m1.raw_score >= m2.raw_score;

77 }

78

79 InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem(	61 InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem(

80 const WordIDSet& word_id_set,	62 const WordIDSet& word_id_set,

81 const HistoryIDSet& history_id_set)	63 const HistoryIDSet& history_id_set)

82 : word_id_set_(word_id_set),	64 : word_id_set_(word_id_set),

83 history_id_set_(history_id_set),	65 history_id_set_(history_id_set),

84 used_(true) {}	66 used_(true) {}

85	67

86 InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem()	68 InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem()

87 : used_(true) {}	69 : used_(true) {}

88	70

89 InMemoryURLIndex::SearchTermCacheItem::~SearchTermCacheItem() {}	71 InMemoryURLIndex::SearchTermCacheItem::~SearchTermCacheItem() {}

90	72

91 // Comparison function for sorting TermMatches by their offsets.

92 bool MatchOffsetLess(const TermMatch& m1, const TermMatch& m2) {

93 return m1.offset < m2.offset;

94 }

95

96 // Comparison function for sorting search terms by descending length.	73 // Comparison function for sorting search terms by descending length.

97 bool LengthGreater(const string16& string_a, const string16& string_b) {	74 bool LengthGreater(const string16& string_a, const string16& string_b) {

98 return string_a.length() > string_b.length();	75 return string_a.length() > string_b.length();

99 }	76 }

100	77

101 // std::accumulate helper function to add up TermMatches' lengths.	78 // std::accumulate helper function to add up TermMatches' lengths.

102 int AccumulateMatchLength(int total, const TermMatch& match) {	79 int AccumulateMatchLength(int total, const TermMatch& match) {

103 return total + match.length;	80 return total + match.length;

104 }	81 }

105	82

(...skipping 24 matching lines...) Expand all Loading...
130 if (i > 0) {	107 if (i > 0) {

131 score += (value - value_ranks[i]) *	108 score += (value - value_ranks[i]) *

132 (kScoreRank[i - 1] - kScoreRank[i]) /	109 (kScoreRank[i - 1] - kScoreRank[i]) /

133 (value_ranks[i - 1] - value_ranks[i]);	110 (value_ranks[i - 1] - value_ranks[i]);

134 }	111 }

135 return score;	112 return score;

136 }	113 }

137	114

138 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)	115 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)

139 : history_dir_(history_dir),	116 : history_dir_(history_dir),

140 history_item_count_(0),	117 private_data_(new URLIndexPrivateData),

141 cached_at_shutdown_(false) {	118 cached_at_shutdown_(false) {

142 InMemoryURLIndex::InitializeSchemeWhitelist(&scheme_whitelist_);	119 InMemoryURLIndex::InitializeSchemeWhitelist(&scheme_whitelist_);

143 }	120 }

144	121

145 // Called only by unit tests.	122 // Called only by unit tests.

146 InMemoryURLIndex::InMemoryURLIndex()	123 InMemoryURLIndex::InMemoryURLIndex()

147 : history_item_count_(0),	124 : private_data_(new URLIndexPrivateData),

148 cached_at_shutdown_(false) {	125 cached_at_shutdown_(false) {

149 InMemoryURLIndex::InitializeSchemeWhitelist(&scheme_whitelist_);	126 InMemoryURLIndex::InitializeSchemeWhitelist(&scheme_whitelist_);

150 }	127 }

151	128

152 InMemoryURLIndex::~InMemoryURLIndex() {	129 InMemoryURLIndex::~InMemoryURLIndex() {

153 // If there was a history directory (which there won't be for some unit tests)	130 // If there was a history directory (which there won't be for some unit tests)

154 // then insure that the cache has already been saved.	131 // then insure that the cache has already been saved.

155 DCHECK(history_dir_.empty() \|\| cached_at_shutdown_);	132 DCHECK(history_dir_.empty() \|\| cached_at_shutdown_);

156 }	133 }

157	134

158 // static	135 // static

159 void InMemoryURLIndex::InitializeSchemeWhitelist(	136 void InMemoryURLIndex::InitializeSchemeWhitelist(

160 std::set<std::string>* whitelist) {	137 std::set<std::string>* whitelist) {

161 DCHECK(whitelist);	138 DCHECK(whitelist);

162 whitelist->insert(std::string(chrome::kAboutScheme));	139 whitelist->insert(std::string(chrome::kAboutScheme));

163 whitelist->insert(std::string(chrome::kChromeUIScheme));	140 whitelist->insert(std::string(chrome::kChromeUIScheme));

164 whitelist->insert(std::string(chrome::kFileScheme));	141 whitelist->insert(std::string(chrome::kFileScheme));

165 whitelist->insert(std::string(chrome::kFtpScheme));	142 whitelist->insert(std::string(chrome::kFtpScheme));

166 whitelist->insert(std::string(chrome::kHttpScheme));	143 whitelist->insert(std::string(chrome::kHttpScheme));

167 whitelist->insert(std::string(chrome::kHttpsScheme));	144 whitelist->insert(std::string(chrome::kHttpsScheme));

168 whitelist->insert(std::string(chrome::kMailToScheme));	145 whitelist->insert(std::string(chrome::kMailToScheme));

169 }	146 }

170	147

171 // Indexing	148 // Indexing

172	149

173 bool InMemoryURLIndex::Init(history::URLDatabase* history_db,	150 bool InMemoryURLIndex::Init(URLDatabase* history_db,

174 const std::string& languages) {	151 const std::string& languages) {

175 // TODO(mrossetti): Register for profile/language change notifications.	152 // TODO(mrossetti): Register for profile/language change notifications.

176 languages_ = languages;	153 languages_ = languages;

177 return ReloadFromHistory(history_db, false);	154 return ReloadFromHistory(history_db, false);

178 }	155 }

179	156

180 void InMemoryURLIndex::ShutDown() {	157 void InMemoryURLIndex::ShutDown() {

181 // Write our cache.	158 // Write our cache.

182 SaveToCacheFile();	159 SaveToCacheFile();

183 cached_at_shutdown_ = true;	160 cached_at_shutdown_ = true;

184 }	161 }

185	162

186 bool InMemoryURLIndex::IndexRow(const URLRow& row) {	163 bool InMemoryURLIndex::IndexRow(const URLRow& row) {

187 const GURL& gurl(row.url());	164 const GURL& gurl(row.url());

188	165

189 // Index only URLs with a whitelisted scheme.	166 // Index only URLs with a whitelisted scheme.

190 if (!InMemoryURLIndex::URLSchemeIsWhitelisted(gurl))	167 if (!InMemoryURLIndex::URLSchemeIsWhitelisted(gurl))

191 return true;	168 return true;

192	169

	170 URLID row_id = row.id();

	171 // Strip out username and password before saving and indexing.

193 string16 url(net::FormatUrl(gurl, languages_,	172 string16 url(net::FormatUrl(gurl, languages_,

194 net::kFormatUrlOmitUsernamePassword,	173 net::kFormatUrlOmitUsernamePassword,

195 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS,	174 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS,

196 NULL, NULL, NULL));	175 NULL, NULL, NULL));

197	176

198 HistoryID history_id = static_cast<HistoryID>(row.id());	177 HistoryID history_id = static_cast<HistoryID>(row_id);

199 DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max());	178 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

200	179

201 // Add the row for quick lookup in the history info store.	180 // Add the row for quick lookup in the history info store.

202 URLRow new_row(GURL(url), row.id());	181 URLRow new_row(GURL(url), row_id);

203 new_row.set_visit_count(row.visit_count());	182 new_row.set_visit_count(row.visit_count());

204 new_row.set_typed_count(row.typed_count());	183 new_row.set_typed_count(row.typed_count());

205 new_row.set_last_visit(row.last_visit());	184 new_row.set_last_visit(row.last_visit());

206 new_row.set_title(row.title());	185 new_row.set_title(row.title());

207 history_info_map_[history_id] = new_row;	186 private_data_->history_info_map_[history_id] = new_row;

208	187

	188 // Index the words contained in the URL and title of the row.

	189 AddRowWordsToIndex(new_row);

	190 return true;
	Peter Kasting 2011/10/24 22:01:40 Nit: I just noticed that this function (old and ne Nit: I just noticed that this function (old and new versions) never returns false. Seems like it should be void then. mrossetti 2011/10/25 16:15:05 Done. Show quoted text On 2011/10/24 22:01:40, Peter Kasting wrote: > Nit: I just noticed that this function (old and new versions) never returns > false. Seems like it should be void then. Done.
	191 }

	192

	193 void InMemoryURLIndex::AddRowWordsToIndex(const URLRow& row) {

	194 HistoryID history_id = static_cast<HistoryID>(row.id());

209 // Split URL into individual, unique words then add in the title words.	195 // Split URL into individual, unique words then add in the title words.

	196 const GURL& gurl(row.url());

	197 string16 url(net::FormatUrl(gurl, languages_,

	198 net::kFormatUrlOmitUsernamePassword,

	199 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS,

	200 NULL, NULL, NULL));

210 url = base::i18n::ToLower(url);	201 url = base::i18n::ToLower(url);

211 String16Set url_words = WordSetFromString16(url);	202 String16Set url_words = String16SetFromString16(url);

212 String16Set title_words = WordSetFromString16(row.title());	203 String16Set title_words = String16SetFromString16(row.title());

213 String16Set words;	204 String16Set words;

214 std::set_union(url_words.begin(), url_words.end(),	205 std::set_union(url_words.begin(), url_words.end(),

215 title_words.begin(), title_words.end(),	206 title_words.begin(), title_words.end(),

216 std::insert_iterator<String16Set>(words, words.begin()));	207 std::insert_iterator<String16Set>(words, words.begin()));

217 for (String16Set::iterator word_iter = words.begin();	208 for (String16Set::iterator word_iter = words.begin();

218 word_iter != words.end(); ++word_iter)	209 word_iter != words.end(); ++word_iter)

219 AddWordToIndex(*word_iter, history_id);	210 AddWordToIndex(*word_iter, history_id);

220	211

221 ++history_item_count_;	212 search_term_cache_.clear(); // Invalidate the term cache.

222 return true;	213 }

	214

	215 void InMemoryURLIndex::RemoveRowFromIndex(const URLRow& row) {

	216 RemoveRowWordsFromIndex(row);

	217 HistoryID history_id = static_cast<HistoryID>(row.id());

	218 private_data_->history_info_map_.erase(history_id);

	219 }

	220

	221 void InMemoryURLIndex::RemoveRowWordsFromIndex(const URLRow& row) {

	222 // Remove the entries in history_id_word_map_ and word_id_history_map_ for

	223 // this row.

	224 URLIndexPrivateData& private_data(*(private_data_.get()));

	225 HistoryID history_id = static_cast<HistoryID>(row.id());

	226 WordIDSet word_id_set = private_data.history_id_word_map_[history_id];

	227 private_data.history_id_word_map_.erase(history_id);

	228

	229 // Reconcile any changes to word usage.

	230 for (WordIDSet::iterator word_id_iter = word_id_set.begin();

	231 word_id_iter != word_id_set.end(); ++word_id_iter) {

	232 WordID word_id = *word_id_iter;

	233 private_data.word_id_history_map_[word_id].erase(history_id);

	234 if (!private_data.word_id_history_map_[word_id].empty())

	235 continue; // The word is still in use.

	236

	237 // The word is no longer in use. Reconcile any changes to character usage.

	238 string16 word = private_data.word_list_[word_id];

	239 Char16Set characters = Char16SetFromString16(word);

	240 for (Char16Set::iterator uni_char_iter = characters.begin();

	241 uni_char_iter != characters.end(); ++uni_char_iter) {

	242 char16 uni_char = *uni_char_iter;

	243 private_data.char_word_map_[uni_char].erase(word_id);

	244 if (private_data.char_word_map_[uni_char].empty())

	245 private_data.char_word_map_.erase(uni_char); // No longer in use.

	246 }

	247

	248 // Complete the removal of references to the word.

	249 private_data.word_id_history_map_.erase(word_id);

	250 private_data.word_map_.erase(word);

	251 private_data.word_list_[word_id] = string16();

	252 private_data.available_words_.insert(word_id);

	253 }

223 }	254 }

224	255

225 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,	256 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,

226 bool clear_cache) {	257 bool clear_cache) {

227 ClearPrivateData();	258 ClearPrivateData();

228	259

229 if (!history_db)	260 if (!history_db)

230 return false;	261 return false;

231	262

232 if (clear_cache \|\| !RestoreFromCacheFile()) {	263 if (clear_cache \|\| !RestoreFromCacheFile()) {

233 base::TimeTicks beginning_time = base::TimeTicks::Now();	264 base::TimeTicks beginning_time = base::TimeTicks::Now();

234 // The index has to be built from scratch.	265 // The index has to be built from scratch.

235 URLDatabase::URLEnumerator history_enum;	266 URLDatabase::URLEnumerator history_enum;

236 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))	267 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))

237 return false;	268 return false;

238 URLRow row;	269 URLRow row;

239 while (history_enum.GetNextURL(&row)) {	270 while (history_enum.GetNextURL(&row)) {

240 if (!IndexRow(row))	271 if (!IndexRow(row))

241 return false;	272 return false;

242 }	273 }

243 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",	274 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",

244 base::TimeTicks::Now() - beginning_time);	275 base::TimeTicks::Now() - beginning_time);

245 SaveToCacheFile();	276 SaveToCacheFile();

246 }	277 }

247 return true;	278 return true;

248 }	279 }

249	280

250 void InMemoryURLIndex::ClearPrivateData() {	281 void InMemoryURLIndex::ClearPrivateData() {

251 history_item_count_ = 0;	282 private_data_->Clear();

252 word_list_.clear();

253 word_map_.clear();

254 char_word_map_.clear();

255 word_id_history_map_.clear();

256 history_info_map_.clear();

257 search_term_cache_.clear();	283 search_term_cache_.clear();

258 }	284 }

259	285

260 bool InMemoryURLIndex::RestoreFromCacheFile() {	286 bool InMemoryURLIndex::RestoreFromCacheFile() {

261 // TODO(mrossetti): Figure out how to determine if the cache is up-to-date.	287 // TODO(mrossetti): Figure out how to determine if the cache is up-to-date.

262 // That is: ensure that the database has not been modified since the cache	288 // That is: ensure that the database has not been modified since the cache

263 // was last saved. DB file modification date is inadequate. There are no	289 // was last saved. DB file modification date is inadequate. There are no

264 // SQLite table checksums automatically stored.	290 // SQLite table checksums automatically stored.

265 // FIXME(mrossetti): Move File IO to another thread.	291 // FIXME(mrossetti): Move File IO to another thread.

266 base::ThreadRestrictions::ScopedAllowIO allow_io;	292 base::ThreadRestrictions::ScopedAllowIO allow_io;

(...skipping 15 matching lines...) Expand all Loading...
282 return false;	308 return false;

283 }	309 }

284	310

285 if (!RestorePrivateData(index_cache)) {	311 if (!RestorePrivateData(index_cache)) {

286 ClearPrivateData(); // Back to square one -- must build from scratch.	312 ClearPrivateData(); // Back to square one -- must build from scratch.

287 return false;	313 return false;

288 }	314 }

289	315

290 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",	316 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",

291 base::TimeTicks::Now() - beginning_time);	317 base::TimeTicks::Now() - beginning_time);

292 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", history_item_count_);	318 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",

	319 private_data_->history_id_word_map_.size());

293 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());	320 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());

294 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", word_map_.size());	321 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",

295 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", char_word_map_.size());	322 private_data_->word_map_.size());

	323 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",

	324 private_data_->char_word_map_.size());

296 return true;	325 return true;

297 }	326 }

298	327

299 bool InMemoryURLIndex::SaveToCacheFile() {	328 bool InMemoryURLIndex::SaveToCacheFile() {

300 // TODO(mrossetti): Move File IO to another thread.	329 // TODO(mrossetti): Move File IO to another thread.

301 base::ThreadRestrictions::ScopedAllowIO allow_io;	330 base::ThreadRestrictions::ScopedAllowIO allow_io;

302 FilePath file_path;	331 FilePath file_path;

303 if (!GetCacheFilePath(&file_path))	332 if (!GetCacheFilePath(&file_path))

304 return false;	333 return false;

305	334

(...skipping 14 matching lines...) Expand all Loading...
320 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",	349 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",

321 base::TimeTicks::Now() - beginning_time);	350 base::TimeTicks::Now() - beginning_time);

322 return true;	351 return true;

323 }	352 }

324	353

325 void InMemoryURLIndex::UpdateURL(URLID row_id, const URLRow& row) {	354 void InMemoryURLIndex::UpdateURL(URLID row_id, const URLRow& row) {

326 // The row may or may not already be in our index. If it is not already	355 // The row may or may not already be in our index. If it is not already

327 // indexed and it qualifies then it gets indexed. If it is already	356 // indexed and it qualifies then it gets indexed. If it is already

328 // indexed and still qualifies then it gets updated, otherwise it	357 // indexed and still qualifies then it gets updated, otherwise it

329 // is deleted from the index.	358 // is deleted from the index.

330 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);	359 HistoryInfoMap::iterator row_pos =

331 if (row_pos == history_info_map_.end()) {	360 private_data_->history_info_map_.find(row_id);

	361 if (row_pos == private_data_->history_info_map_.end()) {

332 // This new row should be indexed if it qualifies.	362 // This new row should be indexed if it qualifies.

333 if (RowQualifiesAsSignificant(row, base::Time()))	363 URLRow new_row(row);

334 IndexRow(row);	364 new_row.set_id(row_id);

	365 if (RowQualifiesAsSignificant(new_row, base::Time()))

	366 IndexRow(new_row);

335 } else if (RowQualifiesAsSignificant(row, base::Time())) {	367 } else if (RowQualifiesAsSignificant(row, base::Time())) {

336 // This indexed row still qualifies and will be re-indexed.	368 // This indexed row still qualifies and will be re-indexed.

337 // The url won't have changed but the title, visit count, etc.	369 // The url won't have changed but the title, visit count, etc.

338 // might have changed.	370 // might have changed.

339 URLRow& old_row = row_pos->second;	371 URLRow& updated_row = row_pos->second;

340 old_row.set_visit_count(row.visit_count());	372 updated_row.set_visit_count(row.visit_count());

341 old_row.set_typed_count(row.typed_count());	373 updated_row.set_typed_count(row.typed_count());

342 old_row.set_last_visit(row.last_visit());	374 updated_row.set_last_visit(row.last_visit());

343 // TODO(mrossetti): When we start indexing the title the next line	375 // While the URL is guaranteed to remain stable, the title may have changed.

344 // will need attention.	376 // If so, then we need to update the index with the changed words.

345 old_row.set_title(row.title());	377 if (updated_row.title() != row.title()) {

	378 // Clear all words associated with this row and re-index both the

	379 // URL and title.

	380 RemoveRowWordsFromIndex(updated_row);

	381 updated_row.set_title(row.title());

	382 AddRowWordsToIndex(updated_row);

	383 }

346 } else {	384 } else {

347 // This indexed row no longer qualifies and will be de-indexed.	385 // This indexed row no longer qualifies and will be de-indexed by

348 history_info_map_.erase(row_id);	386 // clearing all words associated with this row.

	387 URLRow& removed_row = row_pos->second;

	388 RemoveRowFromIndex(removed_row);

349 }	389 }

350 // This invalidates the cache.	390 // This invalidates the cache.

351 search_term_cache_.clear();	391 search_term_cache_.clear();

352 // TODO(mrossetti): Record this transaction in the cache.

353 }	392 }

354	393

355 void InMemoryURLIndex::DeleteURL(URLID row_id) {	394 void InMemoryURLIndex::DeleteURL(URLID row_id) {

356 // Note that this does not remove any reference to this row from the	395 // Note that this does not remove any reference to this row from the

357 // word_id_history_map_. That map will continue to contain (and return)	396 // word_id_history_map_. That map will continue to contain (and return)

358 // hits against this row until that map is rebuilt, but since the	397 // hits against this row until that map is rebuilt, but since the

359 // history_info_map_ no longer references the row no erroneous results	398 // history_info_map_ no longer references the row no erroneous results

360 // will propagate to the user.	399 // will propagate to the user.

361 history_info_map_.erase(row_id);	400 private_data_->history_info_map_.erase(row_id);

362 // This invalidates the word cache.	401 // This invalidates the word cache.

363 search_term_cache_.clear();	402 search_term_cache_.clear();

364 // TODO(mrossetti): Record this transaction in the cache.

365 }	403 }

366	404

367 // Searching	405 // Searching

368	406

369 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(	407 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(

370 const String16Vector& terms) {	408 const String16Vector& terms) {

371 ScoredHistoryMatches scored_items;	409 ScoredHistoryMatches scored_items;

	410

	411 // Do nothing if we have indexed no words (probably because we've not been

	412 // initialized yet).

	413 if (private_data_->word_list_.empty())

	414 return scored_items;

	415

372 if (!terms.empty()) {	416 if (!terms.empty()) {

373 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep	417 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

374 // approach.	418 // approach.

375 ResetSearchTermCache();	419 ResetSearchTermCache();

376	420

377 // Lowercase the terms.	421 // Lowercase the terms.

378 // TODO(mrossetti): Another opportunity for a transform algorithm.	422 // TODO(mrossetti): Another opportunity for a transform algorithm.

379 String16Vector lower_terms;	423 String16Vector lower_terms;

380 for (String16Vector::const_iterator term_iter = terms.begin();	424 for (String16Vector::const_iterator term_iter = terms.begin();

381 term_iter != terms.end(); ++term_iter)	425 term_iter != terms.end(); ++term_iter)

(...skipping 37 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
419	463

420 return scored_items;	464 return scored_items;

421 }	465 }

422	466

423 void InMemoryURLIndex::ResetSearchTermCache() {	467 void InMemoryURLIndex::ResetSearchTermCache() {

424 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();	468 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();

425 iter != search_term_cache_.end(); ++iter)	469 iter != search_term_cache_.end(); ++iter)

426 iter->second.used_ = false;	470 iter->second.used_ = false;

427 }	471 }

428	472

429 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(	473 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(

430 const string16& uni_string) {	474 const string16& uni_string) {

431 // Break the terms down into individual terms (words), get the candidate	475 // Break the terms down into individual terms (words), get the candidate

432 // set for each term, and intersect each to get a final candidate list.	476 // set for each term, and intersect each to get a final candidate list.

433 // Note that a single 'term' from the user's perspective might be	477 // Note that a single 'term' from the user's perspective might be

434 // a string like "http://www.somewebsite.com" which, from our perspective,	478 // a string like "http://www.somewebsite.com" which, from our perspective,

435 // is four words: 'http', 'www', 'somewebsite', and 'com'.	479 // is four words: 'http', 'www', 'somewebsite', and 'com'.

436 HistoryIDSet history_id_set;	480 HistoryIDSet history_id_set;

437 String16Vector terms = WordVectorFromString16(uni_string, true);	481 String16Vector terms = String16VectorFromString16(uni_string, true);

438 // Sort the terms into the longest first as such are likely to narrow down	482 // Sort the terms into the longest first as such are likely to narrow down

439 // the results quicker. Also, single character terms are the most expensive	483 // the results quicker. Also, single character terms are the most expensive

440 // to process so save them for last.	484 // to process so save them for last.

441 std::sort(terms.begin(), terms.end(), LengthGreater);	485 std::sort(terms.begin(), terms.end(), LengthGreater);

442 for (String16Vector::iterator iter = terms.begin(); iter != terms.end();	486 for (String16Vector::iterator iter = terms.begin(); iter != terms.end();

443 ++iter) {	487 ++iter) {

444 string16 uni_word = *iter;	488 string16 uni_word = *iter;

445 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);	489 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);

446 if (term_history_set.empty()) {	490 if (term_history_set.empty()) {

447 history_id_set.clear();	491 history_id_set.clear();

448 break;	492 break;

449 }	493 }

450 if (iter == terms.begin()) {	494 if (iter == terms.begin()) {

451 history_id_set.swap(term_history_set);	495 history_id_set.swap(term_history_set);

452 } else {	496 } else {

453 HistoryIDSet new_history_id_set;	497 HistoryIDSet new_history_id_set;

454 std::set_intersection(history_id_set.begin(), history_id_set.end(),	498 std::set_intersection(history_id_set.begin(), history_id_set.end(),

455 term_history_set.begin(), term_history_set.end(),	499 term_history_set.begin(), term_history_set.end(),

456 std::inserter(new_history_id_set,	500 std::inserter(new_history_id_set,

457 new_history_id_set.begin()));	501 new_history_id_set.begin()));

458 history_id_set.swap(new_history_id_set);	502 history_id_set.swap(new_history_id_set);

459 }	503 }

460 }	504 }

461 return history_id_set;	505 return history_id_set;

462 }	506 }

463	507

464 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(	508 HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(

465 const string16& term) {	509 const string16& term) {

466 if (term.empty())	510 if (term.empty())

467 return HistoryIDSet();	511 return HistoryIDSet();

468	512

469 // TODO(mrossetti): Consider optimizing for very common terms such as	513 // TODO(mrossetti): Consider optimizing for very common terms such as

470 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently	514 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently

471 // occuring words in the user's searches.	515 // occuring words in the user's searches.

472	516

473 size_t term_length = term.length();	517 size_t term_length = term.length();

474 InMemoryURLIndex::WordIDSet word_id_set;	518 WordIDSet word_id_set;

475 if (term_length > 1) {	519 if (term_length > 1) {

476 // See if this term or a prefix thereof is present in the cache.	520 // See if this term or a prefix thereof is present in the cache.

477 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());	521 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());

478 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();	522 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

479 cache_iter != search_term_cache_.end(); ++cache_iter) {	523 cache_iter != search_term_cache_.end(); ++cache_iter) {

480 if (StartsWith(term, cache_iter->first, false) &&	524 if (StartsWith(term, cache_iter->first, false) &&

481 (best_prefix == search_term_cache_.end() \|\|	525 (best_prefix == search_term_cache_.end() \|\|

482 cache_iter->first.length() > best_prefix->first.length()))	526 cache_iter->first.length() > best_prefix->first.length()))

483 best_prefix = cache_iter;	527 best_prefix = cache_iter;

484 }	528 }

(...skipping 25 matching lines...) Expand all Loading...
510	554

511 // Filter for each remaining, unique character in the term.	555 // Filter for each remaining, unique character in the term.

512 Char16Set leftover_chars = Char16SetFromString16(leftovers);	556 Char16Set leftover_chars = Char16SetFromString16(leftovers);

513 Char16Set unique_chars;	557 Char16Set unique_chars;

514 std::set_difference(leftover_chars.begin(), leftover_chars.end(),	558 std::set_difference(leftover_chars.begin(), leftover_chars.end(),

515 prefix_chars.begin(), prefix_chars.end(),	559 prefix_chars.begin(), prefix_chars.end(),

516 std::inserter(unique_chars, unique_chars.begin()));	560 std::inserter(unique_chars, unique_chars.begin()));

517	561

518 // Reduce the word set with any leftover, unprocessed characters.	562 // Reduce the word set with any leftover, unprocessed characters.

519 if (!unique_chars.empty()) {	563 if (!unique_chars.empty()) {

520 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));	564 WordIDSet leftover_set(

	565 private_data_->WordIDSetForTermChars(unique_chars));

521 // We might come up empty on the leftovers.	566 // We might come up empty on the leftovers.

522 if (leftover_set.empty()) {	567 if (leftover_set.empty()) {

523 search_term_cache_[term] = SearchTermCacheItem();	568 search_term_cache_[term] = SearchTermCacheItem();

524 return HistoryIDSet();	569 return HistoryIDSet();

525 }	570 }

526 // Or there may not have been a prefix from which to start.	571 // Or there may not have been a prefix from which to start.

527 if (prefix_chars.empty()) {	572 if (prefix_chars.empty()) {

528 word_id_set.swap(leftover_set);	573 word_id_set.swap(leftover_set);

529 } else {	574 } else {

530 WordIDSet new_word_id_set;	575 WordIDSet new_word_id_set;

531 std::set_intersection(word_id_set.begin(), word_id_set.end(),	576 std::set_intersection(word_id_set.begin(), word_id_set.end(),

532 leftover_set.begin(), leftover_set.end(),	577 leftover_set.begin(), leftover_set.end(),

533 std::inserter(new_word_id_set,	578 std::inserter(new_word_id_set,

534 new_word_id_set.begin()));	579 new_word_id_set.begin()));

535 word_id_set.swap(new_word_id_set);	580 word_id_set.swap(new_word_id_set);

536 }	581 }

537 }	582 }

538	583

539 // We must filter the word list because the resulting word set surely	584 // We must filter the word list because the resulting word set surely

540 // contains words which do not have the search term as a proper subset.	585 // contains words which do not have the search term as a proper subset.

541 for (WordIDSet::iterator word_set_iter = word_id_set.begin();	586 for (WordIDSet::iterator word_set_iter = word_id_set.begin();

542 word_set_iter != word_id_set.end(); ) {	587 word_set_iter != word_id_set.end(); ) {

543 if (word_list_[*word_set_iter].find(term) == string16::npos)	588 if (private_data_->word_list_[*word_set_iter].find(term) ==

	589 string16::npos)

544 word_id_set.erase(word_set_iter++);	590 word_id_set.erase(word_set_iter++);

545 else	591 else

546 ++word_set_iter;	592 ++word_set_iter;

547 }	593 }

548 } else {	594 } else {

549 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));	595 word_id_set =

	596 private_data_->WordIDSetForTermChars(Char16SetFromString16(term));

550 }	597 }

551	598

552 // If any words resulted then we can compose a set of history IDs by unioning	599 // If any words resulted then we can compose a set of history IDs by unioning

553 // the sets from each word.	600 // the sets from each word.

554 HistoryIDSet history_id_set;	601 HistoryIDSet history_id_set;

555 if (!word_id_set.empty()) {	602 if (!word_id_set.empty()) {

556 for (WordIDSet::iterator word_id_iter = word_id_set.begin();	603 for (WordIDSet::iterator word_id_iter = word_id_set.begin();

557 word_id_iter != word_id_set.end(); ++word_id_iter) {	604 word_id_iter != word_id_set.end(); ++word_id_iter) {

558 WordID word_id = *word_id_iter;	605 WordID word_id = *word_id_iter;

559 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);	606 WordIDHistoryMap::iterator word_iter =

560 if (word_iter != word_id_history_map_.end()) {	607 private_data_->word_id_history_map_.find(word_id);

	608 if (word_iter != private_data_->word_id_history_map_.end()) {

561 HistoryIDSet& word_history_id_set(word_iter->second);	609 HistoryIDSet& word_history_id_set(word_iter->second);

562 history_id_set.insert(word_history_id_set.begin(),	610 history_id_set.insert(word_history_id_set.begin(),

563 word_history_id_set.end());	611 word_history_id_set.end());

564 }	612 }

565 }	613 }

566 }	614 }

567	615

568 // Record a new cache entry for this word if the term is longer than	616 // Record a new cache entry for this word if the term is longer than

569 // a single character.	617 // a single character.

570 if (term_length > 1)	618 if (term_length > 1)

571 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);	619 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

572	620

573 return history_id_set;	621 return history_id_set;

574 }	622 }

575	623

576 // Utility Functions	624 // Utility Functions

577	625

578 // static

579 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(

580 const string16& uni_string) {

581 const size_t kMaxWordLength = 64;

582 String16Vector words = WordVectorFromString16(uni_string, false);

583 String16Set word_set;

584 for (String16Vector::const_iterator iter = words.begin(); iter != words.end();

585 ++iter)

586 word_set.insert(base::i18n::ToLower(*iter).substr(0, kMaxWordLength));

587 return word_set;

588 }

589

590 // static

591 InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16(

592 const string16& uni_string,

593 bool break_on_space) {

594 base::i18n::BreakIterator iter(

595 uni_string,

596 break_on_space ? base::i18n::BreakIterator::BREAK_SPACE

597 : base::i18n::BreakIterator::BREAK_WORD);

598 String16Vector words;

599 if (!iter.Init())

600 return words;

601 while (iter.Advance()) {

602 if (break_on_space \|\| iter.IsWord()) {

603 string16 word = iter.GetString();

604 if (break_on_space)

605 TrimWhitespace(word, TRIM_ALL, &word);

606 if (!word.empty())

607 words.push_back(word);

608 }

609 }

610 return words;

611 }

612

613 // static

614 InMemoryURLIndex::Char16Set InMemoryURLIndex::Char16SetFromString16(

615 const string16& term) {

616 Char16Set characters;

617 for (string16::const_iterator iter = term.begin(); iter != term.end();

618 ++iter)

619 characters.insert(*iter);

620 return characters;

621 }

622

623 void InMemoryURLIndex::AddWordToIndex(const string16& term,	626 void InMemoryURLIndex::AddWordToIndex(const string16& term,

624 HistoryID history_id) {	627 HistoryID history_id) {

625 WordMap::iterator word_pos = word_map_.find(term);	628 WordMap::iterator word_pos = private_data_->word_map_.find(term);

626 if (word_pos != word_map_.end())	629 if (word_pos != private_data_->word_map_.end())

627 UpdateWordHistory(word_pos->second, history_id);	630 UpdateWordHistory(word_pos->second, history_id);

628 else	631 else

629 AddWordHistory(term, history_id);	632 AddWordHistory(term, history_id);

630 }	633 }

631	634

632 void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) {	635 void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) {

633 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);	636 WordIDHistoryMap::iterator history_pos =

634 DCHECK(history_pos != word_id_history_map_.end());	637 private_data_->word_id_history_map_.find(word_id);

635 HistoryIDSet& history_id_set(history_pos->second);	638 DCHECK(history_pos != private_data_->word_id_history_map_.end());

636 history_id_set.insert(history_id);	639 HistoryIDSet& history_id_set(history_pos->second);

	640 history_id_set.insert(history_id);

	641 private_data_->AddToHistoryIDWordMap(history_id, word_id);

637 }	642 }

638	643

639 // Add a new word to the word list and the word map, and then create a	644 // Add a new word to the word list and the word map, and then create a

640 // new entry in the word/history map.	645 // new entry in the word/history map.

641 void InMemoryURLIndex::AddWordHistory(const string16& term,	646 void InMemoryURLIndex::AddWordHistory(const string16& term,

642 HistoryID history_id) {	647 HistoryID history_id) {

643 word_list_.push_back(term);	648 URLIndexPrivateData& private_data(*(private_data_.get()));

644 WordID word_id = word_list_.size() - 1;	649 WordID word_id = private_data.word_list_.size();

645 word_map_[term] = word_id;	650 if (private_data.available_words_.empty()) {

	651 private_data.word_list_.push_back(term);

	652 } else {

	653 word_id = *(private_data.available_words_.begin());

	654 private_data.word_list_[word_id] = term;

	655 private_data.available_words_.erase(word_id);

	656 }

	657 private_data.word_map_[term] = word_id;

	658

646 HistoryIDSet history_id_set;	659 HistoryIDSet history_id_set;

647 history_id_set.insert(history_id);	660 history_id_set.insert(history_id);

648 word_id_history_map_[word_id] = history_id_set;	661 private_data.word_id_history_map_[word_id] = history_id_set;

	662 private_data.AddToHistoryIDWordMap(history_id, word_id);

	663

649 // For each character in the newly added word (i.e. a word that is not	664 // For each character in the newly added word (i.e. a word that is not

650 // already in the word index), add the word to the character index.	665 // already in the word index), add the word to the character index.

651 Char16Set characters = Char16SetFromString16(term);	666 Char16Set characters = Char16SetFromString16(term);

652 for (Char16Set::iterator uni_char_iter = characters.begin();	667 for (Char16Set::iterator uni_char_iter = characters.begin();

653 uni_char_iter != characters.end(); ++uni_char_iter) {	668 uni_char_iter != characters.end(); ++uni_char_iter) {

654 char16 uni_char = *uni_char_iter;	669 char16 uni_char = *uni_char_iter;

655 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);	670 CharWordIDMap::iterator char_iter =

656 if (char_iter != char_word_map_.end()) {	671 private_data.char_word_map_.find(uni_char);

	672 if (char_iter != private_data.char_word_map_.end()) {

657 // Update existing entry in the char/word index.	673 // Update existing entry in the char/word index.

658 WordIDSet& word_id_set(char_iter->second);	674 WordIDSet& word_id_set(char_iter->second);

659 word_id_set.insert(word_id);	675 word_id_set.insert(word_id);

660 } else {	676 } else {

661 // Create a new entry in the char/word index.	677 // Create a new entry in the char/word index.

662 WordIDSet word_id_set;	678 WordIDSet word_id_set;

663 word_id_set.insert(word_id);	679 word_id_set.insert(word_id);

664 char_word_map_[uni_char] = word_id_set;	680 private_data.char_word_map_[uni_char] = word_id_set;

665 }	681 }

666 }	682 }

667 }	683 }

668	684

669 InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars(

670 const Char16Set& term_chars) {

671 WordIDSet word_id_set;

672 for (Char16Set::const_iterator c_iter = term_chars.begin();

673 c_iter != term_chars.end(); ++c_iter) {

674 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);

675 if (char_iter == char_word_map_.end()) {

676 // A character was not found so there are no matching results: bail.

677 word_id_set.clear();

678 break;

679 }

680 WordIDSet& char_word_id_set(char_iter->second);

681 // It is possible for there to no longer be any words associated with

682 // a particular character. Give up in that case.

683 if (char_word_id_set.empty()) {

684 word_id_set.clear();

685 break;

686 }

687

688 if (c_iter == term_chars.begin()) {

689 // First character results becomes base set of results.

690 word_id_set = char_word_id_set;

691 } else {

692 // Subsequent character results get intersected in.

693 WordIDSet new_word_id_set;

694 std::set_intersection(word_id_set.begin(), word_id_set.end(),

695 char_word_id_set.begin(), char_word_id_set.end(),

696 std::inserter(new_word_id_set,

697 new_word_id_set.begin()));

698 word_id_set.swap(new_word_id_set);

699 }

700 }

701 return word_id_set;

702 }

703

704 // static	685 // static

705 TermMatches InMemoryURLIndex::MatchTermInString(const string16& term,	686 // TODO(mrossetti): This can be made a ctor for ScoredHistoryMatch.

706 const string16& string,

707 int term_num) {

708 const size_t kMaxCompareLength = 2048;

709 const string16& short_string = (string.length() > kMaxCompareLength) ?

710 string.substr(0, kMaxCompareLength) : string;

711 TermMatches matches;

712 for (size_t location = short_string.find(term); location != string16::npos;

713 location = short_string.find(term, location + 1)) {

714 matches.push_back(TermMatch(term_num, location, term.length()));

715 }

716 return matches;

717 }

718

719 // static

720 TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) {

721 if (matches.empty())

722 return matches;

723 TermMatches sorted_matches = matches;

724 std::sort(sorted_matches.begin(), sorted_matches.end(), MatchOffsetLess);

725 TermMatches clean_matches;

726 TermMatch last_match = sorted_matches[0];

727 clean_matches.push_back(last_match);

728 for (TermMatches::const_iterator iter = sorted_matches.begin() + 1;

729 iter != sorted_matches.end(); ++iter) {

730 if (iter->offset >= last_match.offset + last_match.length) {

731 last_match = *iter;

732 clean_matches.push_back(last_match);

733 }

734 }

735 return clean_matches;

736 }

737

738 // static

739 std::vector<size_t> InMemoryURLIndex::OffsetsFromTermMatches(

740 const TermMatches& matches) {

741 std::vector<size_t> offsets;

742 for (TermMatches::const_iterator i = matches.begin(); i != matches.end(); ++i)

743 offsets.push_back(i->offset);

744 return offsets;

745 }

746

747 // static

748 TermMatches InMemoryURLIndex::ReplaceOffsetsInTermMatches(

749 const TermMatches& matches,

750 const std::vector<size_t>& offsets) {

751 DCHECK_EQ(matches.size(), offsets.size());

752 TermMatches new_matches;

753 std::vector<size_t>::const_iterator offset_iter = offsets.begin();

754 for (TermMatches::const_iterator term_iter = matches.begin();

755 term_iter != matches.end(); ++term_iter, ++offset_iter) {

756 if (*offset_iter != string16::npos) {

757 TermMatch new_match(*term_iter);

758 new_match.offset = *offset_iter;

759 new_matches.push_back(new_match);

760 }

761 }

762 return new_matches;

763 }

764

765 // static

766 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(	687 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(

767 const URLRow& row,	688 const URLRow& row,

768 const String16Vector& terms) {	689 const String16Vector& terms) {

769 ScoredHistoryMatch match(row);	690 ScoredHistoryMatch match(row);

770 GURL gurl = row.url();	691 GURL gurl = row.url();

771 if (!gurl.is_valid())	692 if (!gurl.is_valid())

772 return match;	693 return match;

773	694

774 // Figure out where each search term appears in the URL and/or page title	695 // Figure out where each search term appears in the URL and/or page title

775 // so that we can score as well as provide autocomplete highlighting.	696 // so that we can score as well as provide autocomplete highlighting.

776 string16 url = base::i18n::ToLower(UTF8ToUTF16(gurl.spec()));	697 string16 url = base::i18n::ToLower(UTF8ToUTF16(gurl.spec()));

777 string16 title = base::i18n::ToLower(row.title());	698 string16 title = base::i18n::ToLower(row.title());

778 int term_num = 0;	699 int term_num = 0;

779 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();	700 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();

780 ++iter, ++term_num) {	701 ++iter, ++term_num) {

781 string16 term = *iter;	702 string16 term = *iter;

782 TermMatches url_term_matches = MatchTermInString(term, url, term_num);	703 TermMatches url_term_matches = MatchTermInString(term, url, term_num);

783 TermMatches title_term_matches = MatchTermInString(term, title, term_num);	704 TermMatches title_term_matches = MatchTermInString(term, title, term_num);

784 if (url_term_matches.empty() && title_term_matches.empty())	705 if (url_term_matches.empty() && title_term_matches.empty())

785 return match; // A term was not found in either URL or title - reject.	706 return match; // A term was not found in either URL or title - reject.

786 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),	707 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),

787 url_term_matches.end());	708 url_term_matches.end());

788 match.title_matches.insert(match.title_matches.end(),	709 match.title_matches.insert(match.title_matches.end(),

789 title_term_matches.begin(),	710 title_term_matches.begin(),

790 title_term_matches.end());	711 title_term_matches.end());

791 }	712 }

792	713

793 // Sort matches by offset and eliminate any which overlap.	714 // Sort matches by offset and eliminate any which overlap.

794 match.url_matches = SortAndDeoverlap(match.url_matches);	715 match.url_matches = SortAndDeoverlapMatches(match.url_matches);

795 match.title_matches = SortAndDeoverlap(match.title_matches);	716 match.title_matches = SortAndDeoverlapMatches(match.title_matches);

796	717

797 // We should not (currently) inline autocomplete a result unless both of the	718 // We should not (currently) inline autocomplete a result unless both of the

798 // following are true:	719 // following are true:

799 // * There is exactly one substring matches in the URL, and	720 // * There is exactly one substring matches in the URL, and

800 // * The one URL match starts at the beginning of the URL.	721 // * The one URL match starts at the beginning of the URL.

801 match.can_inline =	722 match.can_inline =

802 match.url_matches.size() == 1 && match.url_matches[0].offset == 0;	723 match.url_matches.size() == 1 && match.url_matches[0].offset == 0;

803	724

804 // Get partial scores based on term matching. Note that the score for	725 // Get partial scores based on term matching. Note that the score for

805 // each of the URL and title are adjusted by the fraction of the	726 // each of the URL and title are adjusted by the fraction of the

(...skipping 93 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
899	820

900 // Scale the sum of the three components above into a single score component	821 // Scale the sum of the three components above into a single score component

901 // on the same scale as that used in ScoredMatchForURL().	822 // on the same scale as that used in ScoredMatchForURL().

902 return ScoreForValue(raw_score, kTermScoreLevel);	823 return ScoreForValue(raw_score, kTermScoreLevel);

903 }	824 }

904	825

905 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(	826 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(

906 const InMemoryURLIndex& index,	827 const InMemoryURLIndex& index,

907 const String16Vector& lower_terms)	828 const String16Vector& lower_terms)

908 : index_(index),	829 : index_(index),

909 lower_terms_(lower_terms) {	830 lower_terms_(lower_terms) {}

910 }

911	831

912 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}	832 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}

913	833

914 void InMemoryURLIndex::AddHistoryMatch::operator()(	834 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) {

915 const InMemoryURLIndex::HistoryID history_id) {

916 HistoryInfoMap::const_iterator hist_pos =	835 HistoryInfoMap::const_iterator hist_pos =

917 index_.history_info_map_.find(history_id);	836 index_.private_data_->history_info_map_.find(history_id);

918 // Note that a history_id may be present in the word_id_history_map_ yet not	837 // Note that a history_id may be present in the word_id_history_map_ yet not

919 // be found in the history_info_map_. This occurs when an item has been	838 // be found in the history_info_map_. This occurs when an item has been

920 // deleted by the user or the item no longer qualifies as a quick result.	839 // deleted by the user or the item no longer qualifies as a quick result.

921 if (hist_pos != index_.history_info_map_.end()) {	840 if (hist_pos != index_.private_data_->history_info_map_.end()) {

922 const URLRow& hist_item = hist_pos->second;	841 const URLRow& hist_item = hist_pos->second;

923 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));	842 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));

924 if (match.raw_score > 0)	843 if (match.raw_score > 0)

925 scored_matches_.push_back(match);	844 scored_matches_.push_back(match);

926 }	845 }

927 }	846 }

928	847

929 bool InMemoryURLIndex::GetCacheFilePath(FilePath* file_path) {	848 bool InMemoryURLIndex::GetCacheFilePath(FilePath* file_path) {

930 if (history_dir_.empty())	849 if (history_dir_.empty())

931 return false;	850 return false;

932 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));	851 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));

933 return true;	852 return true;

934 }	853 }

935	854

936 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {	855 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {

937 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();	856 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();

938 }	857 }

939	858

940 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {	859 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {

941 DCHECK(cache);	860 DCHECK(cache);

942 cache->set_timestamp(base::Time::Now().ToInternalValue());	861 cache->set_timestamp(base::Time::Now().ToInternalValue());

943 cache->set_history_item_count(history_item_count_);	862 // history_item_count_ is no longer used but rather than change the protobuf

	863 // definition use a placeholder. This will go away with the switch to SQLite.

	864 cache->set_history_item_count(0);

944 SaveWordList(cache);	865 SaveWordList(cache);

945 SaveWordMap(cache);	866 SaveWordMap(cache);

946 SaveCharWordMap(cache);	867 SaveCharWordMap(cache);

947 SaveWordIDHistoryMap(cache);	868 SaveWordIDHistoryMap(cache);

948 SaveHistoryInfoMap(cache);	869 SaveHistoryInfoMap(cache);

949 }	870 }

950	871

951 bool InMemoryURLIndex::RestorePrivateData(	872 bool InMemoryURLIndex::RestorePrivateData(

952 const InMemoryURLIndexCacheItem& cache) {	873 const InMemoryURLIndexCacheItem& cache) {

953 last_saved_ = base::Time::FromInternalValue(cache.timestamp());	874 last_saved_ = base::Time::FromInternalValue(cache.timestamp());

954 history_item_count_ = cache.history_item_count();	875 return RestoreWordList(cache) && RestoreWordMap(cache) &&

955 return (history_item_count_ == 0) \|\| (RestoreWordList(cache) &&	876 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&

956 RestoreWordMap(cache) && RestoreCharWordMap(cache) &&	877 RestoreHistoryInfoMap(cache);

957 RestoreWordIDHistoryMap(cache) && RestoreHistoryInfoMap(cache));

958 }	878 }

959	879

960

961 void InMemoryURLIndex::SaveWordList(InMemoryURLIndexCacheItem* cache) const {	880 void InMemoryURLIndex::SaveWordList(InMemoryURLIndexCacheItem* cache) const {

962 if (word_list_.empty())	881 if (private_data_->word_list_.empty())

963 return;	882 return;

964 WordListItem* list_item = cache->mutable_word_list();	883 WordListItem* list_item = cache->mutable_word_list();

965 list_item->set_word_count(word_list_.size());	884 list_item->set_word_count(private_data_->word_list_.size());

966 for (String16Vector::const_iterator iter = word_list_.begin();	885 for (String16Vector::const_iterator iter = private_data_->word_list_.begin();

967 iter != word_list_.end(); ++iter)	886 iter != private_data_->word_list_.end(); ++iter)

968 list_item->add_word(UTF16ToUTF8(*iter));	887 list_item->add_word(UTF16ToUTF8(*iter));

969 }	888 }

970	889

971 bool InMemoryURLIndex::RestoreWordList(const InMemoryURLIndexCacheItem& cache) {	890 bool InMemoryURLIndex::RestoreWordList(const InMemoryURLIndexCacheItem& cache) {

972 if (!cache.has_word_list())	891 if (!cache.has_word_list())

973 return false;	892 return false;

974 const WordListItem& list_item(cache.word_list());	893 const WordListItem& list_item(cache.word_list());

975 uint32 expected_item_count = list_item.word_count();	894 uint32 expected_item_count = list_item.word_count();

976 uint32 actual_item_count = list_item.word_size();	895 uint32 actual_item_count = list_item.word_size();

977 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)	896 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

978 return false;	897 return false;

979 const RepeatedPtrField<std::string>& words(list_item.word());	898 const RepeatedPtrField<std::string>& words(list_item.word());

980 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();	899 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();

981 iter != words.end(); ++iter)	900 iter != words.end(); ++iter)

982 word_list_.push_back(UTF8ToUTF16(*iter));	901 private_data_->word_list_.push_back(UTF8ToUTF16(*iter));

983 return true;	902 return true;

984 }	903 }

985	904

986 void InMemoryURLIndex::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {	905 void InMemoryURLIndex::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {

987 if (word_map_.empty())	906 if (private_data_->word_map_.empty())

988 return;	907 return;

989 WordMapItem* map_item = cache->mutable_word_map();	908 WordMapItem* map_item = cache->mutable_word_map();

990 map_item->set_item_count(word_map_.size());	909 map_item->set_item_count(private_data_->word_map_.size());

991 for (WordMap::const_iterator iter = word_map_.begin();	910 for (WordMap::const_iterator iter = private_data_->word_map_.begin();

992 iter != word_map_.end(); ++iter) {	911 iter != private_data_->word_map_.end(); ++iter) {

993 WordMapEntry* map_entry = map_item->add_word_map_entry();	912 WordMapEntry* map_entry = map_item->add_word_map_entry();

994 map_entry->set_word(UTF16ToUTF8(iter->first));	913 map_entry->set_word(UTF16ToUTF8(iter->first));

995 map_entry->set_word_id(iter->second);	914 map_entry->set_word_id(iter->second);

996 }	915 }

997 }	916 }

998	917

999 bool InMemoryURLIndex::RestoreWordMap(const InMemoryURLIndexCacheItem& cache) {	918 bool InMemoryURLIndex::RestoreWordMap(const InMemoryURLIndexCacheItem& cache) {

1000 if (!cache.has_word_map())	919 if (!cache.has_word_map())

1001 return false;	920 return false;

1002 const WordMapItem& list_item(cache.word_map());	921 const WordMapItem& list_item(cache.word_map());

1003 uint32 expected_item_count = list_item.item_count();	922 uint32 expected_item_count = list_item.item_count();

1004 uint32 actual_item_count = list_item.word_map_entry_size();	923 uint32 actual_item_count = list_item.word_map_entry_size();

1005 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)	924 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1006 return false;	925 return false;

1007 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());	926 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());

1008 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();	927 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();

1009 iter != entries.end(); ++iter)	928 iter != entries.end(); ++iter)

1010 word_map_[UTF8ToUTF16(iter->word())] = iter->word_id();	929 private_data_->word_map_[UTF8ToUTF16(iter->word())] = iter->word_id();

1011 return true;	930 return true;

1012 }	931 }

1013	932

1014 void InMemoryURLIndex::SaveCharWordMap(InMemoryURLIndexCacheItem* cache) const {	933 void InMemoryURLIndex::SaveCharWordMap(InMemoryURLIndexCacheItem* cache) const {

1015 if (char_word_map_.empty())	934 if (private_data_->char_word_map_.empty())

1016 return;	935 return;

1017 CharWordMapItem* map_item = cache->mutable_char_word_map();	936 CharWordMapItem* map_item = cache->mutable_char_word_map();

1018 map_item->set_item_count(char_word_map_.size());	937 map_item->set_item_count(private_data_->char_word_map_.size());

1019 for (CharWordIDMap::const_iterator iter = char_word_map_.begin();	938 for (CharWordIDMap::const_iterator iter =

1020 iter != char_word_map_.end(); ++iter) {	939 private_data_->char_word_map_.begin();

	940 iter != private_data_->char_word_map_.end(); ++iter) {

1021 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();	941 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();

1022 map_entry->set_char_16(iter->first);	942 map_entry->set_char_16(iter->first);

1023 const WordIDSet& word_id_set(iter->second);	943 const WordIDSet& word_id_set(iter->second);

1024 map_entry->set_item_count(word_id_set.size());	944 map_entry->set_item_count(word_id_set.size());

1025 for (WordIDSet::const_iterator set_iter = word_id_set.begin();	945 for (WordIDSet::const_iterator set_iter = word_id_set.begin();

1026 set_iter != word_id_set.end(); ++set_iter)	946 set_iter != word_id_set.end(); ++set_iter)

1027 map_entry->add_word_id(*set_iter);	947 map_entry->add_word_id(*set_iter);

1028 }	948 }

1029 }	949 }

1030	950

(...skipping 13 matching lines...) Expand all Loading...
1044 expected_item_count = iter->item_count();	964 expected_item_count = iter->item_count();

1045 actual_item_count = iter->word_id_size();	965 actual_item_count = iter->word_id_size();

1046 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)	966 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1047 return false;	967 return false;

1048 char16 uni_char = static_cast<char16>(iter->char_16());	968 char16 uni_char = static_cast<char16>(iter->char_16());

1049 WordIDSet word_id_set;	969 WordIDSet word_id_set;

1050 const RepeatedField<int32>& word_ids(iter->word_id());	970 const RepeatedField<int32>& word_ids(iter->word_id());

1051 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();	971 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();

1052 jiter != word_ids.end(); ++jiter)	972 jiter != word_ids.end(); ++jiter)

1053 word_id_set.insert(*jiter);	973 word_id_set.insert(*jiter);

1054 char_word_map_[uni_char] = word_id_set;	974 private_data_->char_word_map_[uni_char] = word_id_set;

1055 }	975 }

1056 return true;	976 return true;

1057 }	977 }

1058	978

1059 void InMemoryURLIndex::SaveWordIDHistoryMap(InMemoryURLIndexCacheItem* cache)	979 void InMemoryURLIndex::SaveWordIDHistoryMap(InMemoryURLIndexCacheItem* cache)

1060 const {	980 const {

1061 if (word_id_history_map_.empty())	981 if (private_data_->word_id_history_map_.empty())

1062 return;	982 return;

1063 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();	983 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();

1064 map_item->set_item_count(word_id_history_map_.size());	984 map_item->set_item_count(private_data_->word_id_history_map_.size());

1065 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();	985 for (WordIDHistoryMap::const_iterator iter =

1066 iter != word_id_history_map_.end(); ++iter) {	986 private_data_->word_id_history_map_.begin();

	987 iter != private_data_->word_id_history_map_.end(); ++iter) {

1067 WordIDHistoryMapEntry* map_entry =	988 WordIDHistoryMapEntry* map_entry =

1068 map_item->add_word_id_history_map_entry();	989 map_item->add_word_id_history_map_entry();

1069 map_entry->set_word_id(iter->first);	990 map_entry->set_word_id(iter->first);

1070 const HistoryIDSet& history_id_set(iter->second);	991 const HistoryIDSet& history_id_set(iter->second);

1071 map_entry->set_item_count(history_id_set.size());	992 map_entry->set_item_count(history_id_set.size());

1072 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();	993 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();

1073 set_iter != history_id_set.end(); ++set_iter)	994 set_iter != history_id_set.end(); ++set_iter)

1074 map_entry->add_history_id(*set_iter);	995 map_entry->add_history_id(*set_iter);

1075 }	996 }

1076 }	997 }

(...skipping 12 matching lines...) Expand all Loading...
1089 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =	1010 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =

1090 entries.begin(); iter != entries.end(); ++iter) {	1011 entries.begin(); iter != entries.end(); ++iter) {

1091 expected_item_count = iter->item_count();	1012 expected_item_count = iter->item_count();

1092 actual_item_count = iter->history_id_size();	1013 actual_item_count = iter->history_id_size();

1093 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)	1014 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1094 return false;	1015 return false;

1095 WordID word_id = iter->word_id();	1016 WordID word_id = iter->word_id();

1096 HistoryIDSet history_id_set;	1017 HistoryIDSet history_id_set;

1097 const RepeatedField<int64>& history_ids(iter->history_id());	1018 const RepeatedField<int64>& history_ids(iter->history_id());

1098 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();	1019 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();

1099 jiter != history_ids.end(); ++jiter)	1020 jiter != history_ids.end(); ++jiter) {

1100 history_id_set.insert(*jiter);	1021 history_id_set.insert(*jiter);

1101 word_id_history_map_[word_id] = history_id_set;	1022 private_data_->AddToHistoryIDWordMap(*jiter, word_id);

	1023 }

	1024 private_data_->word_id_history_map_[word_id] = history_id_set;

1102 }	1025 }

1103 return true;	1026 return true;

1104 }	1027 }

1105	1028

1106 void InMemoryURLIndex::SaveHistoryInfoMap(	1029 void InMemoryURLIndex::SaveHistoryInfoMap(

1107 InMemoryURLIndexCacheItem* cache) const {	1030 InMemoryURLIndexCacheItem* cache) const {

1108 if (history_info_map_.empty())	1031 if (private_data_->history_info_map_.empty())

1109 return;	1032 return;

1110 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();	1033 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();

1111 map_item->set_item_count(history_info_map_.size());	1034 map_item->set_item_count(private_data_->history_info_map_.size());

1112 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();	1035 for (HistoryInfoMap::const_iterator iter =

1113 iter != history_info_map_.end(); ++iter) {	1036 private_data_->history_info_map_.begin();

	1037 iter != private_data_->history_info_map_.end(); ++iter) {

1114 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();	1038 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();

1115 map_entry->set_history_id(iter->first);	1039 map_entry->set_history_id(iter->first);

1116 const URLRow& url_row(iter->second);	1040 const URLRow& url_row(iter->second);

1117 // Note: We only save information that contributes to the index so there	1041 // Note: We only save information that contributes to the index so there

1118 // is no need to save search_term_cache_ (not persistent),	1042 // is no need to save search_term_cache_ (not persistent),

1119 // languages_, etc.	1043 // languages_, etc.

1120 map_entry->set_visit_count(url_row.visit_count());	1044 map_entry->set_visit_count(url_row.visit_count());

1121 map_entry->set_typed_count(url_row.typed_count());	1045 map_entry->set_typed_count(url_row.typed_count());

1122 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());	1046 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());

1123 map_entry->set_url(url_row.url().spec());	1047 map_entry->set_url(url_row.url().spec());

(...skipping 17 matching lines...) Expand all Loading...
1141 HistoryID history_id = iter->history_id();	1065 HistoryID history_id = iter->history_id();

1142 GURL url(iter->url());	1066 GURL url(iter->url());

1143 URLRow url_row(url, history_id);	1067 URLRow url_row(url, history_id);

1144 url_row.set_visit_count(iter->visit_count());	1068 url_row.set_visit_count(iter->visit_count());

1145 url_row.set_typed_count(iter->typed_count());	1069 url_row.set_typed_count(iter->typed_count());

1146 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));	1070 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));

1147 if (iter->has_title()) {	1071 if (iter->has_title()) {

1148 string16 title(UTF8ToUTF16(iter->title()));	1072 string16 title(UTF8ToUTF16(iter->title()));

1149 url_row.set_title(title);	1073 url_row.set_title(title);

1150 }	1074 }

1151 history_info_map_[history_id] = url_row;	1075 private_data_->history_info_map_[history_id] = url_row;

1152 }	1076 }

1153 return true;	1077 return true;

1154 }	1078 }

1155	1079

1156 } // namespace history	1080 } // namespace history

OLD	NEW