Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(17)

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 8359019: Create Private Data for InMemoryURLIndex (in Preparation for SQLite Cache) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: Created 9 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/history/in_memory_url_index.h" 5 #include "chrome/browser/history/in_memory_url_index.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <functional> 8 #include <functional>
9 #include <iterator> 9 #include <iterator>
10 #include <limits> 10 #include <limits>
11 #include <numeric> 11 #include <numeric>
12 12
13 #include "base/file_util.h" 13 #include "base/file_util.h"
14 #include "base/i18n/break_iterator.h"
15 #include "base/i18n/case_conversion.h" 14 #include "base/i18n/case_conversion.h"
16 #include "base/metrics/histogram.h" 15 #include "base/metrics/histogram.h"
17 #include "base/string_util.h" 16 #include "base/string_util.h"
18 #include "base/threading/thread_restrictions.h" 17 #include "base/threading/thread_restrictions.h"
19 #include "base/time.h" 18 #include "base/time.h"
20 #include "base/utf_string_conversions.h" 19 #include "base/utf_string_conversions.h"
21 #include "chrome/browser/autocomplete/autocomplete.h" 20 #include "chrome/browser/autocomplete/autocomplete.h"
22 #include "chrome/browser/autocomplete/history_provider_util.h" 21 #include "chrome/browser/autocomplete/history_provider_util.h"
23 #include "chrome/browser/history/url_database.h" 22 #include "chrome/browser/history/url_database.h"
24 #include "chrome/browser/profiles/profile.h" 23 #include "chrome/browser/profiles/profile.h"
(...skipping 27 matching lines...) Expand all
52 HistoryInfoMapEntry; 51 HistoryInfoMapEntry;
53 52
54 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1; 53 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;
55 54
56 // Score ranges used to get a 'base' score for each of the scoring factors 55 // Score ranges used to get a 'base' score for each of the scoring factors
57 // (such as recency of last visit, times visited, times the URL was typed, 56 // (such as recency of last visit, times visited, times the URL was typed,
58 // and the quality of the string match). There is a matching value range for 57 // and the quality of the string match). There is a matching value range for
59 // each of these scores for each factor. 58 // each of these scores for each factor.
60 const int kScoreRank[] = { 1425, 1200, 900, 400 }; 59 const int kScoreRank[] = { 1425, 1200, 900, 400 };
61 60
62 ScoredHistoryMatch::ScoredHistoryMatch()
63 : raw_score(0),
64 can_inline(false) {}
65
66 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info)
67 : HistoryMatch(url_info, 0, false, false),
68 raw_score(0),
69 can_inline(false) {}
70
71 ScoredHistoryMatch::~ScoredHistoryMatch() {}
72
73 // Comparison function for sorting ScoredMatches by their scores.
74 bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1,
75 const ScoredHistoryMatch& m2) {
76 return m1.raw_score >= m2.raw_score;
77 }
78
79 InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem( 61 InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem(
80 const WordIDSet& word_id_set, 62 const WordIDSet& word_id_set,
81 const HistoryIDSet& history_id_set) 63 const HistoryIDSet& history_id_set)
82 : word_id_set_(word_id_set), 64 : word_id_set_(word_id_set),
83 history_id_set_(history_id_set), 65 history_id_set_(history_id_set),
84 used_(true) {} 66 used_(true) {}
85 67
86 InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem() 68 InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem()
87 : used_(true) {} 69 : used_(true) {}
88 70
89 InMemoryURLIndex::SearchTermCacheItem::~SearchTermCacheItem() {} 71 InMemoryURLIndex::SearchTermCacheItem::~SearchTermCacheItem() {}
90 72
91 // Comparison function for sorting TermMatches by their offsets.
92 bool MatchOffsetLess(const TermMatch& m1, const TermMatch& m2) {
93 return m1.offset < m2.offset;
94 }
95
96 // Comparison function for sorting search terms by descending length. 73 // Comparison function for sorting search terms by descending length.
97 bool LengthGreater(const string16& string_a, const string16& string_b) { 74 bool LengthGreater(const string16& string_a, const string16& string_b) {
98 return string_a.length() > string_b.length(); 75 return string_a.length() > string_b.length();
99 } 76 }
100 77
101 // std::accumulate helper function to add up TermMatches' lengths. 78 // std::accumulate helper function to add up TermMatches' lengths.
102 int AccumulateMatchLength(int total, const TermMatch& match) { 79 int AccumulateMatchLength(int total, const TermMatch& match) {
103 return total + match.length; 80 return total + match.length;
104 } 81 }
105 82
(...skipping 24 matching lines...) Expand all
130 if (i > 0) { 107 if (i > 0) {
131 score += (value - value_ranks[i]) * 108 score += (value - value_ranks[i]) *
132 (kScoreRank[i - 1] - kScoreRank[i]) / 109 (kScoreRank[i - 1] - kScoreRank[i]) /
133 (value_ranks[i - 1] - value_ranks[i]); 110 (value_ranks[i - 1] - value_ranks[i]);
134 } 111 }
135 return score; 112 return score;
136 } 113 }
137 114
138 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir) 115 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)
139 : history_dir_(history_dir), 116 : history_dir_(history_dir),
140 history_item_count_(0), 117 private_data_(new URLIndexPrivateData),
141 cached_at_shutdown_(false) { 118 cached_at_shutdown_(false) {
142 InMemoryURLIndex::InitializeSchemeWhitelist(&scheme_whitelist_); 119 InMemoryURLIndex::InitializeSchemeWhitelist(&scheme_whitelist_);
143 } 120 }
144 121
145 // Called only by unit tests. 122 // Called only by unit tests.
146 InMemoryURLIndex::InMemoryURLIndex() 123 InMemoryURLIndex::InMemoryURLIndex()
147 : history_item_count_(0), 124 : private_data_(new URLIndexPrivateData),
148 cached_at_shutdown_(false) { 125 cached_at_shutdown_(false) {
149 InMemoryURLIndex::InitializeSchemeWhitelist(&scheme_whitelist_); 126 InMemoryURLIndex::InitializeSchemeWhitelist(&scheme_whitelist_);
150 } 127 }
151 128
152 InMemoryURLIndex::~InMemoryURLIndex() { 129 InMemoryURLIndex::~InMemoryURLIndex() {
153 // If there was a history directory (which there won't be for some unit tests) 130 // If there was a history directory (which there won't be for some unit tests)
154 // then insure that the cache has already been saved. 131 // then insure that the cache has already been saved.
155 DCHECK(history_dir_.empty() || cached_at_shutdown_); 132 DCHECK(history_dir_.empty() || cached_at_shutdown_);
156 } 133 }
157 134
158 // static 135 // static
159 void InMemoryURLIndex::InitializeSchemeWhitelist( 136 void InMemoryURLIndex::InitializeSchemeWhitelist(
160 std::set<std::string>* whitelist) { 137 std::set<std::string>* whitelist) {
161 DCHECK(whitelist); 138 DCHECK(whitelist);
162 whitelist->insert(std::string(chrome::kAboutScheme)); 139 whitelist->insert(std::string(chrome::kAboutScheme));
163 whitelist->insert(std::string(chrome::kChromeUIScheme)); 140 whitelist->insert(std::string(chrome::kChromeUIScheme));
164 whitelist->insert(std::string(chrome::kFileScheme)); 141 whitelist->insert(std::string(chrome::kFileScheme));
165 whitelist->insert(std::string(chrome::kFtpScheme)); 142 whitelist->insert(std::string(chrome::kFtpScheme));
166 whitelist->insert(std::string(chrome::kHttpScheme)); 143 whitelist->insert(std::string(chrome::kHttpScheme));
167 whitelist->insert(std::string(chrome::kHttpsScheme)); 144 whitelist->insert(std::string(chrome::kHttpsScheme));
168 whitelist->insert(std::string(chrome::kMailToScheme)); 145 whitelist->insert(std::string(chrome::kMailToScheme));
169 } 146 }
170 147
171 // Indexing 148 // Indexing
172 149
173 bool InMemoryURLIndex::Init(history::URLDatabase* history_db, 150 bool InMemoryURLIndex::Init(URLDatabase* history_db,
174 const std::string& languages) { 151 const std::string& languages) {
175 // TODO(mrossetti): Register for profile/language change notifications. 152 // TODO(mrossetti): Register for profile/language change notifications.
176 languages_ = languages; 153 languages_ = languages;
177 return ReloadFromHistory(history_db, false); 154 return ReloadFromHistory(history_db, false);
178 } 155 }
179 156
180 void InMemoryURLIndex::ShutDown() { 157 void InMemoryURLIndex::ShutDown() {
181 // Write our cache. 158 // Write our cache.
182 SaveToCacheFile(); 159 SaveToCacheFile();
183 cached_at_shutdown_ = true; 160 cached_at_shutdown_ = true;
184 } 161 }
185 162
186 bool InMemoryURLIndex::IndexRow(const URLRow& row) { 163 bool InMemoryURLIndex::IndexRow(const URLRow& row) {
187 const GURL& gurl(row.url()); 164 const GURL& gurl(row.url());
188 165
189 // Index only URLs with a whitelisted scheme. 166 // Index only URLs with a whitelisted scheme.
190 if (!InMemoryURLIndex::URLSchemeIsWhitelisted(gurl)) 167 if (!InMemoryURLIndex::URLSchemeIsWhitelisted(gurl))
191 return true; 168 return true;
192 169
170 URLID row_id = row.id();
171 // Strip out username and password before saving and indexing.
193 string16 url(net::FormatUrl(gurl, languages_, 172 string16 url(net::FormatUrl(gurl, languages_,
194 net::kFormatUrlOmitUsernamePassword, 173 net::kFormatUrlOmitUsernamePassword,
195 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, 174 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
196 NULL, NULL, NULL)); 175 NULL, NULL, NULL));
197 176
198 HistoryID history_id = static_cast<HistoryID>(row.id()); 177 HistoryID history_id = static_cast<HistoryID>(row_id);
199 DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max()); 178 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
200 179
201 // Add the row for quick lookup in the history info store. 180 // Add the row for quick lookup in the history info store.
202 URLRow new_row(GURL(url), row.id()); 181 URLRow new_row(GURL(url), row_id);
203 new_row.set_visit_count(row.visit_count()); 182 new_row.set_visit_count(row.visit_count());
204 new_row.set_typed_count(row.typed_count()); 183 new_row.set_typed_count(row.typed_count());
205 new_row.set_last_visit(row.last_visit()); 184 new_row.set_last_visit(row.last_visit());
206 new_row.set_title(row.title()); 185 new_row.set_title(row.title());
207 history_info_map_[history_id] = new_row; 186 private_data_->history_info_map_[history_id] = new_row;
208 187
188 // Index the words contained in the URL and title of the row.
189 AddRowWordsToIndex(new_row);
190 return true;
Peter Kasting 2011/10/24 22:01:40 Nit: I just noticed that this function (old and ne
mrossetti 2011/10/25 16:15:05 Done.
191 }
192
193 void InMemoryURLIndex::AddRowWordsToIndex(const URLRow& row) {
194 HistoryID history_id = static_cast<HistoryID>(row.id());
209 // Split URL into individual, unique words then add in the title words. 195 // Split URL into individual, unique words then add in the title words.
196 const GURL& gurl(row.url());
197 string16 url(net::FormatUrl(gurl, languages_,
198 net::kFormatUrlOmitUsernamePassword,
199 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
200 NULL, NULL, NULL));
210 url = base::i18n::ToLower(url); 201 url = base::i18n::ToLower(url);
211 String16Set url_words = WordSetFromString16(url); 202 String16Set url_words = String16SetFromString16(url);
212 String16Set title_words = WordSetFromString16(row.title()); 203 String16Set title_words = String16SetFromString16(row.title());
213 String16Set words; 204 String16Set words;
214 std::set_union(url_words.begin(), url_words.end(), 205 std::set_union(url_words.begin(), url_words.end(),
215 title_words.begin(), title_words.end(), 206 title_words.begin(), title_words.end(),
216 std::insert_iterator<String16Set>(words, words.begin())); 207 std::insert_iterator<String16Set>(words, words.begin()));
217 for (String16Set::iterator word_iter = words.begin(); 208 for (String16Set::iterator word_iter = words.begin();
218 word_iter != words.end(); ++word_iter) 209 word_iter != words.end(); ++word_iter)
219 AddWordToIndex(*word_iter, history_id); 210 AddWordToIndex(*word_iter, history_id);
220 211
221 ++history_item_count_; 212 search_term_cache_.clear(); // Invalidate the term cache.
222 return true; 213 }
214
215 void InMemoryURLIndex::RemoveRowFromIndex(const URLRow& row) {
216 RemoveRowWordsFromIndex(row);
217 HistoryID history_id = static_cast<HistoryID>(row.id());
218 private_data_->history_info_map_.erase(history_id);
219 }
220
221 void InMemoryURLIndex::RemoveRowWordsFromIndex(const URLRow& row) {
222 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
223 // this row.
224 URLIndexPrivateData& private_data(*(private_data_.get()));
225 HistoryID history_id = static_cast<HistoryID>(row.id());
226 WordIDSet word_id_set = private_data.history_id_word_map_[history_id];
227 private_data.history_id_word_map_.erase(history_id);
228
229 // Reconcile any changes to word usage.
230 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
231 word_id_iter != word_id_set.end(); ++word_id_iter) {
232 WordID word_id = *word_id_iter;
233 private_data.word_id_history_map_[word_id].erase(history_id);
234 if (!private_data.word_id_history_map_[word_id].empty())
235 continue; // The word is still in use.
236
237 // The word is no longer in use. Reconcile any changes to character usage.
238 string16 word = private_data.word_list_[word_id];
239 Char16Set characters = Char16SetFromString16(word);
240 for (Char16Set::iterator uni_char_iter = characters.begin();
241 uni_char_iter != characters.end(); ++uni_char_iter) {
242 char16 uni_char = *uni_char_iter;
243 private_data.char_word_map_[uni_char].erase(word_id);
244 if (private_data.char_word_map_[uni_char].empty())
245 private_data.char_word_map_.erase(uni_char); // No longer in use.
246 }
247
248 // Complete the removal of references to the word.
249 private_data.word_id_history_map_.erase(word_id);
250 private_data.word_map_.erase(word);
251 private_data.word_list_[word_id] = string16();
252 private_data.available_words_.insert(word_id);
253 }
223 } 254 }
224 255
225 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db, 256 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,
226 bool clear_cache) { 257 bool clear_cache) {
227 ClearPrivateData(); 258 ClearPrivateData();
228 259
229 if (!history_db) 260 if (!history_db)
230 return false; 261 return false;
231 262
232 if (clear_cache || !RestoreFromCacheFile()) { 263 if (clear_cache || !RestoreFromCacheFile()) {
233 base::TimeTicks beginning_time = base::TimeTicks::Now(); 264 base::TimeTicks beginning_time = base::TimeTicks::Now();
234 // The index has to be built from scratch. 265 // The index has to be built from scratch.
235 URLDatabase::URLEnumerator history_enum; 266 URLDatabase::URLEnumerator history_enum;
236 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) 267 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
237 return false; 268 return false;
238 URLRow row; 269 URLRow row;
239 while (history_enum.GetNextURL(&row)) { 270 while (history_enum.GetNextURL(&row)) {
240 if (!IndexRow(row)) 271 if (!IndexRow(row))
241 return false; 272 return false;
242 } 273 }
243 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", 274 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
244 base::TimeTicks::Now() - beginning_time); 275 base::TimeTicks::Now() - beginning_time);
245 SaveToCacheFile(); 276 SaveToCacheFile();
246 } 277 }
247 return true; 278 return true;
248 } 279 }
249 280
250 void InMemoryURLIndex::ClearPrivateData() { 281 void InMemoryURLIndex::ClearPrivateData() {
251 history_item_count_ = 0; 282 private_data_->Clear();
252 word_list_.clear();
253 word_map_.clear();
254 char_word_map_.clear();
255 word_id_history_map_.clear();
256 history_info_map_.clear();
257 search_term_cache_.clear(); 283 search_term_cache_.clear();
258 } 284 }
259 285
260 bool InMemoryURLIndex::RestoreFromCacheFile() { 286 bool InMemoryURLIndex::RestoreFromCacheFile() {
261 // TODO(mrossetti): Figure out how to determine if the cache is up-to-date. 287 // TODO(mrossetti): Figure out how to determine if the cache is up-to-date.
262 // That is: ensure that the database has not been modified since the cache 288 // That is: ensure that the database has not been modified since the cache
263 // was last saved. DB file modification date is inadequate. There are no 289 // was last saved. DB file modification date is inadequate. There are no
264 // SQLite table checksums automatically stored. 290 // SQLite table checksums automatically stored.
265 // FIXME(mrossetti): Move File IO to another thread. 291 // FIXME(mrossetti): Move File IO to another thread.
266 base::ThreadRestrictions::ScopedAllowIO allow_io; 292 base::ThreadRestrictions::ScopedAllowIO allow_io;
(...skipping 15 matching lines...) Expand all
282 return false; 308 return false;
283 } 309 }
284 310
285 if (!RestorePrivateData(index_cache)) { 311 if (!RestorePrivateData(index_cache)) {
286 ClearPrivateData(); // Back to square one -- must build from scratch. 312 ClearPrivateData(); // Back to square one -- must build from scratch.
287 return false; 313 return false;
288 } 314 }
289 315
290 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", 316 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
291 base::TimeTicks::Now() - beginning_time); 317 base::TimeTicks::Now() - beginning_time);
292 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", history_item_count_); 318 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
319 private_data_->history_id_word_map_.size());
293 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); 320 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
294 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", word_map_.size()); 321 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
295 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", char_word_map_.size()); 322 private_data_->word_map_.size());
323 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
324 private_data_->char_word_map_.size());
296 return true; 325 return true;
297 } 326 }
298 327
299 bool InMemoryURLIndex::SaveToCacheFile() { 328 bool InMemoryURLIndex::SaveToCacheFile() {
300 // TODO(mrossetti): Move File IO to another thread. 329 // TODO(mrossetti): Move File IO to another thread.
301 base::ThreadRestrictions::ScopedAllowIO allow_io; 330 base::ThreadRestrictions::ScopedAllowIO allow_io;
302 FilePath file_path; 331 FilePath file_path;
303 if (!GetCacheFilePath(&file_path)) 332 if (!GetCacheFilePath(&file_path))
304 return false; 333 return false;
305 334
(...skipping 14 matching lines...) Expand all
320 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", 349 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
321 base::TimeTicks::Now() - beginning_time); 350 base::TimeTicks::Now() - beginning_time);
322 return true; 351 return true;
323 } 352 }
324 353
325 void InMemoryURLIndex::UpdateURL(URLID row_id, const URLRow& row) { 354 void InMemoryURLIndex::UpdateURL(URLID row_id, const URLRow& row) {
326 // The row may or may not already be in our index. If it is not already 355 // The row may or may not already be in our index. If it is not already
327 // indexed and it qualifies then it gets indexed. If it is already 356 // indexed and it qualifies then it gets indexed. If it is already
328 // indexed and still qualifies then it gets updated, otherwise it 357 // indexed and still qualifies then it gets updated, otherwise it
329 // is deleted from the index. 358 // is deleted from the index.
330 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); 359 HistoryInfoMap::iterator row_pos =
331 if (row_pos == history_info_map_.end()) { 360 private_data_->history_info_map_.find(row_id);
361 if (row_pos == private_data_->history_info_map_.end()) {
332 // This new row should be indexed if it qualifies. 362 // This new row should be indexed if it qualifies.
333 if (RowQualifiesAsSignificant(row, base::Time())) 363 URLRow new_row(row);
334 IndexRow(row); 364 new_row.set_id(row_id);
365 if (RowQualifiesAsSignificant(new_row, base::Time()))
366 IndexRow(new_row);
335 } else if (RowQualifiesAsSignificant(row, base::Time())) { 367 } else if (RowQualifiesAsSignificant(row, base::Time())) {
336 // This indexed row still qualifies and will be re-indexed. 368 // This indexed row still qualifies and will be re-indexed.
337 // The url won't have changed but the title, visit count, etc. 369 // The url won't have changed but the title, visit count, etc.
338 // might have changed. 370 // might have changed.
339 URLRow& old_row = row_pos->second; 371 URLRow& updated_row = row_pos->second;
340 old_row.set_visit_count(row.visit_count()); 372 updated_row.set_visit_count(row.visit_count());
341 old_row.set_typed_count(row.typed_count()); 373 updated_row.set_typed_count(row.typed_count());
342 old_row.set_last_visit(row.last_visit()); 374 updated_row.set_last_visit(row.last_visit());
343 // TODO(mrossetti): When we start indexing the title the next line 375 // While the URL is guaranteed to remain stable, the title may have changed.
344 // will need attention. 376 // If so, then we need to update the index with the changed words.
345 old_row.set_title(row.title()); 377 if (updated_row.title() != row.title()) {
378 // Clear all words associated with this row and re-index both the
379 // URL and title.
380 RemoveRowWordsFromIndex(updated_row);
381 updated_row.set_title(row.title());
382 AddRowWordsToIndex(updated_row);
383 }
346 } else { 384 } else {
347 // This indexed row no longer qualifies and will be de-indexed. 385 // This indexed row no longer qualifies and will be de-indexed by
348 history_info_map_.erase(row_id); 386 // clearing all words associated with this row.
387 URLRow& removed_row = row_pos->second;
388 RemoveRowFromIndex(removed_row);
349 } 389 }
350 // This invalidates the cache. 390 // This invalidates the cache.
351 search_term_cache_.clear(); 391 search_term_cache_.clear();
352 // TODO(mrossetti): Record this transaction in the cache.
353 } 392 }
354 393
355 void InMemoryURLIndex::DeleteURL(URLID row_id) { 394 void InMemoryURLIndex::DeleteURL(URLID row_id) {
356 // Note that this does not remove any reference to this row from the 395 // Note that this does not remove any reference to this row from the
357 // word_id_history_map_. That map will continue to contain (and return) 396 // word_id_history_map_. That map will continue to contain (and return)
358 // hits against this row until that map is rebuilt, but since the 397 // hits against this row until that map is rebuilt, but since the
359 // history_info_map_ no longer references the row no erroneous results 398 // history_info_map_ no longer references the row no erroneous results
360 // will propagate to the user. 399 // will propagate to the user.
361 history_info_map_.erase(row_id); 400 private_data_->history_info_map_.erase(row_id);
362 // This invalidates the word cache. 401 // This invalidates the word cache.
363 search_term_cache_.clear(); 402 search_term_cache_.clear();
364 // TODO(mrossetti): Record this transaction in the cache.
365 } 403 }
366 404
367 // Searching 405 // Searching
368 406
369 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( 407 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(
370 const String16Vector& terms) { 408 const String16Vector& terms) {
371 ScoredHistoryMatches scored_items; 409 ScoredHistoryMatches scored_items;
410
411 // Do nothing if we have indexed no words (probably because we've not been
412 // initialized yet).
413 if (private_data_->word_list_.empty())
414 return scored_items;
415
372 if (!terms.empty()) { 416 if (!terms.empty()) {
373 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep 417 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
374 // approach. 418 // approach.
375 ResetSearchTermCache(); 419 ResetSearchTermCache();
376 420
377 // Lowercase the terms. 421 // Lowercase the terms.
378 // TODO(mrossetti): Another opportunity for a transform algorithm. 422 // TODO(mrossetti): Another opportunity for a transform algorithm.
379 String16Vector lower_terms; 423 String16Vector lower_terms;
380 for (String16Vector::const_iterator term_iter = terms.begin(); 424 for (String16Vector::const_iterator term_iter = terms.begin();
381 term_iter != terms.end(); ++term_iter) 425 term_iter != terms.end(); ++term_iter)
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
419 463
420 return scored_items; 464 return scored_items;
421 } 465 }
422 466
423 void InMemoryURLIndex::ResetSearchTermCache() { 467 void InMemoryURLIndex::ResetSearchTermCache() {
424 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); 468 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
425 iter != search_term_cache_.end(); ++iter) 469 iter != search_term_cache_.end(); ++iter)
426 iter->second.used_ = false; 470 iter->second.used_ = false;
427 } 471 }
428 472
429 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( 473 HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(
430 const string16& uni_string) { 474 const string16& uni_string) {
431 // Break the terms down into individual terms (words), get the candidate 475 // Break the terms down into individual terms (words), get the candidate
432 // set for each term, and intersect each to get a final candidate list. 476 // set for each term, and intersect each to get a final candidate list.
433 // Note that a single 'term' from the user's perspective might be 477 // Note that a single 'term' from the user's perspective might be
434 // a string like "http://www.somewebsite.com" which, from our perspective, 478 // a string like "http://www.somewebsite.com" which, from our perspective,
435 // is four words: 'http', 'www', 'somewebsite', and 'com'. 479 // is four words: 'http', 'www', 'somewebsite', and 'com'.
436 HistoryIDSet history_id_set; 480 HistoryIDSet history_id_set;
437 String16Vector terms = WordVectorFromString16(uni_string, true); 481 String16Vector terms = String16VectorFromString16(uni_string, true);
438 // Sort the terms into the longest first as such are likely to narrow down 482 // Sort the terms into the longest first as such are likely to narrow down
439 // the results quicker. Also, single character terms are the most expensive 483 // the results quicker. Also, single character terms are the most expensive
440 // to process so save them for last. 484 // to process so save them for last.
441 std::sort(terms.begin(), terms.end(), LengthGreater); 485 std::sort(terms.begin(), terms.end(), LengthGreater);
442 for (String16Vector::iterator iter = terms.begin(); iter != terms.end(); 486 for (String16Vector::iterator iter = terms.begin(); iter != terms.end();
443 ++iter) { 487 ++iter) {
444 string16 uni_word = *iter; 488 string16 uni_word = *iter;
445 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); 489 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
446 if (term_history_set.empty()) { 490 if (term_history_set.empty()) {
447 history_id_set.clear(); 491 history_id_set.clear();
448 break; 492 break;
449 } 493 }
450 if (iter == terms.begin()) { 494 if (iter == terms.begin()) {
451 history_id_set.swap(term_history_set); 495 history_id_set.swap(term_history_set);
452 } else { 496 } else {
453 HistoryIDSet new_history_id_set; 497 HistoryIDSet new_history_id_set;
454 std::set_intersection(history_id_set.begin(), history_id_set.end(), 498 std::set_intersection(history_id_set.begin(), history_id_set.end(),
455 term_history_set.begin(), term_history_set.end(), 499 term_history_set.begin(), term_history_set.end(),
456 std::inserter(new_history_id_set, 500 std::inserter(new_history_id_set,
457 new_history_id_set.begin())); 501 new_history_id_set.begin()));
458 history_id_set.swap(new_history_id_set); 502 history_id_set.swap(new_history_id_set);
459 } 503 }
460 } 504 }
461 return history_id_set; 505 return history_id_set;
462 } 506 }
463 507
464 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( 508 HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(
465 const string16& term) { 509 const string16& term) {
466 if (term.empty()) 510 if (term.empty())
467 return HistoryIDSet(); 511 return HistoryIDSet();
468 512
469 // TODO(mrossetti): Consider optimizing for very common terms such as 513 // TODO(mrossetti): Consider optimizing for very common terms such as
470 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently 514 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
471 // occuring words in the user's searches. 515 // occuring words in the user's searches.
472 516
473 size_t term_length = term.length(); 517 size_t term_length = term.length();
474 InMemoryURLIndex::WordIDSet word_id_set; 518 WordIDSet word_id_set;
475 if (term_length > 1) { 519 if (term_length > 1) {
476 // See if this term or a prefix thereof is present in the cache. 520 // See if this term or a prefix thereof is present in the cache.
477 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); 521 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
478 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); 522 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
479 cache_iter != search_term_cache_.end(); ++cache_iter) { 523 cache_iter != search_term_cache_.end(); ++cache_iter) {
480 if (StartsWith(term, cache_iter->first, false) && 524 if (StartsWith(term, cache_iter->first, false) &&
481 (best_prefix == search_term_cache_.end() || 525 (best_prefix == search_term_cache_.end() ||
482 cache_iter->first.length() > best_prefix->first.length())) 526 cache_iter->first.length() > best_prefix->first.length()))
483 best_prefix = cache_iter; 527 best_prefix = cache_iter;
484 } 528 }
(...skipping 25 matching lines...) Expand all
510 554
511 // Filter for each remaining, unique character in the term. 555 // Filter for each remaining, unique character in the term.
512 Char16Set leftover_chars = Char16SetFromString16(leftovers); 556 Char16Set leftover_chars = Char16SetFromString16(leftovers);
513 Char16Set unique_chars; 557 Char16Set unique_chars;
514 std::set_difference(leftover_chars.begin(), leftover_chars.end(), 558 std::set_difference(leftover_chars.begin(), leftover_chars.end(),
515 prefix_chars.begin(), prefix_chars.end(), 559 prefix_chars.begin(), prefix_chars.end(),
516 std::inserter(unique_chars, unique_chars.begin())); 560 std::inserter(unique_chars, unique_chars.begin()));
517 561
518 // Reduce the word set with any leftover, unprocessed characters. 562 // Reduce the word set with any leftover, unprocessed characters.
519 if (!unique_chars.empty()) { 563 if (!unique_chars.empty()) {
520 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars)); 564 WordIDSet leftover_set(
565 private_data_->WordIDSetForTermChars(unique_chars));
521 // We might come up empty on the leftovers. 566 // We might come up empty on the leftovers.
522 if (leftover_set.empty()) { 567 if (leftover_set.empty()) {
523 search_term_cache_[term] = SearchTermCacheItem(); 568 search_term_cache_[term] = SearchTermCacheItem();
524 return HistoryIDSet(); 569 return HistoryIDSet();
525 } 570 }
526 // Or there may not have been a prefix from which to start. 571 // Or there may not have been a prefix from which to start.
527 if (prefix_chars.empty()) { 572 if (prefix_chars.empty()) {
528 word_id_set.swap(leftover_set); 573 word_id_set.swap(leftover_set);
529 } else { 574 } else {
530 WordIDSet new_word_id_set; 575 WordIDSet new_word_id_set;
531 std::set_intersection(word_id_set.begin(), word_id_set.end(), 576 std::set_intersection(word_id_set.begin(), word_id_set.end(),
532 leftover_set.begin(), leftover_set.end(), 577 leftover_set.begin(), leftover_set.end(),
533 std::inserter(new_word_id_set, 578 std::inserter(new_word_id_set,
534 new_word_id_set.begin())); 579 new_word_id_set.begin()));
535 word_id_set.swap(new_word_id_set); 580 word_id_set.swap(new_word_id_set);
536 } 581 }
537 } 582 }
538 583
539 // We must filter the word list because the resulting word set surely 584 // We must filter the word list because the resulting word set surely
540 // contains words which do not have the search term as a proper subset. 585 // contains words which do not have the search term as a proper subset.
541 for (WordIDSet::iterator word_set_iter = word_id_set.begin(); 586 for (WordIDSet::iterator word_set_iter = word_id_set.begin();
542 word_set_iter != word_id_set.end(); ) { 587 word_set_iter != word_id_set.end(); ) {
543 if (word_list_[*word_set_iter].find(term) == string16::npos) 588 if (private_data_->word_list_[*word_set_iter].find(term) ==
589 string16::npos)
544 word_id_set.erase(word_set_iter++); 590 word_id_set.erase(word_set_iter++);
545 else 591 else
546 ++word_set_iter; 592 ++word_set_iter;
547 } 593 }
548 } else { 594 } else {
549 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); 595 word_id_set =
596 private_data_->WordIDSetForTermChars(Char16SetFromString16(term));
550 } 597 }
551 598
552 // If any words resulted then we can compose a set of history IDs by unioning 599 // If any words resulted then we can compose a set of history IDs by unioning
553 // the sets from each word. 600 // the sets from each word.
554 HistoryIDSet history_id_set; 601 HistoryIDSet history_id_set;
555 if (!word_id_set.empty()) { 602 if (!word_id_set.empty()) {
556 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 603 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
557 word_id_iter != word_id_set.end(); ++word_id_iter) { 604 word_id_iter != word_id_set.end(); ++word_id_iter) {
558 WordID word_id = *word_id_iter; 605 WordID word_id = *word_id_iter;
559 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); 606 WordIDHistoryMap::iterator word_iter =
560 if (word_iter != word_id_history_map_.end()) { 607 private_data_->word_id_history_map_.find(word_id);
608 if (word_iter != private_data_->word_id_history_map_.end()) {
561 HistoryIDSet& word_history_id_set(word_iter->second); 609 HistoryIDSet& word_history_id_set(word_iter->second);
562 history_id_set.insert(word_history_id_set.begin(), 610 history_id_set.insert(word_history_id_set.begin(),
563 word_history_id_set.end()); 611 word_history_id_set.end());
564 } 612 }
565 } 613 }
566 } 614 }
567 615
568 // Record a new cache entry for this word if the term is longer than 616 // Record a new cache entry for this word if the term is longer than
569 // a single character. 617 // a single character.
570 if (term_length > 1) 618 if (term_length > 1)
571 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); 619 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
572 620
573 return history_id_set; 621 return history_id_set;
574 } 622 }
575 623
576 // Utility Functions 624 // Utility Functions
577 625
578 // static
579 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
580 const string16& uni_string) {
581 const size_t kMaxWordLength = 64;
582 String16Vector words = WordVectorFromString16(uni_string, false);
583 String16Set word_set;
584 for (String16Vector::const_iterator iter = words.begin(); iter != words.end();
585 ++iter)
586 word_set.insert(base::i18n::ToLower(*iter).substr(0, kMaxWordLength));
587 return word_set;
588 }
589
590 // static
591 InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16(
592 const string16& uni_string,
593 bool break_on_space) {
594 base::i18n::BreakIterator iter(
595 uni_string,
596 break_on_space ? base::i18n::BreakIterator::BREAK_SPACE
597 : base::i18n::BreakIterator::BREAK_WORD);
598 String16Vector words;
599 if (!iter.Init())
600 return words;
601 while (iter.Advance()) {
602 if (break_on_space || iter.IsWord()) {
603 string16 word = iter.GetString();
604 if (break_on_space)
605 TrimWhitespace(word, TRIM_ALL, &word);
606 if (!word.empty())
607 words.push_back(word);
608 }
609 }
610 return words;
611 }
612
613 // static
614 InMemoryURLIndex::Char16Set InMemoryURLIndex::Char16SetFromString16(
615 const string16& term) {
616 Char16Set characters;
617 for (string16::const_iterator iter = term.begin(); iter != term.end();
618 ++iter)
619 characters.insert(*iter);
620 return characters;
621 }
622
623 void InMemoryURLIndex::AddWordToIndex(const string16& term, 626 void InMemoryURLIndex::AddWordToIndex(const string16& term,
624 HistoryID history_id) { 627 HistoryID history_id) {
625 WordMap::iterator word_pos = word_map_.find(term); 628 WordMap::iterator word_pos = private_data_->word_map_.find(term);
626 if (word_pos != word_map_.end()) 629 if (word_pos != private_data_->word_map_.end())
627 UpdateWordHistory(word_pos->second, history_id); 630 UpdateWordHistory(word_pos->second, history_id);
628 else 631 else
629 AddWordHistory(term, history_id); 632 AddWordHistory(term, history_id);
630 } 633 }
631 634
632 void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { 635 void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) {
633 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); 636 WordIDHistoryMap::iterator history_pos =
634 DCHECK(history_pos != word_id_history_map_.end()); 637 private_data_->word_id_history_map_.find(word_id);
635 HistoryIDSet& history_id_set(history_pos->second); 638 DCHECK(history_pos != private_data_->word_id_history_map_.end());
636 history_id_set.insert(history_id); 639 HistoryIDSet& history_id_set(history_pos->second);
640 history_id_set.insert(history_id);
641 private_data_->AddToHistoryIDWordMap(history_id, word_id);
637 } 642 }
638 643
639 // Add a new word to the word list and the word map, and then create a 644 // Add a new word to the word list and the word map, and then create a
640 // new entry in the word/history map. 645 // new entry in the word/history map.
641 void InMemoryURLIndex::AddWordHistory(const string16& term, 646 void InMemoryURLIndex::AddWordHistory(const string16& term,
642 HistoryID history_id) { 647 HistoryID history_id) {
643 word_list_.push_back(term); 648 URLIndexPrivateData& private_data(*(private_data_.get()));
644 WordID word_id = word_list_.size() - 1; 649 WordID word_id = private_data.word_list_.size();
645 word_map_[term] = word_id; 650 if (private_data.available_words_.empty()) {
651 private_data.word_list_.push_back(term);
652 } else {
653 word_id = *(private_data.available_words_.begin());
654 private_data.word_list_[word_id] = term;
655 private_data.available_words_.erase(word_id);
656 }
657 private_data.word_map_[term] = word_id;
658
646 HistoryIDSet history_id_set; 659 HistoryIDSet history_id_set;
647 history_id_set.insert(history_id); 660 history_id_set.insert(history_id);
648 word_id_history_map_[word_id] = history_id_set; 661 private_data.word_id_history_map_[word_id] = history_id_set;
662 private_data.AddToHistoryIDWordMap(history_id, word_id);
663
649 // For each character in the newly added word (i.e. a word that is not 664 // For each character in the newly added word (i.e. a word that is not
650 // already in the word index), add the word to the character index. 665 // already in the word index), add the word to the character index.
651 Char16Set characters = Char16SetFromString16(term); 666 Char16Set characters = Char16SetFromString16(term);
652 for (Char16Set::iterator uni_char_iter = characters.begin(); 667 for (Char16Set::iterator uni_char_iter = characters.begin();
653 uni_char_iter != characters.end(); ++uni_char_iter) { 668 uni_char_iter != characters.end(); ++uni_char_iter) {
654 char16 uni_char = *uni_char_iter; 669 char16 uni_char = *uni_char_iter;
655 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); 670 CharWordIDMap::iterator char_iter =
656 if (char_iter != char_word_map_.end()) { 671 private_data.char_word_map_.find(uni_char);
672 if (char_iter != private_data.char_word_map_.end()) {
657 // Update existing entry in the char/word index. 673 // Update existing entry in the char/word index.
658 WordIDSet& word_id_set(char_iter->second); 674 WordIDSet& word_id_set(char_iter->second);
659 word_id_set.insert(word_id); 675 word_id_set.insert(word_id);
660 } else { 676 } else {
661 // Create a new entry in the char/word index. 677 // Create a new entry in the char/word index.
662 WordIDSet word_id_set; 678 WordIDSet word_id_set;
663 word_id_set.insert(word_id); 679 word_id_set.insert(word_id);
664 char_word_map_[uni_char] = word_id_set; 680 private_data.char_word_map_[uni_char] = word_id_set;
665 } 681 }
666 } 682 }
667 } 683 }
668 684
669 InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars(
670 const Char16Set& term_chars) {
671 WordIDSet word_id_set;
672 for (Char16Set::const_iterator c_iter = term_chars.begin();
673 c_iter != term_chars.end(); ++c_iter) {
674 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
675 if (char_iter == char_word_map_.end()) {
676 // A character was not found so there are no matching results: bail.
677 word_id_set.clear();
678 break;
679 }
680 WordIDSet& char_word_id_set(char_iter->second);
681 // It is possible for there to no longer be any words associated with
682 // a particular character. Give up in that case.
683 if (char_word_id_set.empty()) {
684 word_id_set.clear();
685 break;
686 }
687
688 if (c_iter == term_chars.begin()) {
689 // First character results becomes base set of results.
690 word_id_set = char_word_id_set;
691 } else {
692 // Subsequent character results get intersected in.
693 WordIDSet new_word_id_set;
694 std::set_intersection(word_id_set.begin(), word_id_set.end(),
695 char_word_id_set.begin(), char_word_id_set.end(),
696 std::inserter(new_word_id_set,
697 new_word_id_set.begin()));
698 word_id_set.swap(new_word_id_set);
699 }
700 }
701 return word_id_set;
702 }
703
704 // static 685 // static
705 TermMatches InMemoryURLIndex::MatchTermInString(const string16& term, 686 // TODO(mrossetti): This can be made a ctor for ScoredHistoryMatch.
706 const string16& string,
707 int term_num) {
708 const size_t kMaxCompareLength = 2048;
709 const string16& short_string = (string.length() > kMaxCompareLength) ?
710 string.substr(0, kMaxCompareLength) : string;
711 TermMatches matches;
712 for (size_t location = short_string.find(term); location != string16::npos;
713 location = short_string.find(term, location + 1)) {
714 matches.push_back(TermMatch(term_num, location, term.length()));
715 }
716 return matches;
717 }
718
719 // static
720 TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) {
721 if (matches.empty())
722 return matches;
723 TermMatches sorted_matches = matches;
724 std::sort(sorted_matches.begin(), sorted_matches.end(), MatchOffsetLess);
725 TermMatches clean_matches;
726 TermMatch last_match = sorted_matches[0];
727 clean_matches.push_back(last_match);
728 for (TermMatches::const_iterator iter = sorted_matches.begin() + 1;
729 iter != sorted_matches.end(); ++iter) {
730 if (iter->offset >= last_match.offset + last_match.length) {
731 last_match = *iter;
732 clean_matches.push_back(last_match);
733 }
734 }
735 return clean_matches;
736 }
737
738 // static
739 std::vector<size_t> InMemoryURLIndex::OffsetsFromTermMatches(
740 const TermMatches& matches) {
741 std::vector<size_t> offsets;
742 for (TermMatches::const_iterator i = matches.begin(); i != matches.end(); ++i)
743 offsets.push_back(i->offset);
744 return offsets;
745 }
746
747 // static
748 TermMatches InMemoryURLIndex::ReplaceOffsetsInTermMatches(
749 const TermMatches& matches,
750 const std::vector<size_t>& offsets) {
751 DCHECK_EQ(matches.size(), offsets.size());
752 TermMatches new_matches;
753 std::vector<size_t>::const_iterator offset_iter = offsets.begin();
754 for (TermMatches::const_iterator term_iter = matches.begin();
755 term_iter != matches.end(); ++term_iter, ++offset_iter) {
756 if (*offset_iter != string16::npos) {
757 TermMatch new_match(*term_iter);
758 new_match.offset = *offset_iter;
759 new_matches.push_back(new_match);
760 }
761 }
762 return new_matches;
763 }
764
765 // static
766 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL( 687 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(
767 const URLRow& row, 688 const URLRow& row,
768 const String16Vector& terms) { 689 const String16Vector& terms) {
769 ScoredHistoryMatch match(row); 690 ScoredHistoryMatch match(row);
770 GURL gurl = row.url(); 691 GURL gurl = row.url();
771 if (!gurl.is_valid()) 692 if (!gurl.is_valid())
772 return match; 693 return match;
773 694
774 // Figure out where each search term appears in the URL and/or page title 695 // Figure out where each search term appears in the URL and/or page title
775 // so that we can score as well as provide autocomplete highlighting. 696 // so that we can score as well as provide autocomplete highlighting.
776 string16 url = base::i18n::ToLower(UTF8ToUTF16(gurl.spec())); 697 string16 url = base::i18n::ToLower(UTF8ToUTF16(gurl.spec()));
777 string16 title = base::i18n::ToLower(row.title()); 698 string16 title = base::i18n::ToLower(row.title());
778 int term_num = 0; 699 int term_num = 0;
779 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); 700 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
780 ++iter, ++term_num) { 701 ++iter, ++term_num) {
781 string16 term = *iter; 702 string16 term = *iter;
782 TermMatches url_term_matches = MatchTermInString(term, url, term_num); 703 TermMatches url_term_matches = MatchTermInString(term, url, term_num);
783 TermMatches title_term_matches = MatchTermInString(term, title, term_num); 704 TermMatches title_term_matches = MatchTermInString(term, title, term_num);
784 if (url_term_matches.empty() && title_term_matches.empty()) 705 if (url_term_matches.empty() && title_term_matches.empty())
785 return match; // A term was not found in either URL or title - reject. 706 return match; // A term was not found in either URL or title - reject.
786 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(), 707 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),
787 url_term_matches.end()); 708 url_term_matches.end());
788 match.title_matches.insert(match.title_matches.end(), 709 match.title_matches.insert(match.title_matches.end(),
789 title_term_matches.begin(), 710 title_term_matches.begin(),
790 title_term_matches.end()); 711 title_term_matches.end());
791 } 712 }
792 713
793 // Sort matches by offset and eliminate any which overlap. 714 // Sort matches by offset and eliminate any which overlap.
794 match.url_matches = SortAndDeoverlap(match.url_matches); 715 match.url_matches = SortAndDeoverlapMatches(match.url_matches);
795 match.title_matches = SortAndDeoverlap(match.title_matches); 716 match.title_matches = SortAndDeoverlapMatches(match.title_matches);
796 717
797 // We should not (currently) inline autocomplete a result unless both of the 718 // We should not (currently) inline autocomplete a result unless both of the
798 // following are true: 719 // following are true:
799 // * There is exactly one substring matches in the URL, and 720 // * There is exactly one substring matches in the URL, and
800 // * The one URL match starts at the beginning of the URL. 721 // * The one URL match starts at the beginning of the URL.
801 match.can_inline = 722 match.can_inline =
802 match.url_matches.size() == 1 && match.url_matches[0].offset == 0; 723 match.url_matches.size() == 1 && match.url_matches[0].offset == 0;
803 724
804 // Get partial scores based on term matching. Note that the score for 725 // Get partial scores based on term matching. Note that the score for
805 // each of the URL and title are adjusted by the fraction of the 726 // each of the URL and title are adjusted by the fraction of the
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after
899 820
900 // Scale the sum of the three components above into a single score component 821 // Scale the sum of the three components above into a single score component
901 // on the same scale as that used in ScoredMatchForURL(). 822 // on the same scale as that used in ScoredMatchForURL().
902 return ScoreForValue(raw_score, kTermScoreLevel); 823 return ScoreForValue(raw_score, kTermScoreLevel);
903 } 824 }
904 825
905 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( 826 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
906 const InMemoryURLIndex& index, 827 const InMemoryURLIndex& index,
907 const String16Vector& lower_terms) 828 const String16Vector& lower_terms)
908 : index_(index), 829 : index_(index),
909 lower_terms_(lower_terms) { 830 lower_terms_(lower_terms) {}
910 }
911 831
912 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} 832 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}
913 833
914 void InMemoryURLIndex::AddHistoryMatch::operator()( 834 void InMemoryURLIndex::AddHistoryMatch::operator()(const HistoryID history_id) {
915 const InMemoryURLIndex::HistoryID history_id) {
916 HistoryInfoMap::const_iterator hist_pos = 835 HistoryInfoMap::const_iterator hist_pos =
917 index_.history_info_map_.find(history_id); 836 index_.private_data_->history_info_map_.find(history_id);
918 // Note that a history_id may be present in the word_id_history_map_ yet not 837 // Note that a history_id may be present in the word_id_history_map_ yet not
919 // be found in the history_info_map_. This occurs when an item has been 838 // be found in the history_info_map_. This occurs when an item has been
920 // deleted by the user or the item no longer qualifies as a quick result. 839 // deleted by the user or the item no longer qualifies as a quick result.
921 if (hist_pos != index_.history_info_map_.end()) { 840 if (hist_pos != index_.private_data_->history_info_map_.end()) {
922 const URLRow& hist_item = hist_pos->second; 841 const URLRow& hist_item = hist_pos->second;
923 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_)); 842 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));
924 if (match.raw_score > 0) 843 if (match.raw_score > 0)
925 scored_matches_.push_back(match); 844 scored_matches_.push_back(match);
926 } 845 }
927 } 846 }
928 847
929 bool InMemoryURLIndex::GetCacheFilePath(FilePath* file_path) { 848 bool InMemoryURLIndex::GetCacheFilePath(FilePath* file_path) {
930 if (history_dir_.empty()) 849 if (history_dir_.empty())
931 return false; 850 return false;
932 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache")); 851 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));
933 return true; 852 return true;
934 } 853 }
935 854
936 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const { 855 bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const {
937 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); 856 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end();
938 } 857 }
939 858
940 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const { 859 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {
941 DCHECK(cache); 860 DCHECK(cache);
942 cache->set_timestamp(base::Time::Now().ToInternalValue()); 861 cache->set_timestamp(base::Time::Now().ToInternalValue());
943 cache->set_history_item_count(history_item_count_); 862 // history_item_count_ is no longer used but rather than change the protobuf
863 // definition use a placeholder. This will go away with the switch to SQLite.
864 cache->set_history_item_count(0);
944 SaveWordList(cache); 865 SaveWordList(cache);
945 SaveWordMap(cache); 866 SaveWordMap(cache);
946 SaveCharWordMap(cache); 867 SaveCharWordMap(cache);
947 SaveWordIDHistoryMap(cache); 868 SaveWordIDHistoryMap(cache);
948 SaveHistoryInfoMap(cache); 869 SaveHistoryInfoMap(cache);
949 } 870 }
950 871
951 bool InMemoryURLIndex::RestorePrivateData( 872 bool InMemoryURLIndex::RestorePrivateData(
952 const InMemoryURLIndexCacheItem& cache) { 873 const InMemoryURLIndexCacheItem& cache) {
953 last_saved_ = base::Time::FromInternalValue(cache.timestamp()); 874 last_saved_ = base::Time::FromInternalValue(cache.timestamp());
954 history_item_count_ = cache.history_item_count(); 875 return RestoreWordList(cache) && RestoreWordMap(cache) &&
955 return (history_item_count_ == 0) || (RestoreWordList(cache) && 876 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
956 RestoreWordMap(cache) && RestoreCharWordMap(cache) && 877 RestoreHistoryInfoMap(cache);
957 RestoreWordIDHistoryMap(cache) && RestoreHistoryInfoMap(cache));
958 } 878 }
959 879
960
961 void InMemoryURLIndex::SaveWordList(InMemoryURLIndexCacheItem* cache) const { 880 void InMemoryURLIndex::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
962 if (word_list_.empty()) 881 if (private_data_->word_list_.empty())
963 return; 882 return;
964 WordListItem* list_item = cache->mutable_word_list(); 883 WordListItem* list_item = cache->mutable_word_list();
965 list_item->set_word_count(word_list_.size()); 884 list_item->set_word_count(private_data_->word_list_.size());
966 for (String16Vector::const_iterator iter = word_list_.begin(); 885 for (String16Vector::const_iterator iter = private_data_->word_list_.begin();
967 iter != word_list_.end(); ++iter) 886 iter != private_data_->word_list_.end(); ++iter)
968 list_item->add_word(UTF16ToUTF8(*iter)); 887 list_item->add_word(UTF16ToUTF8(*iter));
969 } 888 }
970 889
971 bool InMemoryURLIndex::RestoreWordList(const InMemoryURLIndexCacheItem& cache) { 890 bool InMemoryURLIndex::RestoreWordList(const InMemoryURLIndexCacheItem& cache) {
972 if (!cache.has_word_list()) 891 if (!cache.has_word_list())
973 return false; 892 return false;
974 const WordListItem& list_item(cache.word_list()); 893 const WordListItem& list_item(cache.word_list());
975 uint32 expected_item_count = list_item.word_count(); 894 uint32 expected_item_count = list_item.word_count();
976 uint32 actual_item_count = list_item.word_size(); 895 uint32 actual_item_count = list_item.word_size();
977 if (actual_item_count == 0 || actual_item_count != expected_item_count) 896 if (actual_item_count == 0 || actual_item_count != expected_item_count)
978 return false; 897 return false;
979 const RepeatedPtrField<std::string>& words(list_item.word()); 898 const RepeatedPtrField<std::string>& words(list_item.word());
980 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); 899 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
981 iter != words.end(); ++iter) 900 iter != words.end(); ++iter)
982 word_list_.push_back(UTF8ToUTF16(*iter)); 901 private_data_->word_list_.push_back(UTF8ToUTF16(*iter));
983 return true; 902 return true;
984 } 903 }
985 904
986 void InMemoryURLIndex::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { 905 void InMemoryURLIndex::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
987 if (word_map_.empty()) 906 if (private_data_->word_map_.empty())
988 return; 907 return;
989 WordMapItem* map_item = cache->mutable_word_map(); 908 WordMapItem* map_item = cache->mutable_word_map();
990 map_item->set_item_count(word_map_.size()); 909 map_item->set_item_count(private_data_->word_map_.size());
991 for (WordMap::const_iterator iter = word_map_.begin(); 910 for (WordMap::const_iterator iter = private_data_->word_map_.begin();
992 iter != word_map_.end(); ++iter) { 911 iter != private_data_->word_map_.end(); ++iter) {
993 WordMapEntry* map_entry = map_item->add_word_map_entry(); 912 WordMapEntry* map_entry = map_item->add_word_map_entry();
994 map_entry->set_word(UTF16ToUTF8(iter->first)); 913 map_entry->set_word(UTF16ToUTF8(iter->first));
995 map_entry->set_word_id(iter->second); 914 map_entry->set_word_id(iter->second);
996 } 915 }
997 } 916 }
998 917
999 bool InMemoryURLIndex::RestoreWordMap(const InMemoryURLIndexCacheItem& cache) { 918 bool InMemoryURLIndex::RestoreWordMap(const InMemoryURLIndexCacheItem& cache) {
1000 if (!cache.has_word_map()) 919 if (!cache.has_word_map())
1001 return false; 920 return false;
1002 const WordMapItem& list_item(cache.word_map()); 921 const WordMapItem& list_item(cache.word_map());
1003 uint32 expected_item_count = list_item.item_count(); 922 uint32 expected_item_count = list_item.item_count();
1004 uint32 actual_item_count = list_item.word_map_entry_size(); 923 uint32 actual_item_count = list_item.word_map_entry_size();
1005 if (actual_item_count == 0 || actual_item_count != expected_item_count) 924 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1006 return false; 925 return false;
1007 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); 926 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1008 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); 927 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1009 iter != entries.end(); ++iter) 928 iter != entries.end(); ++iter)
1010 word_map_[UTF8ToUTF16(iter->word())] = iter->word_id(); 929 private_data_->word_map_[UTF8ToUTF16(iter->word())] = iter->word_id();
1011 return true; 930 return true;
1012 } 931 }
1013 932
1014 void InMemoryURLIndex::SaveCharWordMap(InMemoryURLIndexCacheItem* cache) const { 933 void InMemoryURLIndex::SaveCharWordMap(InMemoryURLIndexCacheItem* cache) const {
1015 if (char_word_map_.empty()) 934 if (private_data_->char_word_map_.empty())
1016 return; 935 return;
1017 CharWordMapItem* map_item = cache->mutable_char_word_map(); 936 CharWordMapItem* map_item = cache->mutable_char_word_map();
1018 map_item->set_item_count(char_word_map_.size()); 937 map_item->set_item_count(private_data_->char_word_map_.size());
1019 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); 938 for (CharWordIDMap::const_iterator iter =
1020 iter != char_word_map_.end(); ++iter) { 939 private_data_->char_word_map_.begin();
940 iter != private_data_->char_word_map_.end(); ++iter) {
1021 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); 941 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
1022 map_entry->set_char_16(iter->first); 942 map_entry->set_char_16(iter->first);
1023 const WordIDSet& word_id_set(iter->second); 943 const WordIDSet& word_id_set(iter->second);
1024 map_entry->set_item_count(word_id_set.size()); 944 map_entry->set_item_count(word_id_set.size());
1025 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); 945 for (WordIDSet::const_iterator set_iter = word_id_set.begin();
1026 set_iter != word_id_set.end(); ++set_iter) 946 set_iter != word_id_set.end(); ++set_iter)
1027 map_entry->add_word_id(*set_iter); 947 map_entry->add_word_id(*set_iter);
1028 } 948 }
1029 } 949 }
1030 950
(...skipping 13 matching lines...) Expand all
1044 expected_item_count = iter->item_count(); 964 expected_item_count = iter->item_count();
1045 actual_item_count = iter->word_id_size(); 965 actual_item_count = iter->word_id_size();
1046 if (actual_item_count == 0 || actual_item_count != expected_item_count) 966 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1047 return false; 967 return false;
1048 char16 uni_char = static_cast<char16>(iter->char_16()); 968 char16 uni_char = static_cast<char16>(iter->char_16());
1049 WordIDSet word_id_set; 969 WordIDSet word_id_set;
1050 const RepeatedField<int32>& word_ids(iter->word_id()); 970 const RepeatedField<int32>& word_ids(iter->word_id());
1051 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); 971 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1052 jiter != word_ids.end(); ++jiter) 972 jiter != word_ids.end(); ++jiter)
1053 word_id_set.insert(*jiter); 973 word_id_set.insert(*jiter);
1054 char_word_map_[uni_char] = word_id_set; 974 private_data_->char_word_map_[uni_char] = word_id_set;
1055 } 975 }
1056 return true; 976 return true;
1057 } 977 }
1058 978
1059 void InMemoryURLIndex::SaveWordIDHistoryMap(InMemoryURLIndexCacheItem* cache) 979 void InMemoryURLIndex::SaveWordIDHistoryMap(InMemoryURLIndexCacheItem* cache)
1060 const { 980 const {
1061 if (word_id_history_map_.empty()) 981 if (private_data_->word_id_history_map_.empty())
1062 return; 982 return;
1063 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); 983 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
1064 map_item->set_item_count(word_id_history_map_.size()); 984 map_item->set_item_count(private_data_->word_id_history_map_.size());
1065 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); 985 for (WordIDHistoryMap::const_iterator iter =
1066 iter != word_id_history_map_.end(); ++iter) { 986 private_data_->word_id_history_map_.begin();
987 iter != private_data_->word_id_history_map_.end(); ++iter) {
1067 WordIDHistoryMapEntry* map_entry = 988 WordIDHistoryMapEntry* map_entry =
1068 map_item->add_word_id_history_map_entry(); 989 map_item->add_word_id_history_map_entry();
1069 map_entry->set_word_id(iter->first); 990 map_entry->set_word_id(iter->first);
1070 const HistoryIDSet& history_id_set(iter->second); 991 const HistoryIDSet& history_id_set(iter->second);
1071 map_entry->set_item_count(history_id_set.size()); 992 map_entry->set_item_count(history_id_set.size());
1072 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); 993 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
1073 set_iter != history_id_set.end(); ++set_iter) 994 set_iter != history_id_set.end(); ++set_iter)
1074 map_entry->add_history_id(*set_iter); 995 map_entry->add_history_id(*set_iter);
1075 } 996 }
1076 } 997 }
(...skipping 12 matching lines...) Expand all
1089 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = 1010 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1090 entries.begin(); iter != entries.end(); ++iter) { 1011 entries.begin(); iter != entries.end(); ++iter) {
1091 expected_item_count = iter->item_count(); 1012 expected_item_count = iter->item_count();
1092 actual_item_count = iter->history_id_size(); 1013 actual_item_count = iter->history_id_size();
1093 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1014 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1094 return false; 1015 return false;
1095 WordID word_id = iter->word_id(); 1016 WordID word_id = iter->word_id();
1096 HistoryIDSet history_id_set; 1017 HistoryIDSet history_id_set;
1097 const RepeatedField<int64>& history_ids(iter->history_id()); 1018 const RepeatedField<int64>& history_ids(iter->history_id());
1098 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin(); 1019 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1099 jiter != history_ids.end(); ++jiter) 1020 jiter != history_ids.end(); ++jiter) {
1100 history_id_set.insert(*jiter); 1021 history_id_set.insert(*jiter);
1101 word_id_history_map_[word_id] = history_id_set; 1022 private_data_->AddToHistoryIDWordMap(*jiter, word_id);
1023 }
1024 private_data_->word_id_history_map_[word_id] = history_id_set;
1102 } 1025 }
1103 return true; 1026 return true;
1104 } 1027 }
1105 1028
1106 void InMemoryURLIndex::SaveHistoryInfoMap( 1029 void InMemoryURLIndex::SaveHistoryInfoMap(
1107 InMemoryURLIndexCacheItem* cache) const { 1030 InMemoryURLIndexCacheItem* cache) const {
1108 if (history_info_map_.empty()) 1031 if (private_data_->history_info_map_.empty())
1109 return; 1032 return;
1110 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); 1033 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
1111 map_item->set_item_count(history_info_map_.size()); 1034 map_item->set_item_count(private_data_->history_info_map_.size());
1112 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); 1035 for (HistoryInfoMap::const_iterator iter =
1113 iter != history_info_map_.end(); ++iter) { 1036 private_data_->history_info_map_.begin();
1037 iter != private_data_->history_info_map_.end(); ++iter) {
1114 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); 1038 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
1115 map_entry->set_history_id(iter->first); 1039 map_entry->set_history_id(iter->first);
1116 const URLRow& url_row(iter->second); 1040 const URLRow& url_row(iter->second);
1117 // Note: We only save information that contributes to the index so there 1041 // Note: We only save information that contributes to the index so there
1118 // is no need to save search_term_cache_ (not persistent), 1042 // is no need to save search_term_cache_ (not persistent),
1119 // languages_, etc. 1043 // languages_, etc.
1120 map_entry->set_visit_count(url_row.visit_count()); 1044 map_entry->set_visit_count(url_row.visit_count());
1121 map_entry->set_typed_count(url_row.typed_count()); 1045 map_entry->set_typed_count(url_row.typed_count());
1122 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); 1046 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
1123 map_entry->set_url(url_row.url().spec()); 1047 map_entry->set_url(url_row.url().spec());
(...skipping 17 matching lines...) Expand all
1141 HistoryID history_id = iter->history_id(); 1065 HistoryID history_id = iter->history_id();
1142 GURL url(iter->url()); 1066 GURL url(iter->url());
1143 URLRow url_row(url, history_id); 1067 URLRow url_row(url, history_id);
1144 url_row.set_visit_count(iter->visit_count()); 1068 url_row.set_visit_count(iter->visit_count());
1145 url_row.set_typed_count(iter->typed_count()); 1069 url_row.set_typed_count(iter->typed_count());
1146 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); 1070 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1147 if (iter->has_title()) { 1071 if (iter->has_title()) {
1148 string16 title(UTF8ToUTF16(iter->title())); 1072 string16 title(UTF8ToUTF16(iter->title()));
1149 url_row.set_title(title); 1073 url_row.set_title(title);
1150 } 1074 }
1151 history_info_map_[history_id] = url_row; 1075 private_data_->history_info_map_[history_id] = url_row;
1152 } 1076 }
1153 return true; 1077 return true;
1154 } 1078 }
1155 1079
1156 } // namespace history 1080 } // namespace history
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698