chrome/browser/autocomplete/url_index_private_data.cc - Issue 1182113009: Componentize HistoryQuickProvider and its supporting cast

Side by Side Diff: chrome/browser/autocomplete/url_index_private_data.cc

Issue 1182113009: Componentize HistoryQuickProvider and its supporting cast (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Rebase Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "chrome/browser/autocomplete/url_index_private_data.h"

6

7 #include <functional>

8 #include <iterator>

9 #include <limits>

10 #include <numeric>

11 #include <string>

12 #include <vector>

13

14 #include "base/basictypes.h"

15 #include "base/files/file_util.h"

16 #include "base/i18n/break_iterator.h"

17 #include "base/i18n/case_conversion.h"

18 #include "base/metrics/histogram.h"

19 #include "base/strings/string_util.h"

20 #include "base/strings/utf_string_conversions.h"

21 #include "base/time/time.h"

22 #include "chrome/browser/autocomplete/in_memory_url_index.h"

23 #include "components/bookmarks/browser/bookmark_model.h"

24 #include "components/bookmarks/browser/bookmark_utils.h"

25 #include "components/history/core/browser/history_database.h"

26 #include "components/history/core/browser/history_db_task.h"

27 #include "components/history/core/browser/history_service.h"

28 #include "net/base/net_util.h"

29

30 #if defined(USE_SYSTEM_PROTOBUF)

31 #include <google/protobuf/repeated_field.h>

32 #else

33 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"

34 #endif

35

36 using google::protobuf::RepeatedField;

37 using google::protobuf::RepeatedPtrField;

38 using in_memory_url_index::InMemoryURLIndexCacheItem;

39

40 namespace {

41 static const size_t kMaxVisitsToStoreInCache = 10u;

42 }

43

44 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordListItem

45 WordListItem;

46 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry

47 WordMapEntry;

48 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;

49 typedef in_memory_url_index::InMemoryURLIndexCacheItem_CharWordMapItem

50 CharWordMapItem;

51 typedef in_memory_url_index::

52 InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry CharWordMapEntry;

53 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordIDHistoryMapItem

54 WordIDHistoryMapItem;

55 typedef in_memory_url_index::

56 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry

57 WordIDHistoryMapEntry;

58 typedef in_memory_url_index::InMemoryURLIndexCacheItem_HistoryInfoMapItem

59 HistoryInfoMapItem;

60 typedef in_memory_url_index::

61 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry

62 HistoryInfoMapEntry;

63 typedef in_memory_url_index::

64 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo

65 HistoryInfoMapEntry_VisitInfo;

66 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordStartsMapItem

67 WordStartsMapItem;

68 typedef in_memory_url_index::

69 InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry

70 WordStartsMapEntry;

71

72 // Algorithm Functions ---------------------------------------------------------

73

74 // Comparison function for sorting search terms by descending length.

75 bool LengthGreater(const base::string16& string_a,

76 const base::string16& string_b) {

77 return string_a.length() > string_b.length();

78 }

79

80

81 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------

82

83 // HistoryDBTask used to update the recent visit data for a particular

84 // row from the history database.

85 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {

86 public:

87 explicit UpdateRecentVisitsFromHistoryDBTask(

88 URLIndexPrivateData* private_data,

89 history::URLID url_id);

90

91 bool RunOnDBThread(history::HistoryBackend* backend,

92 history::HistoryDatabase* db) override;

93 void DoneRunOnMainThread() override;

94

95 private:

96 ~UpdateRecentVisitsFromHistoryDBTask() override;

97

98 // The URLIndexPrivateData that gets updated after the historyDB

99 // task returns.

100 URLIndexPrivateData* private_data_;

101 // The ID of the URL to get visits for and then update.

102 history::URLID url_id_;

103 // Whether fetching the recent visits for the URL succeeded.

104 bool succeeded_;

105 // The awaited data that's shown to private_data_ for it to copy and

106 // store.

107 history::VisitVector recent_visits_;

108

109 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);

110 };

111

112 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(

113 URLIndexPrivateData* private_data,

114 history::URLID url_id)

115 : private_data_(private_data), url_id_(url_id), succeeded_(false) {

116 }

117

118 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(

119 history::HistoryBackend* backend,

120 history::HistoryDatabase* db) {

121 // Make sure the private data is going to get as many recent visits as

122 // ScoredHistoryMatch::GetFrequency() hopes to use.

123 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);

124 succeeded_ = db->GetMostRecentVisitsForURL(url_id_,

125 kMaxVisitsToStoreInCache,

126 &recent_visits_);

127 if (!succeeded_)

128 recent_visits_.clear();

129 return true; // Always claim to be done; do not retry failures.

130 }

131

132 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {

133 if (succeeded_)

134 private_data_->UpdateRecentVisits(url_id_, recent_visits_);

135 }

136

137 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {

138 }

139

140

141 // URLIndexPrivateData ---------------------------------------------------------

142

143 URLIndexPrivateData::URLIndexPrivateData()

144 : restored_cache_version_(0),

145 saved_cache_version_(kCurrentCacheFileVersion),

146 pre_filter_item_count_(0),

147 post_filter_item_count_(0),

148 post_scoring_item_count_(0) {

149 }

150

151 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(

152 base::string16 search_string,

153 size_t cursor_position,

154 size_t max_matches,

155 const std::string& languages,

156 bookmarks::BookmarkModel* bookmark_model) {

157 // If cursor position is set and useful (not at either end of the

158 // string), allow the search string to be broken at cursor position.

159 // We do this by pretending there's a space where the cursor is.

160 if ((cursor_position != base::string16::npos) &&

161 (cursor_position < search_string.length()) &&

162 (cursor_position > 0)) {

163 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));

164 }

165 pre_filter_item_count_ = 0;

166 post_filter_item_count_ = 0;

167 post_scoring_item_count_ = 0;

168 // The search string we receive may contain escaped characters. For reducing

169 // the index we need individual, lower-cased words, ignoring escapings. For

170 // the final filtering we need whitespace separated substrings possibly

171 // containing escaped characters.

172 base::string16 lower_raw_string(base::i18n::ToLower(search_string));

173 base::string16 lower_unescaped_string =

174 net::UnescapeURLComponent(lower_raw_string,

175 net::UnescapeRule::SPACES \| net::UnescapeRule::URL_SPECIAL_CHARS);

176 // Extract individual 'words' (as opposed to 'terms'; see below) from the

177 // search string. When the user types "colspec=ID%20Mstone Release" we get

178 // four 'words': "colspec", "id", "mstone" and "release".

179 String16Vector lower_words(

180 String16VectorFromString16(lower_unescaped_string, false, NULL));

181 ScoredHistoryMatches scored_items;

182

183 // Do nothing if we have indexed no words (probably because we've not been

184 // initialized yet) or the search string has no words.

185 if (word_list_.empty() \|\| lower_words.empty()) {

186 search_term_cache_.clear(); // Invalidate the term cache.

187 return scored_items;

188 }

189

190 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep

191 // approach.

192 ResetSearchTermCache();

193

194 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

195

196 // Trim the candidate pool if it is large. Note that we do not filter out

197 // items that do not contain the search terms as proper substrings -- doing

198 // so is the performance-costly operation we are trying to avoid in order

199 // to maintain omnibox responsiveness.

200 const size_t kItemsToScoreLimit = 500;

201 pre_filter_item_count_ = history_id_set.size();

202 // If we trim the results set we do not want to cache the results for next

203 // time as the user's ultimately desired result could easily be eliminated

204 // in this early rough filter.

205 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);

206 if (was_trimmed) {

207 HistoryIDVector history_ids;

208 std::copy(history_id_set.begin(), history_id_set.end(),

209 std::back_inserter(history_ids));

210 // Trim down the set by sorting by typed-count, visit-count, and last

211 // visit.

212 HistoryItemFactorGreater

213 item_factor_functor(history_info_map_);

214 std::partial_sort(history_ids.begin(),

215 history_ids.begin() + kItemsToScoreLimit,

216 history_ids.end(),

217 item_factor_functor);

218 history_id_set.clear();

219 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,

220 std::inserter(history_id_set, history_id_set.end()));

221 post_filter_item_count_ = history_id_set.size();

222 }

223

224 // Pass over all of the candidates filtering out any without a proper

225 // substring match, inserting those which pass in order by score. Note that

226 // in this step we are using the raw search string complete with escaped

227 // URL elements. When the user has specifically typed something akin to

228 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that

229 // specific substring appears in the URL or page title.

230

231 // We call these 'terms' (as opposed to 'words'; see above) as in this case

232 // we only want to break up the search string on 'true' whitespace rather than

233 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we

234 // get two 'terms': "colspec=id%20mstone" and "release".

235 String16Vector lower_raw_terms;

236 if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,

237 &lower_raw_terms) == 0) {

238 // Don't score matches when there are no terms to score against. (It's

239 // possible that the word break iterater that extracts words to search

240 // for in the database allows some whitespace "words" whereas Tokenize

241 // excludes a long list of whitespace.) One could write a scoring

242 // function that gives a reasonable order to matches when there

243 // are no terms (i.e., all the words are some form of whitespace),

244 // but this is such a rare edge case that it's not worth the time.

245 return scored_items;

246 }

247 scored_items =

248 std::for_each(

249 history_id_set.begin(), history_id_set.end(),

250 AddHistoryMatch(bookmark_model, *this, languages, lower_raw_string,

251 lower_raw_terms, base::Time::Now())).ScoredMatches();

252

253 // Select and sort only the top \|max_matches\| results.

254 if (scored_items.size() > max_matches) {

255 std::partial_sort(scored_items.begin(),

256 scored_items.begin() +

257 max_matches,

258 scored_items.end(),

259 ScoredHistoryMatch::MatchScoreGreater);

260 scored_items.resize(max_matches);

261 } else {

262 std::sort(scored_items.begin(), scored_items.end(),

263 ScoredHistoryMatch::MatchScoreGreater);

264 }

265 post_scoring_item_count_ = scored_items.size();

266

267 if (was_trimmed) {

268 search_term_cache_.clear(); // Invalidate the term cache.

269 } else {

270 // Remove any stale SearchTermCacheItems.

271 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

272 cache_iter != search_term_cache_.end(); ) {

273 if (!cache_iter->second.used_)

274 search_term_cache_.erase(cache_iter++);

275 else

276 ++cache_iter;

277 }

278 }

279

280 return scored_items;

281 }

282

283 bool URLIndexPrivateData::UpdateURL(

284 history::HistoryService* history_service,

285 const history::URLRow& row,

286 const std::string& languages,

287 const std::set<std::string>& scheme_whitelist,

288 base::CancelableTaskTracker* tracker) {

289 // The row may or may not already be in our index. If it is not already

290 // indexed and it qualifies then it gets indexed. If it is already

291 // indexed and still qualifies then it gets updated, otherwise it

292 // is deleted from the index.

293 bool row_was_updated = false;

294 history::URLID row_id = row.id();

295 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);

296 if (row_pos == history_info_map_.end()) {

297 // This new row should be indexed if it qualifies.

298 history::URLRow new_row(row);

299 new_row.set_id(row_id);

300 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&

301 IndexRow(NULL,

302 history_service,

303 new_row,

304 languages,

305 scheme_whitelist,

306 tracker);

307 } else if (RowQualifiesAsSignificant(row, base::Time())) {

308 // This indexed row still qualifies and will be re-indexed.

309 // The url won't have changed but the title, visit count, etc.

310 // might have changed.

311 history::URLRow& row_to_update = row_pos->second.url_row;

312 bool title_updated = row_to_update.title() != row.title();

313 if (row_to_update.visit_count() != row.visit_count() \|\|

314 row_to_update.typed_count() != row.typed_count() \|\|

315 row_to_update.last_visit() != row.last_visit() \|\| title_updated) {

316 row_to_update.set_visit_count(row.visit_count());

317 row_to_update.set_typed_count(row.typed_count());

318 row_to_update.set_last_visit(row.last_visit());

319 // If something appears to have changed, update the recent visits

320 // information.

321 ScheduleUpdateRecentVisits(history_service, row_id, tracker);

322 // While the URL is guaranteed to remain stable, the title may have

323 // changed. If so, then update the index with the changed words.

324 if (title_updated) {

325 // Clear all words associated with this row and re-index both the

326 // URL and title.

327 RemoveRowWordsFromIndex(row_to_update);

328 row_to_update.set_title(row.title());

329 RowWordStarts word_starts;

330 AddRowWordsToIndex(row_to_update, &word_starts, languages);

331 word_starts_map_[row_id] = word_starts;

332 }

333 row_was_updated = true;

334 }

335 } else {

336 // This indexed row no longer qualifies and will be de-indexed by

337 // clearing all words associated with this row.

338 RemoveRowFromIndex(row);

339 row_was_updated = true;

340 }

341 if (row_was_updated)

342 search_term_cache_.clear(); // This invalidates the cache.

343 return row_was_updated;

344 }

345

346 void URLIndexPrivateData::UpdateRecentVisits(

347 history::URLID url_id,

348 const history::VisitVector& recent_visits) {

349 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);

350 if (row_pos != history_info_map_.end()) {

351 VisitInfoVector* visits = &row_pos->second.visits;

352 visits->clear();

353 const size_t size =

354 std::min(recent_visits.size(), kMaxVisitsToStoreInCache);

355 visits->reserve(size);

356 for (size_t i = 0; i < size; i++) {

357 // Copy from the history::VisitVector the only fields visits needs.

358 visits->push_back(std::make_pair(recent_visits[i].visit_time,

359 recent_visits[i].transition));

360 }

361 }

362 // Else: Oddly, the URL doesn't seem to exist in the private index.

363 // Ignore this update. This can happen if, for instance, the user

364 // removes the URL from URLIndexPrivateData before the historyDB call

365 // returns.

366 }

367

368 void URLIndexPrivateData::ScheduleUpdateRecentVisits(

369 history::HistoryService* history_service,

370 history::URLID url_id,

371 base::CancelableTaskTracker* tracker) {

372 history_service->ScheduleDBTask(

373 scoped_ptr<history::HistoryDBTask>(

374 new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);

375 }

376

377 // Helper functor for DeleteURL.

378 class HistoryInfoMapItemHasURL {

379 public:

380 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}

381

382 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {

383 return item.second.url_row.url() == url_;

384 }

385

386 private:

387 const GURL& url_;

388 };

389

390 bool URLIndexPrivateData::DeleteURL(const GURL& url) {

391 // Find the matching entry in the history_info_map_.

392 HistoryInfoMap::iterator pos = std::find_if(

393 history_info_map_.begin(),

394 history_info_map_.end(),

395 HistoryInfoMapItemHasURL(url));

396 if (pos == history_info_map_.end())

397 return false;

398 RemoveRowFromIndex(pos->second.url_row);

399 search_term_cache_.clear(); // This invalidates the cache.

400 return true;

401 }

402

403 // static

404 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(

405 const base::FilePath& file_path,

406 const std::string& languages) {

407 base::TimeTicks beginning_time = base::TimeTicks::Now();

408 if (!base::PathExists(file_path))

409 return NULL;

410 std::string data;

411 // If there is no cache file then simply give up. This will cause us to

412 // attempt to rebuild from the history database.

413 if (!base::ReadFileToString(file_path, &data))

414 return NULL;

415

416 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);

417 InMemoryURLIndexCacheItem index_cache;

418 if (!index_cache.ParseFromArray(data.c_str(), data.size())) {

419 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "

420 << file_path.value();

421 return restored_data;

422 }

423

424 if (!restored_data->RestorePrivateData(index_cache, languages))

425 return NULL;

426

427 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",

428 base::TimeTicks::Now() - beginning_time);

429 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",

430 restored_data->history_id_word_map_.size());

431 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());

432 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",

433 restored_data->word_map_.size());

434 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",

435 restored_data->char_word_map_.size());

436 if (restored_data->Empty())

437 return NULL; // 'No data' is the same as a failed reload.

438 return restored_data;

439 }

440

441 // static

442 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(

443 history::HistoryDatabase* history_db,

444 const std::string& languages,

445 const std::set<std::string>& scheme_whitelist) {

446 if (!history_db)

447 return NULL;

448

449 base::TimeTicks beginning_time = base::TimeTicks::Now();

450

451 scoped_refptr<URLIndexPrivateData>

452 rebuilt_data(new URLIndexPrivateData);

453 history::URLDatabase::URLEnumerator history_enum;

454 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))

455 return NULL;

456 rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();

457 for (history::URLRow row; history_enum.GetNextURL(&row);) {

458 rebuilt_data->IndexRow(

459 history_db, NULL, row, languages, scheme_whitelist, NULL);

460 }

461

462 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",

463 base::TimeTicks::Now() - beginning_time);

464 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",

465 rebuilt_data->history_id_word_map_.size());

466 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",

467 rebuilt_data->word_map_.size());

468 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",

469 rebuilt_data->char_word_map_.size());

470 return rebuilt_data;

471 }

472

473 // static

474 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(

475 scoped_refptr<URLIndexPrivateData> private_data,

476 const base::FilePath& file_path) {

477 DCHECK(private_data.get());

478 DCHECK(!file_path.empty());

479 return private_data->SaveToFile(file_path);

480 }

481

482 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {

483 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;

484 data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;

485 data_copy->word_list_ = word_list_;

486 data_copy->available_words_ = available_words_;

487 data_copy->word_map_ = word_map_;

488 data_copy->char_word_map_ = char_word_map_;

489 data_copy->word_id_history_map_ = word_id_history_map_;

490 data_copy->history_id_word_map_ = history_id_word_map_;

491 data_copy->history_info_map_ = history_info_map_;

492 data_copy->word_starts_map_ = word_starts_map_;

493 return data_copy;

494 // Not copied:

495 // search_term_cache_

496 // pre_filter_item_count_

497 // post_filter_item_count_

498 // post_scoring_item_count_

499 }

500

501 bool URLIndexPrivateData::Empty() const {

502 return history_info_map_.empty();

503 }

504

505 void URLIndexPrivateData::Clear() {

506 last_time_rebuilt_from_history_ = base::Time();

507 word_list_.clear();

508 available_words_.clear();

509 word_map_.clear();

510 char_word_map_.clear();

511 word_id_history_map_.clear();

512 history_id_word_map_.clear();

513 history_info_map_.clear();

514 word_starts_map_.clear();

515 }

516

517 URLIndexPrivateData::~URLIndexPrivateData() {}

518

519 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(

520 const String16Vector& unsorted_words) {

521 // Break the terms down into individual terms (words), get the candidate

522 // set for each term, and intersect each to get a final candidate list.

523 // Note that a single 'term' from the user's perspective might be

524 // a string like "http://www.somewebsite.com" which, from our perspective,

525 // is four words: 'http', 'www', 'somewebsite', and 'com'.

526 HistoryIDSet history_id_set;

527 String16Vector words(unsorted_words);

528 // Sort the words into the longest first as such are likely to narrow down

529 // the results quicker. Also, single character words are the most expensive

530 // to process so save them for last.

531 std::sort(words.begin(), words.end(), LengthGreater);

532 for (String16Vector::iterator iter = words.begin(); iter != words.end();

533 ++iter) {

534 base::string16 uni_word = *iter;

535 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);

536 if (term_history_set.empty()) {

537 history_id_set.clear();

538 break;

539 }

540 if (iter == words.begin()) {

541 history_id_set.swap(term_history_set);

542 } else {

543 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(

544 history_id_set, term_history_set);

545 history_id_set.swap(new_history_id_set);

546 }

547 }

548 return history_id_set;

549 }

550

551 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(

552 const base::string16& term) {

553 if (term.empty())

554 return HistoryIDSet();

555

556 // TODO(mrossetti): Consider optimizing for very common terms such as

557 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently

558 // occuring words in the user's searches.

559

560 size_t term_length = term.length();

561 WordIDSet word_id_set;

562 if (term_length > 1) {

563 // See if this term or a prefix thereof is present in the cache.

564 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());

565 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();

566 cache_iter != search_term_cache_.end(); ++cache_iter) {

567 if (base::StartsWith(term, cache_iter->first, false) &&

568 (best_prefix == search_term_cache_.end() \|\|

569 cache_iter->first.length() > best_prefix->first.length()))

570 best_prefix = cache_iter;

571 }

572

573 // If a prefix was found then determine the leftover characters to be used

574 // for further refining the results from that prefix.

575 Char16Set prefix_chars;

576 base::string16 leftovers(term);

577 if (best_prefix != search_term_cache_.end()) {

578 // If the prefix is an exact match for the term then grab the cached

579 // results and we're done.

580 size_t prefix_length = best_prefix->first.length();

581 if (prefix_length == term_length) {

582 best_prefix->second.used_ = true;

583 return best_prefix->second.history_id_set_;

584 }

585

586 // Otherwise we have a handy starting point.

587 // If there are no history results for this prefix then we can bail early

588 // as there will be no history results for the full term.

589 if (best_prefix->second.history_id_set_.empty()) {

590 search_term_cache_[term] = SearchTermCacheItem();

591 return HistoryIDSet();

592 }

593 word_id_set = best_prefix->second.word_id_set_;

594 prefix_chars = Char16SetFromString16(best_prefix->first);

595 leftovers = term.substr(prefix_length);

596 }

597

598 // Filter for each remaining, unique character in the term.

599 Char16Set leftover_chars = Char16SetFromString16(leftovers);

600 Char16Set unique_chars =

601 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);

602

603 // Reduce the word set with any leftover, unprocessed characters.

604 if (!unique_chars.empty()) {

605 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));

606 // We might come up empty on the leftovers.

607 if (leftover_set.empty()) {

608 search_term_cache_[term] = SearchTermCacheItem();

609 return HistoryIDSet();

610 }

611 // Or there may not have been a prefix from which to start.

612 if (prefix_chars.empty()) {

613 word_id_set.swap(leftover_set);

614 } else {

615 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(

616 word_id_set, leftover_set);

617 word_id_set.swap(new_word_id_set);

618 }

619 }

620

621 // We must filter the word list because the resulting word set surely

622 // contains words which do not have the search term as a proper subset.

623 for (WordIDSet::iterator word_set_iter = word_id_set.begin();

624 word_set_iter != word_id_set.end(); ) {

625 if (word_list_[*word_set_iter].find(term) == base::string16::npos)

626 word_id_set.erase(word_set_iter++);

627 else

628 ++word_set_iter;

629 }

630 } else {

631 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));

632 }

633

634 // If any words resulted then we can compose a set of history IDs by unioning

635 // the sets from each word.

636 HistoryIDSet history_id_set;

637 if (!word_id_set.empty()) {

638 for (WordIDSet::iterator word_id_iter = word_id_set.begin();

639 word_id_iter != word_id_set.end(); ++word_id_iter) {

640 WordID word_id = *word_id_iter;

641 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);

642 if (word_iter != word_id_history_map_.end()) {

643 HistoryIDSet& word_history_id_set(word_iter->second);

644 history_id_set.insert(word_history_id_set.begin(),

645 word_history_id_set.end());

646 }

647 }

648 }

649

650 // Record a new cache entry for this word if the term is longer than

651 // a single character.

652 if (term_length > 1)

653 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

654

655 return history_id_set;

656 }

657

658 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(

659 const Char16Set& term_chars) {

660 WordIDSet word_id_set;

661 for (Char16Set::const_iterator c_iter = term_chars.begin();

662 c_iter != term_chars.end(); ++c_iter) {

663 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);

664 if (char_iter == char_word_map_.end()) {

665 // A character was not found so there are no matching results: bail.

666 word_id_set.clear();

667 break;

668 }

669 WordIDSet& char_word_id_set(char_iter->second);

670 // It is possible for there to no longer be any words associated with

671 // a particular character. Give up in that case.

672 if (char_word_id_set.empty()) {

673 word_id_set.clear();

674 break;

675 }

676

677 if (c_iter == term_chars.begin()) {

678 // First character results becomes base set of results.

679 word_id_set = char_word_id_set;

680 } else {

681 // Subsequent character results get intersected in.

682 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(

683 word_id_set, char_word_id_set);

684 word_id_set.swap(new_word_id_set);

685 }

686 }

687 return word_id_set;

688 }

689

690 bool URLIndexPrivateData::IndexRow(

691 history::HistoryDatabase* history_db,

692 history::HistoryService* history_service,

693 const history::URLRow& row,

694 const std::string& languages,

695 const std::set<std::string>& scheme_whitelist,

696 base::CancelableTaskTracker* tracker) {

697 const GURL& gurl(row.url());

698

699 // Index only URLs with a whitelisted scheme.

700 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))

701 return false;

702

703 history::URLID row_id = row.id();

704 // Strip out username and password before saving and indexing.

705 base::string16 url(net::FormatUrl(gurl, languages,

706 net::kFormatUrlOmitUsernamePassword,

707 net::UnescapeRule::NONE,

708 NULL, NULL, NULL));

709

710 HistoryID history_id = static_cast<HistoryID>(row_id);

711 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

712

713 // Add the row for quick lookup in the history info store.

714 history::URLRow new_row(GURL(url), row_id);

715 new_row.set_visit_count(row.visit_count());

716 new_row.set_typed_count(row.typed_count());

717 new_row.set_last_visit(row.last_visit());

718 new_row.set_title(row.title());

719 history_info_map_[history_id].url_row = new_row;

720

721 // Index the words contained in the URL and title of the row.

722 RowWordStarts word_starts;

723 AddRowWordsToIndex(new_row, &word_starts, languages);

724 word_starts_map_[history_id] = word_starts;

725

726 // Update the recent visits information or schedule the update

727 // as appropriate.

728 if (history_db) {

729 // We'd like to check that we're on the history DB thread.

730 // However, unittest code actually calls this on the UI thread.

731 // So we don't do any thread checks.

732 history::VisitVector recent_visits;

733 // Make sure the private data is going to get as many recent visits as

734 // ScoredHistoryMatch::GetFrequency() hopes to use.

735 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);

736 if (history_db->GetMostRecentVisitsForURL(row_id,

737 kMaxVisitsToStoreInCache,

738 &recent_visits))

739 UpdateRecentVisits(row_id, recent_visits);

740 } else {

741 DCHECK(tracker);

742 DCHECK(history_service);

743 ScheduleUpdateRecentVisits(history_service, row_id, tracker);

744 }

745

746 return true;

747 }

748

749 void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow& row,

750 RowWordStarts* word_starts,

751 const std::string& languages) {

752 HistoryID history_id = static_cast<HistoryID>(row.id());

753 // Split URL into individual, unique words then add in the title words.

754 const GURL& gurl(row.url());

755 const base::string16& url =

756 bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);

757 String16Set url_words = String16SetFromString16(url,

758 word_starts ? &word_starts->url_word_starts_ : NULL);

759 const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());

760 String16Set title_words = String16SetFromString16(title,

761 word_starts ? &word_starts->title_word_starts_ : NULL);

762 String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);

763 for (String16Set::iterator word_iter = words.begin();

764 word_iter != words.end(); ++word_iter)

765 AddWordToIndex(*word_iter, history_id);

766

767 search_term_cache_.clear(); // Invalidate the term cache.

768 }

769

770 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,

771 HistoryID history_id) {

772 WordMap::iterator word_pos = word_map_.find(term);

773 if (word_pos != word_map_.end())

774 UpdateWordHistory(word_pos->second, history_id);

775 else

776 AddWordHistory(term, history_id);

777 }

778

779 void URLIndexPrivateData::AddWordHistory(const base::string16& term,

780 HistoryID history_id) {

781 WordID word_id = word_list_.size();

782 if (available_words_.empty()) {

783 word_list_.push_back(term);

784 } else {

785 word_id = *(available_words_.begin());

786 word_list_[word_id] = term;

787 available_words_.erase(word_id);

788 }

789 word_map_[term] = word_id;

790

791 HistoryIDSet history_id_set;

792 history_id_set.insert(history_id);

793 word_id_history_map_[word_id] = history_id_set;

794 AddToHistoryIDWordMap(history_id, word_id);

795

796 // For each character in the newly added word (i.e. a word that is not

797 // already in the word index), add the word to the character index.

798 Char16Set characters = Char16SetFromString16(term);

799 for (Char16Set::iterator uni_char_iter = characters.begin();

800 uni_char_iter != characters.end(); ++uni_char_iter) {

801 base::char16 uni_char = *uni_char_iter;

802 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);

803 if (char_iter != char_word_map_.end()) {

804 // Update existing entry in the char/word index.

805 WordIDSet& word_id_set(char_iter->second);

806 word_id_set.insert(word_id);

807 } else {

808 // Create a new entry in the char/word index.

809 WordIDSet word_id_set;

810 word_id_set.insert(word_id);

811 char_word_map_[uni_char] = word_id_set;

812 }

813 }

814 }

815

816 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,

817 HistoryID history_id) {

818 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);

819 DCHECK(history_pos != word_id_history_map_.end());

820 HistoryIDSet& history_id_set(history_pos->second);

821 history_id_set.insert(history_id);

822 AddToHistoryIDWordMap(history_id, word_id);

823 }

824

825 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,

826 WordID word_id) {

827 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);

828 if (iter != history_id_word_map_.end()) {

829 WordIDSet& word_id_set(iter->second);

830 word_id_set.insert(word_id);

831 } else {

832 WordIDSet word_id_set;

833 word_id_set.insert(word_id);

834 history_id_word_map_[history_id] = word_id_set;

835 }

836 }

837

838 void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow& row) {

839 RemoveRowWordsFromIndex(row);

840 HistoryID history_id = static_cast<HistoryID>(row.id());

841 history_info_map_.erase(history_id);

842 word_starts_map_.erase(history_id);

843 }

844

845 void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow& row) {

846 // Remove the entries in history_id_word_map_ and word_id_history_map_ for

847 // this row.

848 HistoryID history_id = static_cast<HistoryID>(row.id());

849 WordIDSet word_id_set = history_id_word_map_[history_id];

850 history_id_word_map_.erase(history_id);

851

852 // Reconcile any changes to word usage.

853 for (WordIDSet::iterator word_id_iter = word_id_set.begin();

854 word_id_iter != word_id_set.end(); ++word_id_iter) {

855 WordID word_id = *word_id_iter;

856 word_id_history_map_[word_id].erase(history_id);

857 if (!word_id_history_map_[word_id].empty())

858 continue; // The word is still in use.

859

860 // The word is no longer in use. Reconcile any changes to character usage.

861 base::string16 word = word_list_[word_id];

862 Char16Set characters = Char16SetFromString16(word);

863 for (Char16Set::iterator uni_char_iter = characters.begin();

864 uni_char_iter != characters.end(); ++uni_char_iter) {

865 base::char16 uni_char = *uni_char_iter;

866 char_word_map_[uni_char].erase(word_id);

867 if (char_word_map_[uni_char].empty())

868 char_word_map_.erase(uni_char); // No longer in use.

869 }

870

871 // Complete the removal of references to the word.

872 word_id_history_map_.erase(word_id);

873 word_map_.erase(word);

874 word_list_[word_id] = base::string16();

875 available_words_.insert(word_id);

876 }

877 }

878

879 void URLIndexPrivateData::ResetSearchTermCache() {

880 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();

881 iter != search_term_cache_.end(); ++iter)

882 iter->second.used_ = false;

883 }

884

885 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {

886 base::TimeTicks beginning_time = base::TimeTicks::Now();

887 InMemoryURLIndexCacheItem index_cache;

888 SavePrivateData(&index_cache);

889 std::string data;

890 if (!index_cache.SerializeToString(&data)) {

891 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";

892 return false;

893 }

894

895 int size = data.size();

896 if (base::WriteFile(file_path, data.c_str(), size) != size) {

897 LOG(WARNING) << "Failed to write " << file_path.value();

898 return false;

899 }

900 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",

901 base::TimeTicks::Now() - beginning_time);

902 return true;

903 }

904

905 void URLIndexPrivateData::SavePrivateData(

906 InMemoryURLIndexCacheItem* cache) const {

907 DCHECK(cache);

908 cache->set_last_rebuild_timestamp(

909 last_time_rebuilt_from_history_.ToInternalValue());

910 cache->set_version(saved_cache_version_);

911 // history_item_count_ is no longer used but rather than change the protobuf

912 // definition use a placeholder. This will go away with the switch to SQLite.

913 cache->set_history_item_count(0);

914 SaveWordList(cache);

915 SaveWordMap(cache);

916 SaveCharWordMap(cache);

917 SaveWordIDHistoryMap(cache);

918 SaveHistoryInfoMap(cache);

919 SaveWordStartsMap(cache);

920 }

921

922 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {

923 if (word_list_.empty())

924 return;

925 WordListItem* list_item = cache->mutable_word_list();

926 list_item->set_word_count(word_list_.size());

927 for (String16Vector::const_iterator iter = word_list_.begin();

928 iter != word_list_.end(); ++iter)

929 list_item->add_word(base::UTF16ToUTF8(*iter));

930 }

931

932 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {

933 if (word_map_.empty())

934 return;

935 WordMapItem* map_item = cache->mutable_word_map();

936 map_item->set_item_count(word_map_.size());

937 for (WordMap::const_iterator iter = word_map_.begin();

938 iter != word_map_.end(); ++iter) {

939 WordMapEntry* map_entry = map_item->add_word_map_entry();

940 map_entry->set_word(base::UTF16ToUTF8(iter->first));

941 map_entry->set_word_id(iter->second);

942 }

943 }

944

945 void URLIndexPrivateData::SaveCharWordMap(

946 InMemoryURLIndexCacheItem* cache) const {

947 if (char_word_map_.empty())

948 return;

949 CharWordMapItem* map_item = cache->mutable_char_word_map();

950 map_item->set_item_count(char_word_map_.size());

951 for (CharWordIDMap::const_iterator iter = char_word_map_.begin();

952 iter != char_word_map_.end(); ++iter) {

953 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();

954 map_entry->set_char_16(iter->first);

955 const WordIDSet& word_id_set(iter->second);

956 map_entry->set_item_count(word_id_set.size());

957 for (WordIDSet::const_iterator set_iter = word_id_set.begin();

958 set_iter != word_id_set.end(); ++set_iter)

959 map_entry->add_word_id(*set_iter);

960 }

961 }

962

963 void URLIndexPrivateData::SaveWordIDHistoryMap(

964 InMemoryURLIndexCacheItem* cache) const {

965 if (word_id_history_map_.empty())

966 return;

967 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();

968 map_item->set_item_count(word_id_history_map_.size());

969 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();

970 iter != word_id_history_map_.end(); ++iter) {

971 WordIDHistoryMapEntry* map_entry =

972 map_item->add_word_id_history_map_entry();

973 map_entry->set_word_id(iter->first);

974 const HistoryIDSet& history_id_set(iter->second);

975 map_entry->set_item_count(history_id_set.size());

976 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();

977 set_iter != history_id_set.end(); ++set_iter)

978 map_entry->add_history_id(*set_iter);

979 }

980 }

981

982 void URLIndexPrivateData::SaveHistoryInfoMap(

983 InMemoryURLIndexCacheItem* cache) const {

984 if (history_info_map_.empty())

985 return;

986 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();

987 map_item->set_item_count(history_info_map_.size());

988 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();

989 iter != history_info_map_.end(); ++iter) {

990 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();

991 map_entry->set_history_id(iter->first);

992 const history::URLRow& url_row(iter->second.url_row);

993 // Note: We only save information that contributes to the index so there

994 // is no need to save search_term_cache_ (not persistent).

995 map_entry->set_visit_count(url_row.visit_count());

996 map_entry->set_typed_count(url_row.typed_count());

997 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());

998 map_entry->set_url(url_row.url().spec());

999 map_entry->set_title(base::UTF16ToUTF8(url_row.title()));

1000 const VisitInfoVector& visits(iter->second.visits);

1001 for (VisitInfoVector::const_iterator visit_iter = visits.begin();

1002 visit_iter != visits.end(); ++visit_iter) {

1003 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();

1004 visit_info->set_visit_time(visit_iter->first.ToInternalValue());

1005 visit_info->set_transition_type(visit_iter->second);

1006 }

1007 }

1008 }

1009

1010 void URLIndexPrivateData::SaveWordStartsMap(

1011 InMemoryURLIndexCacheItem* cache) const {

1012 if (word_starts_map_.empty())

1013 return;

1014 // For unit testing: Enable saving of the cache as an earlier version to

1015 // allow testing of cache file upgrading in ReadFromFile().

1016 // TODO(mrossetti): Instead of intruding on production code with this kind of

1017 // test harness, save a copy of an older version cache with known results.

1018 // Implement this when switching the caching over to SQLite.

1019 if (saved_cache_version_ < 1)

1020 return;

1021

1022 WordStartsMapItem* map_item = cache->mutable_word_starts_map();

1023 map_item->set_item_count(word_starts_map_.size());

1024 for (WordStartsMap::const_iterator iter = word_starts_map_.begin();

1025 iter != word_starts_map_.end(); ++iter) {

1026 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();

1027 map_entry->set_history_id(iter->first);

1028 const RowWordStarts& word_starts(iter->second);

1029 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();

1030 i != word_starts.url_word_starts_.end(); ++i)

1031 map_entry->add_url_word_starts(*i);

1032 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();

1033 i != word_starts.title_word_starts_.end(); ++i)

1034 map_entry->add_title_word_starts(*i);

1035 }

1036 }

1037

1038 bool URLIndexPrivateData::RestorePrivateData(

1039 const InMemoryURLIndexCacheItem& cache,

1040 const std::string& languages) {

1041 last_time_rebuilt_from_history_ =

1042 base::Time::FromInternalValue(cache.last_rebuild_timestamp());

1043 const base::TimeDelta rebuilt_ago =

1044 base::Time::Now() - last_time_rebuilt_from_history_;

1045 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) \|\|

1046 (rebuilt_ago < base::TimeDelta::FromDays(-1))) {

1047 // Cache is more than a week old or, somehow, from some time in the future.

1048 // It's probably a good time to rebuild the index from history to

1049 // allow synced entries to now appear, expired entries to disappear, etc.

1050 // Allow one day in the future to make the cache not rebuild on simple

1051 // system clock changes such as time zone changes.

1052 return false;

1053 }

1054 if (cache.has_version()) {

1055 if (cache.version() < kCurrentCacheFileVersion) {

1056 // Don't try to restore an old format cache file. (This will cause

1057 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData

1058 // from history.)

1059 return false;

1060 }

1061 restored_cache_version_ = cache.version();

1062 }

1063 return RestoreWordList(cache) && RestoreWordMap(cache) &&

1064 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&

1065 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);

1066 }

1067

1068 bool URLIndexPrivateData::RestoreWordList(

1069 const InMemoryURLIndexCacheItem& cache) {

1070 if (!cache.has_word_list())

1071 return false;

1072 const WordListItem& list_item(cache.word_list());

1073 uint32 expected_item_count = list_item.word_count();

1074 uint32 actual_item_count = list_item.word_size();

1075 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1076 return false;

1077 const RepeatedPtrField<std::string>& words(list_item.word());

1078 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();

1079 iter != words.end(); ++iter)

1080 word_list_.push_back(base::UTF8ToUTF16(*iter));

1081 return true;

1082 }

1083

1084 bool URLIndexPrivateData::RestoreWordMap(

1085 const InMemoryURLIndexCacheItem& cache) {

1086 if (!cache.has_word_map())

1087 return false;

1088 const WordMapItem& list_item(cache.word_map());

1089 uint32 expected_item_count = list_item.item_count();

1090 uint32 actual_item_count = list_item.word_map_entry_size();

1091 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1092 return false;

1093 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());

1094 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();

1095 iter != entries.end(); ++iter)

1096 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();

1097 return true;

1098 }

1099

1100 bool URLIndexPrivateData::RestoreCharWordMap(

1101 const InMemoryURLIndexCacheItem& cache) {

1102 if (!cache.has_char_word_map())

1103 return false;

1104 const CharWordMapItem& list_item(cache.char_word_map());

1105 uint32 expected_item_count = list_item.item_count();

1106 uint32 actual_item_count = list_item.char_word_map_entry_size();

1107 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1108 return false;

1109 const RepeatedPtrField<CharWordMapEntry>&

1110 entries(list_item.char_word_map_entry());

1111 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =

1112 entries.begin(); iter != entries.end(); ++iter) {

1113 expected_item_count = iter->item_count();

1114 actual_item_count = iter->word_id_size();

1115 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1116 return false;

1117 base::char16 uni_char = static_cast<base::char16>(iter->char_16());

1118 WordIDSet word_id_set;

1119 const RepeatedField<int32>& word_ids(iter->word_id());

1120 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();

1121 jiter != word_ids.end(); ++jiter)

1122 word_id_set.insert(*jiter);

1123 char_word_map_[uni_char] = word_id_set;

1124 }

1125 return true;

1126 }

1127

1128 bool URLIndexPrivateData::RestoreWordIDHistoryMap(

1129 const InMemoryURLIndexCacheItem& cache) {

1130 if (!cache.has_word_id_history_map())

1131 return false;

1132 const WordIDHistoryMapItem& list_item(cache.word_id_history_map());

1133 uint32 expected_item_count = list_item.item_count();

1134 uint32 actual_item_count = list_item.word_id_history_map_entry_size();

1135 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1136 return false;

1137 const RepeatedPtrField<WordIDHistoryMapEntry>&

1138 entries(list_item.word_id_history_map_entry());

1139 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =

1140 entries.begin(); iter != entries.end(); ++iter) {

1141 expected_item_count = iter->item_count();

1142 actual_item_count = iter->history_id_size();

1143 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1144 return false;

1145 WordID word_id = iter->word_id();

1146 HistoryIDSet history_id_set;

1147 const RepeatedField<int64>& history_ids(iter->history_id());

1148 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();

1149 jiter != history_ids.end(); ++jiter) {

1150 history_id_set.insert(*jiter);

1151 AddToHistoryIDWordMap(*jiter, word_id);

1152 }

1153 word_id_history_map_[word_id] = history_id_set;

1154 }

1155 return true;

1156 }

1157

1158 bool URLIndexPrivateData::RestoreHistoryInfoMap(

1159 const InMemoryURLIndexCacheItem& cache) {

1160 if (!cache.has_history_info_map())

1161 return false;

1162 const HistoryInfoMapItem& list_item(cache.history_info_map());

1163 uint32 expected_item_count = list_item.item_count();

1164 uint32 actual_item_count = list_item.history_info_map_entry_size();

1165 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1166 return false;

1167 const RepeatedPtrField<HistoryInfoMapEntry>&

1168 entries(list_item.history_info_map_entry());

1169 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =

1170 entries.begin(); iter != entries.end(); ++iter) {

1171 HistoryID history_id = iter->history_id();

1172 GURL url(iter->url());

1173 history::URLRow url_row(url, history_id);

1174 url_row.set_visit_count(iter->visit_count());

1175 url_row.set_typed_count(iter->typed_count());

1176 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));

1177 if (iter->has_title()) {

1178 base::string16 title(base::UTF8ToUTF16(iter->title()));

1179 url_row.set_title(title);

1180 }

1181 history_info_map_[history_id].url_row = url_row;

1182

1183 // Restore visits list.

1184 VisitInfoVector visits;

1185 visits.reserve(iter->visits_size());

1186 for (int i = 0; i < iter->visits_size(); ++i) {

1187 visits.push_back(std::make_pair(

1188 base::Time::FromInternalValue(iter->visits(i).visit_time()),

1189 ui::PageTransitionFromInt(iter->visits(i).transition_type())));

1190 }

1191 history_info_map_[history_id].visits = visits;

1192 }

1193 return true;

1194 }

1195

1196 bool URLIndexPrivateData::RestoreWordStartsMap(

1197 const InMemoryURLIndexCacheItem& cache,

1198 const std::string& languages) {

1199 // Note that this function must be called after RestoreHistoryInfoMap() has

1200 // been run as the word starts may have to be recalculated from the urls and

1201 // page titles.

1202 if (cache.has_word_starts_map()) {

1203 const WordStartsMapItem& list_item(cache.word_starts_map());

1204 uint32 expected_item_count = list_item.item_count();

1205 uint32 actual_item_count = list_item.word_starts_map_entry_size();

1206 if (actual_item_count == 0 \|\| actual_item_count != expected_item_count)

1207 return false;

1208 const RepeatedPtrField<WordStartsMapEntry>&

1209 entries(list_item.word_starts_map_entry());

1210 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =

1211 entries.begin(); iter != entries.end(); ++iter) {

1212 HistoryID history_id = iter->history_id();

1213 RowWordStarts word_starts;

1214 // Restore the URL word starts.

1215 const RepeatedField<int32>& url_starts(iter->url_word_starts());

1216 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();

1217 jiter != url_starts.end(); ++jiter)

1218 word_starts.url_word_starts_.push_back(*jiter);

1219 // Restore the page title word starts.

1220 const RepeatedField<int32>& title_starts(iter->title_word_starts());

1221 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();

1222 jiter != title_starts.end(); ++jiter)

1223 word_starts.title_word_starts_.push_back(*jiter);

1224 word_starts_map_[history_id] = word_starts;

1225 }

1226 } else {

1227 // Since the cache did not contain any word starts we must rebuild then from

1228 // the URL and page titles.

1229 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();

1230 iter != history_info_map_.end(); ++iter) {

1231 RowWordStarts word_starts;

1232 const history::URLRow& row(iter->second.url_row);

1233 const base::string16& url =

1234 bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);

1235 String16VectorFromString16(url, false, &word_starts.url_word_starts_);

1236 const base::string16& title =

1237 bookmarks::CleanUpTitleForMatching(row.title());

1238 String16VectorFromString16(title, false, &word_starts.title_word_starts_);

1239 word_starts_map_[iter->first] = word_starts;

1240 }

1241 }

1242 return true;

1243 }

1244

1245 // static

1246 bool URLIndexPrivateData::URLSchemeIsWhitelisted(

1247 const GURL& gurl,

1248 const std::set<std::string>& whitelist) {

1249 return whitelist.find(gurl.scheme()) != whitelist.end();

1250 }

1251

1252

1253 // SearchTermCacheItem ---------------------------------------------------------

1254

1255 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(

1256 const WordIDSet& word_id_set,

1257 const HistoryIDSet& history_id_set)

1258 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {

1259 }

1260

1261 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {

1262 }

1263

1264 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {

1265 }

1266

1267 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------

1268

1269 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(

1270 bookmarks::BookmarkModel* bookmark_model,

1271 const URLIndexPrivateData& private_data,

1272 const std::string& languages,

1273 const base::string16& lower_string,

1274 const String16Vector& lower_terms,

1275 const base::Time now)

1276 : bookmark_model_(bookmark_model),

1277 private_data_(private_data),

1278 languages_(languages),

1279 lower_string_(lower_string),

1280 lower_terms_(lower_terms),

1281 now_(now) {

1282 // Calculate offsets for each term. For instance, the offset for

1283 // ".net" should be 1, indicating that the actual word-part of the term

1284 // starts at offset 1.

1285 lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);

1286 for (size_t i = 0; i < lower_terms_.size(); ++i) {

1287 base::i18n::BreakIterator iter(lower_terms_[i],

1288 base::i18n::BreakIterator::BREAK_WORD);

1289 // If the iterator doesn't work, assume an offset of 0.

1290 if (!iter.Init())

1291 continue;

1292 // Find the first word start. If the iterator didn't find a word break, set

1293 // an offset of term size. For example, the offset for "://" should be 3,

1294 // indicating that the word-part is missing.

1295 while (iter.Advance() && !iter.IsWord()) {}

1296

1297 lower_terms_to_word_starts_offsets_[i] = iter.prev();

1298 }

1299 }

1300

1301 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {

1302 }

1303

1304 void URLIndexPrivateData::AddHistoryMatch::operator()(

1305 const HistoryID history_id) {

1306 HistoryInfoMap::const_iterator hist_pos =

1307 private_data_.history_info_map_.find(history_id);

1308 if (hist_pos != private_data_.history_info_map_.end()) {

1309 const history::URLRow& hist_item = hist_pos->second.url_row;

1310 const VisitInfoVector& visits = hist_pos->second.visits;

1311 WordStartsMap::const_iterator starts_pos =

1312 private_data_.word_starts_map_.find(history_id);

1313 DCHECK(starts_pos != private_data_.word_starts_map_.end());

1314 ScoredHistoryMatch match(

1315 hist_item, visits, languages_, lower_string_, lower_terms_,

1316 lower_terms_to_word_starts_offsets_, starts_pos->second,

1317 bookmark_model_ && bookmark_model_->IsBookmarked(hist_item.url()),

1318 now_);

1319 if (match.raw_score > 0)

1320 scored_matches_.push_back(match);

1321 }

1322 }

1323

1324

1325 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------

1326

1327 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(

1328 const HistoryInfoMap& history_info_map)

1329 : history_info_map_(history_info_map) {

1330 }

1331

1332 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {

1333 }

1334

1335 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(

1336 const HistoryID h1,

1337 const HistoryID h2) {

1338 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));

1339 if (entry1 == history_info_map_.end())

1340 return false;

1341 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));

1342 if (entry2 == history_info_map_.end())

1343 return true;

1344 const history::URLRow& r1(entry1->second.url_row);

1345 const history::URLRow& r2(entry2->second.url_row);

1346 // First cut: typed count, visit count, recency.

1347 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks

1348 // recently visited (within the last 12/24 hours) as highly important. Get

1349 // input from mpearson.

1350 if (r1.typed_count() != r2.typed_count())

1351 return (r1.typed_count() > r2.typed_count());

1352 if (r1.visit_count() != r2.visit_count())

1353 return (r1.visit_count() > r2.visit_count());

1354 return (r1.last_visit() > r2.last_visit());

1355 }

OLD	NEW

« no previous file with comments | « chrome/browser/autocomplete/url_index_private_data.h ('k') | chrome/browser/ui/omnibox/omnibox_view_browsertest.cc » ('j') | no next file with comments »