OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/history/url_index_private_data.h" | |
6 | |
7 #include <functional> | |
8 #include <iterator> | |
9 #include <limits> | |
10 #include <numeric> | |
11 #include <string> | |
12 #include <vector> | |
13 | |
14 #include "base/basictypes.h" | |
15 #include "base/files/file_util.h" | |
16 #include "base/i18n/break_iterator.h" | |
17 #include "base/i18n/case_conversion.h" | |
18 #include "base/metrics/histogram.h" | |
19 #include "base/strings/string_util.h" | |
20 #include "base/strings/utf_string_conversions.h" | |
21 #include "base/time/time.h" | |
22 #include "chrome/browser/history/history_service.h" | |
23 #include "chrome/browser/history/in_memory_url_index.h" | |
24 #include "components/bookmarks/browser/bookmark_utils.h" | |
25 #include "components/history/core/browser/history_database.h" | |
26 #include "components/history/core/browser/history_db_task.h" | |
27 #include "net/base/net_util.h" | |
28 | |
29 #if defined(USE_SYSTEM_PROTOBUF) | |
30 #include <google/protobuf/repeated_field.h> | |
31 #else | |
32 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" | |
33 #endif | |
34 | |
35 using google::protobuf::RepeatedField; | |
36 using google::protobuf::RepeatedPtrField; | |
37 using in_memory_url_index::InMemoryURLIndexCacheItem; | |
38 | |
39 namespace { | |
40 static const size_t kMaxVisitsToStoreInCache = 10u; | |
41 } // anonymous namespace | |
42 | |
43 namespace history { | |
44 | |
45 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; | |
46 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; | |
47 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; | |
48 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; | |
49 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry | |
50 CharWordMapEntry; | |
51 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem | |
52 WordIDHistoryMapItem; | |
53 typedef imui:: | |
54 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry | |
55 WordIDHistoryMapEntry; | |
56 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; | |
57 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry | |
58 HistoryInfoMapEntry; | |
59 typedef imui:: | |
60 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo | |
61 HistoryInfoMapEntry_VisitInfo; | |
62 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem; | |
63 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry | |
64 WordStartsMapEntry; | |
65 | |
66 | |
67 // Algorithm Functions --------------------------------------------------------- | |
68 | |
69 // Comparison function for sorting search terms by descending length. | |
70 bool LengthGreater(const base::string16& string_a, | |
71 const base::string16& string_b) { | |
72 return string_a.length() > string_b.length(); | |
73 } | |
74 | |
75 | |
76 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- | |
77 | |
// HistoryDBTask used to update the recent visit data for a particular
// row from the history database. The visits are fetched on the history
// DB thread (RunOnDBThread) and handed back to the private data on the
// main thread (DoneRunOnMainThread).
class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
 public:
  explicit UpdateRecentVisitsFromHistoryDBTask(
      URLIndexPrivateData* private_data,
      URLID url_id);

  bool RunOnDBThread(HistoryBackend* backend,
                     history::HistoryDatabase* db) override;
  void DoneRunOnMainThread() override;

 private:
  ~UpdateRecentVisitsFromHistoryDBTask() override;

  // The URLIndexPrivateData that gets updated after the historyDB
  // task returns. Not owned; presumably guaranteed by the caller to
  // outlive this task -- TODO confirm.
  URLIndexPrivateData* private_data_;
  // The ID of the URL to get visits for and then update.
  URLID url_id_;
  // Whether fetching the recent visits for the URL succeeded.
  bool succeeded_;
  // The awaited data that's shown to private_data_ for it to copy and
  // store.
  VisitVector recent_visits_;

  DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
};
106 | |
107 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask( | |
108 URLIndexPrivateData* private_data, | |
109 URLID url_id) | |
110 : private_data_(private_data), | |
111 url_id_(url_id), | |
112 succeeded_(false) { | |
113 } | |
114 | |
115 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread( | |
116 HistoryBackend* backend, | |
117 HistoryDatabase* db) { | |
118 // Make sure the private data is going to get as many recent visits as | |
119 // ScoredHistoryMatch::GetFrequency() hopes to use. | |
120 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore); | |
121 succeeded_ = db->GetMostRecentVisitsForURL(url_id_, | |
122 kMaxVisitsToStoreInCache, | |
123 &recent_visits_); | |
124 if (!succeeded_) | |
125 recent_visits_.clear(); | |
126 return true; // Always claim to be done; do not retry failures. | |
127 } | |
128 | |
129 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() { | |
130 if (succeeded_) | |
131 private_data_->UpdateRecentVisits(url_id_, recent_visits_); | |
132 } | |
133 | |
// Out-of-line so the vtable/dtor is emitted in this translation unit;
// all members clean themselves up.
UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
}
136 | |
137 | |
138 // URLIndexPrivateData --------------------------------------------------------- | |
139 | |
// Zeroes the diagnostic counters and cache-version bookkeeping; the
// index containers all start empty.
URLIndexPrivateData::URLIndexPrivateData()
    : restored_cache_version_(0),
      saved_cache_version_(kCurrentCacheFileVersion),
      pre_filter_item_count_(0),
      post_filter_item_count_(0),
      post_scoring_item_count_(0) {
}
147 | |
// Scores indexed history items against |search_string| and returns at most
// |max_matches| matches, best score first. |search_string| is taken by
// value because it may be modified below (a space is inserted at the
// cursor position). Also updates the pre/post filter/scoring counters and
// maintains the search-term cache via mark-and-sweep.
ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    base::string16 search_string,
    size_t cursor_position,
    size_t max_matches,
    const std::string& languages,
    const history::ScoredHistoryMatch::Builder& builder) {
  // If cursor position is set and useful (not at either end of the
  // string), allow the search string to be broken at cursor position.
  // We do this by pretending there's a space where the cursor is.
  if ((cursor_position != base::string16::npos) &&
      (cursor_position < search_string.length()) &&
      (cursor_position > 0)) {
    search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
  }
  pre_filter_item_count_ = 0;
  post_filter_item_count_ = 0;
  post_scoring_item_count_ = 0;
  // The search string we receive may contain escaped characters. For reducing
  // the index we need individual, lower-cased words, ignoring escapings. For
  // the final filtering we need whitespace separated substrings possibly
  // containing escaped characters.
  base::string16 lower_raw_string(base::i18n::ToLower(search_string));
  base::string16 lower_unescaped_string =
      net::UnescapeURLComponent(lower_raw_string,
          net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
  // Extract individual 'words' (as opposed to 'terms'; see below) from the
  // search string. When the user types "colspec=ID%20Mstone Release" we get
  // four 'words': "colspec", "id", "mstone" and "release".
  String16Vector lower_words(
      history::String16VectorFromString16(lower_unescaped_string, false, NULL));
  ScoredHistoryMatches scored_items;

  // Do nothing if we have indexed no words (probably because we've not been
  // initialized yet) or the search string has no words.
  if (word_list_.empty() || lower_words.empty()) {
    search_term_cache_.clear();  // Invalidate the term cache.
    return scored_items;
  }

  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
  // approach.
  ResetSearchTermCache();

  HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

  // Trim the candidate pool if it is large. Note that we do not filter out
  // items that do not contain the search terms as proper substrings -- doing
  // so is the performance-costly operation we are trying to avoid in order
  // to maintain omnibox responsiveness.
  const size_t kItemsToScoreLimit = 500;
  pre_filter_item_count_ = history_id_set.size();
  // If we trim the results set we do not want to cache the results for next
  // time as the user's ultimately desired result could easily be eliminated
  // in this early rough filter.
  bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
  if (was_trimmed) {
    HistoryIDVector history_ids;
    std::copy(history_id_set.begin(), history_id_set.end(),
              std::back_inserter(history_ids));
    // Trim down the set by sorting by typed-count, visit-count, and last
    // visit. Only the best kItemsToScoreLimit need to be ordered, hence
    // partial_sort.
    HistoryItemFactorGreater
        item_factor_functor(history_info_map_);
    std::partial_sort(history_ids.begin(),
                      history_ids.begin() + kItemsToScoreLimit,
                      history_ids.end(),
                      item_factor_functor);
    history_id_set.clear();
    std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
              std::inserter(history_id_set, history_id_set.end()));
    post_filter_item_count_ = history_id_set.size();
  }

  // Pass over all of the candidates filtering out any without a proper
  // substring match, inserting those which pass in order by score. Note that
  // in this step we are using the raw search string complete with escaped
  // URL elements. When the user has specifically typed something akin to
  // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
  // specific substring appears in the URL or page title.

  // We call these 'terms' (as opposed to 'words'; see above) as in this case
  // we only want to break up the search string on 'true' whitespace rather than
  // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
  // get two 'terms': "colspec=id%20mstone" and "release".
  history::String16Vector lower_raw_terms;
  if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
               &lower_raw_terms) == 0) {
    // Don't score matches when there are no terms to score against. (It's
    // possible that the word break iterater that extracts words to search
    // for in the database allows some whitespace "words" whereas Tokenize
    // excludes a long list of whitespace.) One could write a scoring
    // function that gives a reasonable order to matches when there
    // are no terms (i.e., all the words are some form of whitespace),
    // but this is such a rare edge case that it's not worth the time.
    return scored_items;
  }
  // AddHistoryMatch is a stateful functor; for_each returns the final copy,
  // from which the accumulated matches are extracted.
  scored_items =
      std::for_each(
          history_id_set.begin(), history_id_set.end(),
          AddHistoryMatch(*this, languages, lower_raw_string, lower_raw_terms,
                          base::Time::Now(), builder)).ScoredMatches();

  // Select and sort only the top |max_matches| results.
  if (scored_items.size() > max_matches) {
    std::partial_sort(scored_items.begin(),
                      scored_items.begin() +
                          max_matches,
                      scored_items.end(),
                      ScoredHistoryMatch::MatchScoreGreater);
    scored_items.resize(max_matches);
  } else {
    std::sort(scored_items.begin(), scored_items.end(),
              ScoredHistoryMatch::MatchScoreGreater);
  }
  post_scoring_item_count_ = scored_items.size();

  if (was_trimmed) {
    search_term_cache_.clear();  // Invalidate the term cache.
  } else {
    // Remove any stale SearchTermCacheItems (sweep phase: entries not
    // marked used_ during this search are dropped).
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ) {
      if (!cache_iter->second.used_)
        search_term_cache_.erase(cache_iter++);
      else
        ++cache_iter;
    }
  }

  return scored_items;
}
279 | |
// Adds, refreshes, or removes |row| in the index depending on whether it is
// already indexed and whether it (still) qualifies as significant. Returns
// true if the index actually changed (which also invalidates the search
// term cache).
bool URLIndexPrivateData::UpdateURL(
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  // The row may or may not already be in our index. If it is not already
  // indexed and it qualifies then it gets indexed. If it is already
  // indexed and still qualifies then it gets updated, otherwise it
  // is deleted from the index.
  bool row_was_updated = false;
  URLID row_id = row.id();
  HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
  if (row_pos == history_info_map_.end()) {
    // This new row should be indexed if it qualifies.
    URLRow new_row(row);
    // NOTE(review): set_id looks redundant (the copy already carries
    // row.id()) -- presumably defensive; confirm before removing.
    new_row.set_id(row_id);
    row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
                      IndexRow(NULL,
                               history_service,
                               new_row,
                               languages,
                               scheme_whitelist,
                               tracker);
  } else if (RowQualifiesAsSignificant(row, base::Time())) {
    // This indexed row still qualifies and will be re-indexed.
    // The url won't have changed but the title, visit count, etc.
    // might have changed.
    URLRow& row_to_update = row_pos->second.url_row;
    bool title_updated = row_to_update.title() != row.title();
    if (row_to_update.visit_count() != row.visit_count() ||
        row_to_update.typed_count() != row.typed_count() ||
        row_to_update.last_visit() != row.last_visit() || title_updated) {
      row_to_update.set_visit_count(row.visit_count());
      row_to_update.set_typed_count(row.typed_count());
      row_to_update.set_last_visit(row.last_visit());
      // If something appears to have changed, update the recent visits
      // information.
      ScheduleUpdateRecentVisits(history_service, row_id, tracker);
      // While the URL is guaranteed to remain stable, the title may have
      // changed. If so, then update the index with the changed words.
      if (title_updated) {
        // Clear all words associated with this row and re-index both the
        // URL and title.
        RemoveRowWordsFromIndex(row_to_update);
        row_to_update.set_title(row.title());
        RowWordStarts word_starts;
        AddRowWordsToIndex(row_to_update, &word_starts, languages);
        word_starts_map_[row_id] = word_starts;
      }
      row_was_updated = true;
    }
  } else {
    // This indexed row no longer qualifies and will be de-indexed by
    // clearing all words associated with this row.
    RemoveRowFromIndex(row);
    row_was_updated = true;
  }
  if (row_was_updated)
    search_term_cache_.clear();  // This invalidates the cache.
  return row_was_updated;
}
342 | |
343 void URLIndexPrivateData::UpdateRecentVisits( | |
344 URLID url_id, | |
345 const VisitVector& recent_visits) { | |
346 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id); | |
347 if (row_pos != history_info_map_.end()) { | |
348 VisitInfoVector* visits = &row_pos->second.visits; | |
349 visits->clear(); | |
350 const size_t size = | |
351 std::min(recent_visits.size(), kMaxVisitsToStoreInCache); | |
352 visits->reserve(size); | |
353 for (size_t i = 0; i < size; i++) { | |
354 // Copy from the VisitVector the only fields visits needs. | |
355 visits->push_back(std::make_pair(recent_visits[i].visit_time, | |
356 recent_visits[i].transition)); | |
357 } | |
358 } | |
359 // Else: Oddly, the URL doesn't seem to exist in the private index. | |
360 // Ignore this update. This can happen if, for instance, the user | |
361 // removes the URL from URLIndexPrivateData before the historyDB call | |
362 // returns. | |
363 } | |
364 | |
// Posts an UpdateRecentVisitsFromHistoryDBTask to the history service;
// the task calls back into UpdateRecentVisits() with the fetched visits.
void URLIndexPrivateData::ScheduleUpdateRecentVisits(
    HistoryService* history_service,
    URLID url_id,
    base::CancelableTaskTracker* tracker) {
  history_service->ScheduleDBTask(
      scoped_ptr<history::HistoryDBTask>(
          new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
}
373 | |
// Helper functor for DeleteURL.
class HistoryInfoMapItemHasURL {
 public:
  // NOTE(review): |url_| is a reference member, so the GURL passed in must
  // outlive this functor. Safe for the immediate std::find_if use in
  // DeleteURL; do not store instances.
  explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}

  // Returns true if the map entry's row holds exactly the sought URL.
  bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
    return item.second.url_row.url() == url_;
  }

 private:
  const GURL& url_;
};
386 | |
387 bool URLIndexPrivateData::DeleteURL(const GURL& url) { | |
388 // Find the matching entry in the history_info_map_. | |
389 HistoryInfoMap::iterator pos = std::find_if( | |
390 history_info_map_.begin(), | |
391 history_info_map_.end(), | |
392 HistoryInfoMapItemHasURL(url)); | |
393 if (pos == history_info_map_.end()) | |
394 return false; | |
395 RemoveRowFromIndex(pos->second.url_row); | |
396 search_term_cache_.clear(); // This invalidates the cache. | |
397 return true; | |
398 } | |
399 | |
// static
// Reads and deserializes the protobuf cache at |file_path| into a new
// URLIndexPrivateData. Returns NULL when the file is absent/unreadable,
// when the deserialized data fails restoration, or when it is empty.
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
    const base::FilePath& file_path,
    const std::string& languages) {
  base::TimeTicks beginning_time = base::TimeTicks::Now();
  if (!base::PathExists(file_path))
    return NULL;
  std::string data;
  // If there is no cache file then simply give up. This will cause us to
  // attempt to rebuild from the history database.
  if (!base::ReadFileToString(file_path, &data))
    return NULL;

  scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
  InMemoryURLIndexCacheItem index_cache;
  if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
    LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
                 << file_path.value();
    // NOTE(review): a parse failure returns the fresh, EMPTY object rather
    // than NULL, unlike the other failure paths in this function --
    // presumably callers distinguish via Empty(); confirm before changing.
    return restored_data;
  }

  if (!restored_data->RestorePrivateData(index_cache, languages))
    return NULL;

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       restored_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             restored_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             restored_data->char_word_map_.size());
  if (restored_data->Empty())
    return NULL;  // 'No data' is the same as a failed reload.
  return restored_data;
}
437 | |
// static
// Builds a fresh URLIndexPrivateData by enumerating the significant URLs
// in |history_db| and indexing each row. Returns NULL when |history_db| is
// missing or the enumerator cannot be initialized.
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
    HistoryDatabase* history_db,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist) {
  if (!history_db)
    return NULL;

  base::TimeTicks beginning_time = base::TimeTicks::Now();

  scoped_refptr<URLIndexPrivateData>
      rebuilt_data(new URLIndexPrivateData);
  URLDatabase::URLEnumerator history_enum;
  if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
    return NULL;
  rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
  // Index every row the enumerator yields. The history DB is passed so
  // IndexRow can fetch recent visits synchronously.
  for (URLRow row; history_enum.GetNextURL(&row); ) {
    rebuilt_data->IndexRow(
        history_db, NULL, row, languages, scheme_whitelist, NULL);
  }

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       rebuilt_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             rebuilt_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             rebuilt_data->char_word_map_.size());
  return rebuilt_data;
}
469 | |
// static
// Task body: validates its arguments and delegates to SaveToFile().
// Takes the ref-counted private data by value so the task itself keeps
// the object alive for the duration of the write.
bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
    scoped_refptr<URLIndexPrivateData> private_data,
    const base::FilePath& file_path) {
  DCHECK(private_data.get());
  DCHECK(!file_path.empty());
  return private_data->SaveToFile(file_path);
}
478 | |
// Returns a deep copy of the index proper. The search-term cache and the
// diagnostic counters are deliberately left at their freshly-constructed
// values in the copy (see the list at the bottom of the function).
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
  scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
  data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
  data_copy->word_list_ = word_list_;
  data_copy->available_words_ = available_words_;
  data_copy->word_map_ = word_map_;
  data_copy->char_word_map_ = char_word_map_;
  data_copy->word_id_history_map_ = word_id_history_map_;
  data_copy->history_id_word_map_ = history_id_word_map_;
  data_copy->history_info_map_ = history_info_map_;
  data_copy->word_starts_map_ = word_starts_map_;
  return data_copy;
  // Not copied:
  //  search_term_cache_
  //  pre_filter_item_count_
  //  post_filter_item_count_
  //  post_scoring_item_count_
}
497 | |
498 bool URLIndexPrivateData::Empty() const { | |
499 return history_info_map_.empty(); | |
500 } | |
501 | |
// Drops all indexed content, returning the object to an empty state.
// Note: search_term_cache_ and the filter/scoring counters are not
// touched here.
void URLIndexPrivateData::Clear() {
  last_time_rebuilt_from_history_ = base::Time();
  word_list_.clear();
  available_words_.clear();
  word_map_.clear();
  char_word_map_.clear();
  word_id_history_map_.clear();
  history_id_word_map_.clear();
  history_info_map_.clear();
  word_starts_map_.clear();
}
513 | |
// All members are self-cleaning containers/values; nothing to release.
URLIndexPrivateData::~URLIndexPrivateData() {}
515 | |
516 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( | |
517 const String16Vector& unsorted_words) { | |
518 // Break the terms down into individual terms (words), get the candidate | |
519 // set for each term, and intersect each to get a final candidate list. | |
520 // Note that a single 'term' from the user's perspective might be | |
521 // a string like "http://www.somewebsite.com" which, from our perspective, | |
522 // is four words: 'http', 'www', 'somewebsite', and 'com'. | |
523 HistoryIDSet history_id_set; | |
524 String16Vector words(unsorted_words); | |
525 // Sort the words into the longest first as such are likely to narrow down | |
526 // the results quicker. Also, single character words are the most expensive | |
527 // to process so save them for last. | |
528 std::sort(words.begin(), words.end(), LengthGreater); | |
529 for (String16Vector::iterator iter = words.begin(); iter != words.end(); | |
530 ++iter) { | |
531 base::string16 uni_word = *iter; | |
532 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); | |
533 if (term_history_set.empty()) { | |
534 history_id_set.clear(); | |
535 break; | |
536 } | |
537 if (iter == words.begin()) { | |
538 history_id_set.swap(term_history_set); | |
539 } else { | |
540 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>( | |
541 history_id_set, term_history_set); | |
542 history_id_set.swap(new_history_id_set); | |
543 } | |
544 } | |
545 return history_id_set; | |
546 } | |
547 | |
// Returns the set of history IDs for URLs/titles containing |term|,
// consulting and maintaining the search-term cache for multi-character
// terms (a previously-searched prefix of |term| narrows the work to the
// leftover characters).
HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    const base::string16& term) {
  if (term.empty())
    return HistoryIDSet();

  // TODO(mrossetti): Consider optimizing for very common terms such as
  // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
  // occuring words in the user's searches.

  size_t term_length = term.length();
  WordIDSet word_id_set;
  if (term_length > 1) {
    // See if this term or a prefix thereof is present in the cache.
    // The longest cached prefix is preferred, as it narrows the most.
    // NOTE(review): the third StartsWith() argument is a case-sensitivity
    // flag; confirm 'false' means case-insensitive in this base API.
    SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ++cache_iter) {
      if (StartsWith(term, cache_iter->first, false) &&
          (best_prefix == search_term_cache_.end() ||
           cache_iter->first.length() > best_prefix->first.length()))
        best_prefix = cache_iter;
    }

    // If a prefix was found then determine the leftover characters to be used
    // for further refining the results from that prefix.
    Char16Set prefix_chars;
    base::string16 leftovers(term);
    if (best_prefix != search_term_cache_.end()) {
      // If the prefix is an exact match for the term then grab the cached
      // results and we're done.
      size_t prefix_length = best_prefix->first.length();
      if (prefix_length == term_length) {
        best_prefix->second.used_ = true;  // Mark for the sweep phase.
        return best_prefix->second.history_id_set_;
      }

      // Otherwise we have a handy starting point.
      // If there are no history results for this prefix then we can bail early
      // as there will be no history results for the full term.
      if (best_prefix->second.history_id_set_.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      word_id_set = best_prefix->second.word_id_set_;
      prefix_chars = Char16SetFromString16(best_prefix->first);
      leftovers = term.substr(prefix_length);
    }

    // Filter for each remaining, unique character in the term.
    Char16Set leftover_chars = Char16SetFromString16(leftovers);
    Char16Set unique_chars =
        base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);

    // Reduce the word set with any leftover, unprocessed characters.
    if (!unique_chars.empty()) {
      WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
      // We might come up empty on the leftovers.
      if (leftover_set.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      // Or there may not have been a prefix from which to start.
      if (prefix_chars.empty()) {
        word_id_set.swap(leftover_set);
      } else {
        WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
            word_id_set, leftover_set);
        word_id_set.swap(new_word_id_set);
      }
    }

    // We must filter the word list because the resulting word set surely
    // contains words which do not have the search term as a proper subset.
    // (Erase-while-iterating: the post-increment keeps the iterator valid
    // across std::set::erase.)
    for (WordIDSet::iterator word_set_iter = word_id_set.begin();
         word_set_iter != word_id_set.end(); ) {
      if (word_list_[*word_set_iter].find(term) == base::string16::npos)
        word_id_set.erase(word_set_iter++);
      else
        ++word_set_iter;
    }
  } else {
    // Single-character terms go straight to the character index.
    word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
  }

  // If any words resulted then we can compose a set of history IDs by unioning
  // the sets from each word.
  HistoryIDSet history_id_set;
  if (!word_id_set.empty()) {
    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
         word_id_iter != word_id_set.end(); ++word_id_iter) {
      WordID word_id = *word_id_iter;
      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
      if (word_iter != word_id_history_map_.end()) {
        HistoryIDSet& word_history_id_set(word_iter->second);
        history_id_set.insert(word_history_id_set.begin(),
                              word_history_id_set.end());
      }
    }
  }

  // Record a new cache entry for this word if the term is longer than
  // a single character.
  if (term_length > 1)
    search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

  return history_id_set;
}
654 | |
655 WordIDSet URLIndexPrivateData::WordIDSetForTermChars( | |
656 const Char16Set& term_chars) { | |
657 WordIDSet word_id_set; | |
658 for (Char16Set::const_iterator c_iter = term_chars.begin(); | |
659 c_iter != term_chars.end(); ++c_iter) { | |
660 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); | |
661 if (char_iter == char_word_map_.end()) { | |
662 // A character was not found so there are no matching results: bail. | |
663 word_id_set.clear(); | |
664 break; | |
665 } | |
666 WordIDSet& char_word_id_set(char_iter->second); | |
667 // It is possible for there to no longer be any words associated with | |
668 // a particular character. Give up in that case. | |
669 if (char_word_id_set.empty()) { | |
670 word_id_set.clear(); | |
671 break; | |
672 } | |
673 | |
674 if (c_iter == term_chars.begin()) { | |
675 // First character results becomes base set of results. | |
676 word_id_set = char_word_id_set; | |
677 } else { | |
678 // Subsequent character results get intersected in. | |
679 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( | |
680 word_id_set, char_word_id_set); | |
681 word_id_set.swap(new_word_id_set); | |
682 } | |
683 } | |
684 return word_id_set; | |
685 } | |
686 | |
// Indexes one history row: stores a scrubbed copy in history_info_map_,
// indexes the words of its URL and title, and fetches (or schedules the
// fetch of) its recent visits. Returns false when the URL's scheme is not
// whitelisted. Exactly one of |history_db| (synchronous fetch) or
// |history_service|+|tracker| (scheduled fetch) must be provided.
bool URLIndexPrivateData::IndexRow(
    HistoryDatabase* history_db,
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  const GURL& gurl(row.url());

  // Index only URLs with a whitelisted scheme.
  if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
    return false;

  URLID row_id = row.id();
  // Strip out username and password before saving and indexing.
  base::string16 url(net::FormatUrl(gurl, languages,
      net::kFormatUrlOmitUsernamePassword,
      net::UnescapeRule::NONE,
      NULL, NULL, NULL));

  HistoryID history_id = static_cast<HistoryID>(row_id);
  DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

  // Add the row for quick lookup in the history info store, copying over
  // the fields the index cares about.
  URLRow new_row(GURL(url), row_id);
  new_row.set_visit_count(row.visit_count());
  new_row.set_typed_count(row.typed_count());
  new_row.set_last_visit(row.last_visit());
  new_row.set_title(row.title());
  history_info_map_[history_id].url_row = new_row;

  // Index the words contained in the URL and title of the row.
  RowWordStarts word_starts;
  AddRowWordsToIndex(new_row, &word_starts, languages);
  word_starts_map_[history_id] = word_starts;

  // Update the recent visits information or schedule the update
  // as appropriate.
  if (history_db) {
    // We'd like to check that we're on the history DB thread.
    // However, unittest code actually calls this on the UI thread.
    // So we don't do any thread checks.
    VisitVector recent_visits;
    // Make sure the private data is going to get as many recent visits as
    // ScoredHistoryMatch::GetFrequency() hopes to use.
    DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    if (history_db->GetMostRecentVisitsForURL(row_id,
                                              kMaxVisitsToStoreInCache,
                                              &recent_visits))
      UpdateRecentVisits(row_id, recent_visits);
  } else {
    DCHECK(tracker);
    DCHECK(history_service);
    ScheduleUpdateRecentVisits(history_service, row_id, tracker);
  }

  return true;
}
745 | |
746 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row, | |
747 RowWordStarts* word_starts, | |
748 const std::string& languages) { | |
749 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
750 // Split URL into individual, unique words then add in the title words. | |
751 const GURL& gurl(row.url()); | |
752 const base::string16& url = | |
753 bookmarks::CleanUpUrlForMatching(gurl, languages, NULL); | |
754 String16Set url_words = String16SetFromString16(url, | |
755 word_starts ? &word_starts->url_word_starts_ : NULL); | |
756 const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title()); | |
757 String16Set title_words = String16SetFromString16(title, | |
758 word_starts ? &word_starts->title_word_starts_ : NULL); | |
759 String16Set words = base::STLSetUnion<String16Set>(url_words, title_words); | |
760 for (String16Set::iterator word_iter = words.begin(); | |
761 word_iter != words.end(); ++word_iter) | |
762 AddWordToIndex(*word_iter, history_id); | |
763 | |
764 search_term_cache_.clear(); // Invalidate the term cache. | |
765 } | |
766 | |
767 void URLIndexPrivateData::AddWordToIndex(const base::string16& term, | |
768 HistoryID history_id) { | |
769 WordMap::iterator word_pos = word_map_.find(term); | |
770 if (word_pos != word_map_.end()) | |
771 UpdateWordHistory(word_pos->second, history_id); | |
772 else | |
773 AddWordHistory(term, history_id); | |
774 } | |
775 | |
776 void URLIndexPrivateData::AddWordHistory(const base::string16& term, | |
777 HistoryID history_id) { | |
778 WordID word_id = word_list_.size(); | |
779 if (available_words_.empty()) { | |
780 word_list_.push_back(term); | |
781 } else { | |
782 word_id = *(available_words_.begin()); | |
783 word_list_[word_id] = term; | |
784 available_words_.erase(word_id); | |
785 } | |
786 word_map_[term] = word_id; | |
787 | |
788 HistoryIDSet history_id_set; | |
789 history_id_set.insert(history_id); | |
790 word_id_history_map_[word_id] = history_id_set; | |
791 AddToHistoryIDWordMap(history_id, word_id); | |
792 | |
793 // For each character in the newly added word (i.e. a word that is not | |
794 // already in the word index), add the word to the character index. | |
795 Char16Set characters = Char16SetFromString16(term); | |
796 for (Char16Set::iterator uni_char_iter = characters.begin(); | |
797 uni_char_iter != characters.end(); ++uni_char_iter) { | |
798 base::char16 uni_char = *uni_char_iter; | |
799 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); | |
800 if (char_iter != char_word_map_.end()) { | |
801 // Update existing entry in the char/word index. | |
802 WordIDSet& word_id_set(char_iter->second); | |
803 word_id_set.insert(word_id); | |
804 } else { | |
805 // Create a new entry in the char/word index. | |
806 WordIDSet word_id_set; | |
807 word_id_set.insert(word_id); | |
808 char_word_map_[uni_char] = word_id_set; | |
809 } | |
810 } | |
811 } | |
812 | |
813 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, | |
814 HistoryID history_id) { | |
815 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); | |
816 DCHECK(history_pos != word_id_history_map_.end()); | |
817 HistoryIDSet& history_id_set(history_pos->second); | |
818 history_id_set.insert(history_id); | |
819 AddToHistoryIDWordMap(history_id, word_id); | |
820 } | |
821 | |
822 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, | |
823 WordID word_id) { | |
824 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); | |
825 if (iter != history_id_word_map_.end()) { | |
826 WordIDSet& word_id_set(iter->second); | |
827 word_id_set.insert(word_id); | |
828 } else { | |
829 WordIDSet word_id_set; | |
830 word_id_set.insert(word_id); | |
831 history_id_word_map_[history_id] = word_id_set; | |
832 } | |
833 } | |
834 | |
835 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { | |
836 RemoveRowWordsFromIndex(row); | |
837 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
838 history_info_map_.erase(history_id); | |
839 word_starts_map_.erase(history_id); | |
840 } | |
841 | |
842 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { | |
843 // Remove the entries in history_id_word_map_ and word_id_history_map_ for | |
844 // this row. | |
845 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
846 WordIDSet word_id_set = history_id_word_map_[history_id]; | |
847 history_id_word_map_.erase(history_id); | |
848 | |
849 // Reconcile any changes to word usage. | |
850 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); | |
851 word_id_iter != word_id_set.end(); ++word_id_iter) { | |
852 WordID word_id = *word_id_iter; | |
853 word_id_history_map_[word_id].erase(history_id); | |
854 if (!word_id_history_map_[word_id].empty()) | |
855 continue; // The word is still in use. | |
856 | |
857 // The word is no longer in use. Reconcile any changes to character usage. | |
858 base::string16 word = word_list_[word_id]; | |
859 Char16Set characters = Char16SetFromString16(word); | |
860 for (Char16Set::iterator uni_char_iter = characters.begin(); | |
861 uni_char_iter != characters.end(); ++uni_char_iter) { | |
862 base::char16 uni_char = *uni_char_iter; | |
863 char_word_map_[uni_char].erase(word_id); | |
864 if (char_word_map_[uni_char].empty()) | |
865 char_word_map_.erase(uni_char); // No longer in use. | |
866 } | |
867 | |
868 // Complete the removal of references to the word. | |
869 word_id_history_map_.erase(word_id); | |
870 word_map_.erase(word); | |
871 word_list_[word_id] = base::string16(); | |
872 available_words_.insert(word_id); | |
873 } | |
874 } | |
875 | |
876 void URLIndexPrivateData::ResetSearchTermCache() { | |
877 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); | |
878 iter != search_term_cache_.end(); ++iter) | |
879 iter->second.used_ = false; | |
880 } | |
881 | |
882 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) { | |
883 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
884 InMemoryURLIndexCacheItem index_cache; | |
885 SavePrivateData(&index_cache); | |
886 std::string data; | |
887 if (!index_cache.SerializeToString(&data)) { | |
888 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; | |
889 return false; | |
890 } | |
891 | |
892 int size = data.size(); | |
893 if (base::WriteFile(file_path, data.c_str(), size) != size) { | |
894 LOG(WARNING) << "Failed to write " << file_path.value(); | |
895 return false; | |
896 } | |
897 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", | |
898 base::TimeTicks::Now() - beginning_time); | |
899 return true; | |
900 } | |
901 | |
// Serializes the entire private data set into |cache| in preparation for
// writing it to disk; each Save*() helper below fills in one section of the
// protobuf.
void URLIndexPrivateData::SavePrivateData(
    InMemoryURLIndexCacheItem* cache) const {
  DCHECK(cache);
  cache->set_last_rebuild_timestamp(
      last_time_rebuilt_from_history_.ToInternalValue());
  cache->set_version(saved_cache_version_);
  // history_item_count_ is no longer used but rather than change the protobuf
  // definition use a placeholder. This will go away with the switch to SQLite.
  cache->set_history_item_count(0);
  SaveWordList(cache);
  SaveWordMap(cache);
  SaveCharWordMap(cache);
  SaveWordIDHistoryMap(cache);
  SaveHistoryInfoMap(cache);
  SaveWordStartsMap(cache);
}
918 | |
919 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { | |
920 if (word_list_.empty()) | |
921 return; | |
922 WordListItem* list_item = cache->mutable_word_list(); | |
923 list_item->set_word_count(word_list_.size()); | |
924 for (String16Vector::const_iterator iter = word_list_.begin(); | |
925 iter != word_list_.end(); ++iter) | |
926 list_item->add_word(base::UTF16ToUTF8(*iter)); | |
927 } | |
928 | |
929 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { | |
930 if (word_map_.empty()) | |
931 return; | |
932 WordMapItem* map_item = cache->mutable_word_map(); | |
933 map_item->set_item_count(word_map_.size()); | |
934 for (WordMap::const_iterator iter = word_map_.begin(); | |
935 iter != word_map_.end(); ++iter) { | |
936 WordMapEntry* map_entry = map_item->add_word_map_entry(); | |
937 map_entry->set_word(base::UTF16ToUTF8(iter->first)); | |
938 map_entry->set_word_id(iter->second); | |
939 } | |
940 } | |
941 | |
942 void URLIndexPrivateData::SaveCharWordMap( | |
943 InMemoryURLIndexCacheItem* cache) const { | |
944 if (char_word_map_.empty()) | |
945 return; | |
946 CharWordMapItem* map_item = cache->mutable_char_word_map(); | |
947 map_item->set_item_count(char_word_map_.size()); | |
948 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); | |
949 iter != char_word_map_.end(); ++iter) { | |
950 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); | |
951 map_entry->set_char_16(iter->first); | |
952 const WordIDSet& word_id_set(iter->second); | |
953 map_entry->set_item_count(word_id_set.size()); | |
954 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); | |
955 set_iter != word_id_set.end(); ++set_iter) | |
956 map_entry->add_word_id(*set_iter); | |
957 } | |
958 } | |
959 | |
960 void URLIndexPrivateData::SaveWordIDHistoryMap( | |
961 InMemoryURLIndexCacheItem* cache) const { | |
962 if (word_id_history_map_.empty()) | |
963 return; | |
964 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); | |
965 map_item->set_item_count(word_id_history_map_.size()); | |
966 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); | |
967 iter != word_id_history_map_.end(); ++iter) { | |
968 WordIDHistoryMapEntry* map_entry = | |
969 map_item->add_word_id_history_map_entry(); | |
970 map_entry->set_word_id(iter->first); | |
971 const HistoryIDSet& history_id_set(iter->second); | |
972 map_entry->set_item_count(history_id_set.size()); | |
973 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); | |
974 set_iter != history_id_set.end(); ++set_iter) | |
975 map_entry->add_history_id(*set_iter); | |
976 } | |
977 } | |
978 | |
979 void URLIndexPrivateData::SaveHistoryInfoMap( | |
980 InMemoryURLIndexCacheItem* cache) const { | |
981 if (history_info_map_.empty()) | |
982 return; | |
983 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); | |
984 map_item->set_item_count(history_info_map_.size()); | |
985 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); | |
986 iter != history_info_map_.end(); ++iter) { | |
987 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); | |
988 map_entry->set_history_id(iter->first); | |
989 const URLRow& url_row(iter->second.url_row); | |
990 // Note: We only save information that contributes to the index so there | |
991 // is no need to save search_term_cache_ (not persistent). | |
992 map_entry->set_visit_count(url_row.visit_count()); | |
993 map_entry->set_typed_count(url_row.typed_count()); | |
994 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); | |
995 map_entry->set_url(url_row.url().spec()); | |
996 map_entry->set_title(base::UTF16ToUTF8(url_row.title())); | |
997 const VisitInfoVector& visits(iter->second.visits); | |
998 for (VisitInfoVector::const_iterator visit_iter = visits.begin(); | |
999 visit_iter != visits.end(); ++visit_iter) { | |
1000 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits(); | |
1001 visit_info->set_visit_time(visit_iter->first.ToInternalValue()); | |
1002 visit_info->set_transition_type(visit_iter->second); | |
1003 } | |
1004 } | |
1005 } | |
1006 | |
// Serializes word_starts_map_ (HistoryID -> word-start offsets for the URL
// and title) into |cache|. Writes nothing when the map is empty or when an
// older cache version is being deliberately produced for testing.
void URLIndexPrivateData::SaveWordStartsMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (word_starts_map_.empty())
    return;
  // For unit testing: Enable saving of the cache as an earlier version to
  // allow testing of cache file upgrading in ReadFromFile().
  // TODO(mrossetti): Instead of intruding on production code with this kind of
  // test harness, save a copy of an older version cache with known results.
  // Implement this when switching the caching over to SQLite.
  if (saved_cache_version_ < 1)
    return;

  WordStartsMapItem* map_item = cache->mutable_word_starts_map();
  map_item->set_item_count(word_starts_map_.size());
  for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
       iter != word_starts_map_.end(); ++iter) {
    WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
    map_entry->set_history_id(iter->first);
    const RowWordStarts& word_starts(iter->second);
    for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
         i != word_starts.url_word_starts_.end(); ++i)
      map_entry->add_url_word_starts(*i);
    for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
         i != word_starts.title_word_starts_.end(); ++i)
      map_entry->add_title_word_starts(*i);
  }
}
1034 | |
// Restores all private data from |cache|. Returns false -- signalling the
// caller to rebuild the index from the history database -- when the cache is
// stale, was written by an older file format, or fails any section's
// internal consistency checks.
bool URLIndexPrivateData::RestorePrivateData(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  last_time_rebuilt_from_history_ =
      base::Time::FromInternalValue(cache.last_rebuild_timestamp());
  const base::TimeDelta rebuilt_ago =
      base::Time::Now() - last_time_rebuilt_from_history_;
  if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
      (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
    // Cache is more than a week old or, somehow, from some time in the future.
    // It's probably a good time to rebuild the index from history to
    // allow synced entries to now appear, expired entries to disappear, etc.
    // Allow one day in the future to make the cache not rebuild on simple
    // system clock changes such as time zone changes.
    return false;
  }
  if (cache.has_version()) {
    if (cache.version() < kCurrentCacheFileVersion) {
      // Don't try to restore an old format cache file. (This will cause
      // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
      // from history.)
      return false;
    }
    restored_cache_version_ = cache.version();
  }
  // All sections must restore successfully; short-circuits on the first
  // failure. Note that RestoreWordStartsMap() must run last -- it may need
  // the data restored by RestoreHistoryInfoMap().
  return RestoreWordList(cache) && RestoreWordMap(cache) &&
      RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
      RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
}
1064 | |
1065 bool URLIndexPrivateData::RestoreWordList( | |
1066 const InMemoryURLIndexCacheItem& cache) { | |
1067 if (!cache.has_word_list()) | |
1068 return false; | |
1069 const WordListItem& list_item(cache.word_list()); | |
1070 uint32 expected_item_count = list_item.word_count(); | |
1071 uint32 actual_item_count = list_item.word_size(); | |
1072 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1073 return false; | |
1074 const RepeatedPtrField<std::string>& words(list_item.word()); | |
1075 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); | |
1076 iter != words.end(); ++iter) | |
1077 word_list_.push_back(base::UTF8ToUTF16(*iter)); | |
1078 return true; | |
1079 } | |
1080 | |
1081 bool URLIndexPrivateData::RestoreWordMap( | |
1082 const InMemoryURLIndexCacheItem& cache) { | |
1083 if (!cache.has_word_map()) | |
1084 return false; | |
1085 const WordMapItem& list_item(cache.word_map()); | |
1086 uint32 expected_item_count = list_item.item_count(); | |
1087 uint32 actual_item_count = list_item.word_map_entry_size(); | |
1088 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1089 return false; | |
1090 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); | |
1091 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); | |
1092 iter != entries.end(); ++iter) | |
1093 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id(); | |
1094 return true; | |
1095 } | |
1096 | |
1097 bool URLIndexPrivateData::RestoreCharWordMap( | |
1098 const InMemoryURLIndexCacheItem& cache) { | |
1099 if (!cache.has_char_word_map()) | |
1100 return false; | |
1101 const CharWordMapItem& list_item(cache.char_word_map()); | |
1102 uint32 expected_item_count = list_item.item_count(); | |
1103 uint32 actual_item_count = list_item.char_word_map_entry_size(); | |
1104 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1105 return false; | |
1106 const RepeatedPtrField<CharWordMapEntry>& | |
1107 entries(list_item.char_word_map_entry()); | |
1108 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter = | |
1109 entries.begin(); iter != entries.end(); ++iter) { | |
1110 expected_item_count = iter->item_count(); | |
1111 actual_item_count = iter->word_id_size(); | |
1112 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1113 return false; | |
1114 base::char16 uni_char = static_cast<base::char16>(iter->char_16()); | |
1115 WordIDSet word_id_set; | |
1116 const RepeatedField<int32>& word_ids(iter->word_id()); | |
1117 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); | |
1118 jiter != word_ids.end(); ++jiter) | |
1119 word_id_set.insert(*jiter); | |
1120 char_word_map_[uni_char] = word_id_set; | |
1121 } | |
1122 return true; | |
1123 } | |
1124 | |
1125 bool URLIndexPrivateData::RestoreWordIDHistoryMap( | |
1126 const InMemoryURLIndexCacheItem& cache) { | |
1127 if (!cache.has_word_id_history_map()) | |
1128 return false; | |
1129 const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); | |
1130 uint32 expected_item_count = list_item.item_count(); | |
1131 uint32 actual_item_count = list_item.word_id_history_map_entry_size(); | |
1132 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1133 return false; | |
1134 const RepeatedPtrField<WordIDHistoryMapEntry>& | |
1135 entries(list_item.word_id_history_map_entry()); | |
1136 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = | |
1137 entries.begin(); iter != entries.end(); ++iter) { | |
1138 expected_item_count = iter->item_count(); | |
1139 actual_item_count = iter->history_id_size(); | |
1140 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1141 return false; | |
1142 WordID word_id = iter->word_id(); | |
1143 HistoryIDSet history_id_set; | |
1144 const RepeatedField<int64>& history_ids(iter->history_id()); | |
1145 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin(); | |
1146 jiter != history_ids.end(); ++jiter) { | |
1147 history_id_set.insert(*jiter); | |
1148 AddToHistoryIDWordMap(*jiter, word_id); | |
1149 } | |
1150 word_id_history_map_[word_id] = history_id_set; | |
1151 } | |
1152 return true; | |
1153 } | |
1154 | |
// Restores history_info_map_ from |cache|, including each row's URL data and
// its saved list of recent visits. Returns false if the section is missing
// or its recorded count disagrees with the actual number of entries.
bool URLIndexPrivateData::RestoreHistoryInfoMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_history_info_map())
    return false;
  const HistoryInfoMapItem& list_item(cache.history_info_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.history_info_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<HistoryInfoMapEntry>&
      entries(list_item.history_info_map_entry());
  for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    HistoryID history_id = iter->history_id();
    GURL url(iter->url());
    // Reconstruct the URLRow from the individually persisted fields.
    URLRow url_row(url, history_id);
    url_row.set_visit_count(iter->visit_count());
    url_row.set_typed_count(iter->typed_count());
    url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
    if (iter->has_title()) {
      base::string16 title(base::UTF8ToUTF16(iter->title()));
      url_row.set_title(title);
    }
    history_info_map_[history_id].url_row = url_row;

    // Restore visits list.
    VisitInfoVector visits;
    visits.reserve(iter->visits_size());
    for (int i = 0; i < iter->visits_size(); ++i) {
      visits.push_back(std::make_pair(
          base::Time::FromInternalValue(iter->visits(i).visit_time()),
          ui::PageTransitionFromInt(iter->visits(i).transition_type())));
    }
    history_info_map_[history_id].visits = visits;
  }
  return true;
}
1192 | |
// Restores word_starts_map_ from |cache| if present; otherwise recomputes the
// word starts from the already-restored URLs and page titles (which is why
// this must run after RestoreHistoryInfoMap()).
bool URLIndexPrivateData::RestoreWordStartsMap(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  // Note that this function must be called after RestoreHistoryInfoMap() has
  // been run as the word starts may have to be recalculated from the urls and
  // page titles.
  if (cache.has_word_starts_map()) {
    const WordStartsMapItem& list_item(cache.word_starts_map());
    uint32 expected_item_count = list_item.item_count();
    uint32 actual_item_count = list_item.word_starts_map_entry_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    const RepeatedPtrField<WordStartsMapEntry>&
        entries(list_item.word_starts_map_entry());
    for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
         entries.begin(); iter != entries.end(); ++iter) {
      HistoryID history_id = iter->history_id();
      RowWordStarts word_starts;
      // Restore the URL word starts.
      const RepeatedField<int32>& url_starts(iter->url_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
           jiter != url_starts.end(); ++jiter)
        word_starts.url_word_starts_.push_back(*jiter);
      // Restore the page title word starts.
      const RepeatedField<int32>& title_starts(iter->title_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
           jiter != title_starts.end(); ++jiter)
        word_starts.title_word_starts_.push_back(*jiter);
      word_starts_map_[history_id] = word_starts;
    }
  } else {
    // Since the cache did not contain any word starts we must rebuild them
    // from the URL and page titles.
    for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
         iter != history_info_map_.end(); ++iter) {
      RowWordStarts word_starts;
      const URLRow& row(iter->second.url_row);
      const base::string16& url =
          bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
      String16VectorFromString16(url, false, &word_starts.url_word_starts_);
      const base::string16& title =
          bookmarks::CleanUpTitleForMatching(row.title());
      String16VectorFromString16(title, false, &word_starts.title_word_starts_);
      word_starts_map_[iter->first] = word_starts;
    }
  }
  return true;
}
1241 | |
1242 // static | |
1243 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | |
1244 const GURL& gurl, | |
1245 const std::set<std::string>& whitelist) { | |
1246 return whitelist.find(gurl.scheme()) != whitelist.end(); | |
1247 } | |
1248 | |
1249 | |
1250 // SearchTermCacheItem --------------------------------------------------------- | |
1251 | |
// Caches the word-id and history-id result sets computed for a search term
// so that later, refined searches can start from them. |used_| starts true;
// ResetSearchTermCache() clears it so stale entries can be identified.
URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
    const WordIDSet& word_id_set,
    const HistoryIDSet& history_id_set)
    : word_id_set_(word_id_set),
      history_id_set_(history_id_set),
      used_(true) {}

// Creates an empty cache item (no matching words or history entries).
URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
    : used_(true) {}

URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
1263 | |
1264 | |
1265 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- | |
1266 | |
// Functor that scores candidate history items against the user's search
// terms. The constructor precomputes, for each lower-cased term, the offset
// of its first word character so matches can later be aligned against the
// stored word starts.
URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
    const URLIndexPrivateData& private_data,
    const std::string& languages,
    const base::string16& lower_string,
    const String16Vector& lower_terms,
    const base::Time now,
    const ScoredHistoryMatch::Builder& builder)
  : private_data_(private_data),
    languages_(languages),
    builder_(builder),
    lower_string_(lower_string),
    lower_terms_(lower_terms),
    now_(now) {
  // Calculate offsets for each term.  For instance, the offset for
  // ".net" should be 1, indicating that the actual word-part of the term
  // starts at offset 1.
  lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
  for (size_t i = 0; i < lower_terms_.size(); ++i) {
    base::i18n::BreakIterator iter(lower_terms_[i],
                                   base::i18n::BreakIterator::BREAK_WORD);
    // If the iterator doesn't work, assume an offset of 0.
    if (!iter.Init())
      continue;
    // Find the first word start by advancing until a word break is reached.
    while (iter.Advance() && !iter.IsWord()) {}
    if (iter.IsWord())
      lower_terms_to_word_starts_offsets_[i] = iter.prev();
    // Else: the iterator didn't find a word break.  Assume an offset of 0.
  }
}
1297 | |
URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}

// Scores the history item identified by |history_id| against the search
// terms; items producing a positive raw score are collected into
// |scored_matches_|. Items missing from history_info_map_ are skipped.
void URLIndexPrivateData::AddHistoryMatch::operator()(
    const HistoryID history_id) {
  HistoryInfoMap::const_iterator hist_pos =
      private_data_.history_info_map_.find(history_id);
  if (hist_pos != private_data_.history_info_map_.end()) {
    const URLRow& hist_item = hist_pos->second.url_row;
    const VisitInfoVector& visits = hist_pos->second.visits;
    WordStartsMap::const_iterator starts_pos =
        private_data_.word_starts_map_.find(history_id);
    // Every indexed row is expected to have word starts recorded alongside.
    DCHECK(starts_pos != private_data_.word_starts_map_.end());
    ScoredHistoryMatch match = builder_.Build(
        hist_item, visits, languages_, lower_string_, lower_terms_,
        lower_terms_to_word_starts_offsets_, starts_pos->second, now_);
    if (match.raw_score > 0)
      scored_matches_.push_back(match);
  }
}
1317 | |
1318 | |
1319 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- | |
1320 | |
// Comparator functor used to order history items by decreasing importance.
URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
    const HistoryInfoMap& history_info_map)
    : history_info_map_(history_info_map) {
}

URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}

// Returns true if |h1| should sort before |h2|. Items missing from
// history_info_map_ always sort last; otherwise items are ranked by typed
// count, then visit count, then recency of last visit.
bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
    const HistoryID h1,
    const HistoryID h2) {
  HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
  if (entry1 == history_info_map_.end())
    return false;
  HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
  if (entry2 == history_info_map_.end())
    return true;
  const URLRow& r1(entry1->second.url_row);
  const URLRow& r2(entry2->second.url_row);
  // First cut: typed count, visit count, recency.
  // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
  // recently visited (within the last 12/24 hours) as highly important. Get
  // input from mpearson.
  if (r1.typed_count() != r2.typed_count())
    return (r1.typed_count() > r2.typed_count());
  if (r1.visit_count() != r2.visit_count())
    return (r1.visit_count() > r2.visit_count());
  return (r1.last_visit() > r2.last_visit());
}
1349 | |
1350 } // namespace history | |
OLD | NEW |