OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/history/text_database_manager.h" | |
6 | |
7 #include <algorithm> | |
8 #include <functional> | |
9 | |
10 #include "base/bind.h" | |
11 #include "base/compiler_specific.h" | |
12 #include "base/file_util.h" | |
13 #include "base/files/file_enumerator.h" | |
14 #include "base/logging.h" | |
15 #include "base/message_loop.h" | |
16 #include "base/metrics/histogram.h" | |
17 #include "base/strings/string_util.h" | |
18 #include "base/strings/utf_string_conversions.h" | |
19 #include "chrome/browser/history/history_publisher.h" | |
20 #include "chrome/browser/history/visit_database.h" | |
21 | |
22 using base::Time; | |
23 using base::TimeDelta; | |
24 using base::TimeTicks; | |
25 | |
26 namespace history { | |
27 | |
28 namespace { | |
29 | |
30 // The number of database files we will be attached to at once. | |
31 const int kCacheDBSize = 5; | |
32 | |
33 std::string ConvertStringForIndexer(const string16& input) { | |
34 // TODO(evanm): other transformations here? | |
35 return UTF16ToUTF8(CollapseWhitespace(input, false)); | |
36 } | |
37 | |
38 // Data older than this will be committed to the full text index even if we | |
39 // haven't gotten a title and/or body. | |
40 const int kExpirationSeconds = 20; | |
41 | |
42 } // namespace | |
43 | |
44 // TextDatabaseManager::ChangeSet ---------------------------------------------- | |
45 | |
// ChangeSet members are out-of-line so the (implicit) set operations are not
// inlined into every caller.
TextDatabaseManager::ChangeSet::ChangeSet() {}

TextDatabaseManager::ChangeSet::~ChangeSet() {}
49 | |
50 // TextDatabaseManager::PageInfo ----------------------------------------------- | |
51 | |
// A PageInfo tracks one not-yet-committed page. |added_time_| records when it
// entered the cache so Expired() can decide when to flush it anyway.
TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
                                        VisitID visit_id,
                                        Time visit_time)
    : url_id_(url_id),
      visit_id_(visit_id),
      visit_time_(visit_time) {
  // Wall-clock-independent timestamp used only for expiration bookkeeping.
  added_time_ = TimeTicks::Now();
}

TextDatabaseManager::PageInfo::~PageInfo() {}
62 | |
63 void TextDatabaseManager::PageInfo::set_title(const string16& ttl) { | |
64 if (ttl.empty()) // Make the title nonempty when we set it for EverybodySet. | |
65 title_ = ASCIIToUTF16(" "); | |
66 else | |
67 title_ = ttl; | |
68 } | |
69 | |
70 void TextDatabaseManager::PageInfo::set_body(const string16& bdy) { | |
71 if (bdy.empty()) // Make the body nonempty when we set it for EverybodySet. | |
72 body_ = ASCIIToUTF16(" "); | |
73 else | |
74 body_ = bdy; | |
75 } | |
76 | |
77 bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const { | |
78 return now - added_time_ > base::TimeDelta::FromSeconds(kExpirationSeconds); | |
79 } | |
80 | |
81 // TextDatabaseManager --------------------------------------------------------- | |
82 | |
// |url_database| and |visit_database| are raw pointers supplied by the
// caller; this class does not delete them (presumably they outlive us —
// owned by the history backend that also owns this manager; verify there).
// Both MRU caches use NO_AUTO_EVICT: eviction happens only via explicit
// ShrinkToSize()/Erase() calls elsewhere in this file.
TextDatabaseManager::TextDatabaseManager(const base::FilePath& dir,
                                         URLDatabase* url_database,
                                         VisitDatabase* visit_database)
    : dir_(dir),
      url_database_(url_database),
      visit_database_(visit_database),
      recent_changes_(RecentChangeList::NO_AUTO_EVICT),
      transaction_nesting_(0),
      db_cache_(DBCache::NO_AUTO_EVICT),
      present_databases_loaded_(false),
      weak_factory_(this),
      history_publisher_(NULL) {
}
96 | |
97 TextDatabaseManager::~TextDatabaseManager() { | |
98 if (transaction_nesting_) | |
99 CommitTransaction(); | |
100 } | |
101 | |
// static
// Maps a visit time to the identifier of the per-month database file that
// stores it.
TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
  Time::Exploded exploded;
  time.UTCExplode(&exploded);

  // We combine the month and year into a 6-digit number (200801 for
  // January, 2008). The month is 1-based. UTC keeps the time->file mapping
  // stable regardless of the local timezone.
  return exploded.year * 100 + exploded.month;
}
111 | |
112 // static | |
113 Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) { | |
114 Time::Exploded exploded; | |
115 memset(&exploded, 0, sizeof(Time::Exploded)); | |
116 exploded.year = id / 100; | |
117 exploded.month = id % 100; | |
118 return Time::FromUTCExploded(exploded); | |
119 } | |
120 | |
// Stores the (possibly NULL) publisher and kicks off the periodic flush of
// uncommitted page data. Always returns true.
bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
  history_publisher_ = history_publisher;

  // Start checking recent changes and committing them.
  ScheduleFlushOldChanges();
  return true;
}
128 | |
129 void TextDatabaseManager::BeginTransaction() { | |
130 transaction_nesting_++; | |
131 } | |
132 | |
133 void TextDatabaseManager::CommitTransaction() { | |
134 DCHECK(transaction_nesting_); | |
135 transaction_nesting_--; | |
136 if (transaction_nesting_) | |
137 return; // Still more nesting of transactions before committing. | |
138 | |
139 // Commit all databases with open transactions on them. | |
140 for (DBIdentSet::const_iterator i = open_transactions_.begin(); | |
141 i != open_transactions_.end(); ++i) { | |
142 DBCache::iterator iter = db_cache_.Get(*i); | |
143 if (iter == db_cache_.end()) { | |
144 NOTREACHED() << "All open transactions should be cached."; | |
145 continue; | |
146 } | |
147 iter->second->CommitTransaction(); | |
148 } | |
149 open_transactions_.clear(); | |
150 | |
151 // Now that the transaction is over, we can expire old connections. | |
152 db_cache_.ShrinkToSize(kCacheDBSize); | |
153 } | |
154 | |
155 void TextDatabaseManager::InitDBList() { | |
156 if (present_databases_loaded_) | |
157 return; | |
158 | |
159 present_databases_loaded_ = true; | |
160 | |
161 // Find files on disk matching our pattern so we can quickly test for them. | |
162 base::FilePath::StringType filepattern(TextDatabase::file_base()); | |
163 filepattern.append(FILE_PATH_LITERAL("*")); | |
164 base::FileEnumerator enumerator( | |
165 dir_, false, base::FileEnumerator::FILES, filepattern); | |
166 base::FilePath cur_file; | |
167 while (!(cur_file = enumerator.Next()).empty()) { | |
168 // Convert to the number representing this file. | |
169 TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file); | |
170 if (id) // Will be 0 on error. | |
171 present_databases_.insert(id); | |
172 } | |
173 } | |
174 | |
175 void TextDatabaseManager::AddPageURL(const GURL& url, | |
176 URLID url_id, | |
177 VisitID visit_id, | |
178 Time time) { | |
179 // Delete any existing page info. | |
180 RecentChangeList::iterator found = recent_changes_.Peek(url); | |
181 if (found != recent_changes_.end()) | |
182 recent_changes_.Erase(found); | |
183 | |
184 // Just save this info for later. We will save it when it expires or when all | |
185 // the data is complete. | |
186 recent_changes_.Put(url, PageInfo(url_id, visit_id, time)); | |
187 } | |
188 | |
// Records the title for |url|. If the page has a pending entry in
// |recent_changes_|, the title is merged into it (and the entry committed if
// the body already arrived); otherwise the title is indexed against the most
// recent visit as a fallback.
void TextDatabaseManager::AddPageTitle(const GURL& url,
                                       const string16& title) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This is very much an edge
    // case as normally a title will come in <20 seconds after the page commits,
    // and WebContents will avoid spamming us with >1 title per page. However,
    // it could come up if your connection is unhappy, and we don't want to
    // miss anything.
    //
    // To solve this problem, we'll just associate the most recent visit with
    // the new title and index that using the regular code path.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    if (visit.is_indexed) {
      // If this page was already indexed, we could have a body that came in
      // first and we don't want to overwrite it. We could go query for the
      // current body, or have a special setter for only the title, but this is
      // not worth it for this edge case.
      //
      // It will be almost impossible for the title to take longer than
      // kExpirationSeconds yet we got a body in less than that time, since
      // the title should always come in first.
      return;
    }

    // Index the title against the most recent visit with an empty body.
    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                title, string16());
    return;
  }

  PageInfo& info = found->second;
  if (info.has_body()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                title, info.body());
    recent_changes_.Erase(found);
    return;
  }

  // The body hasn't arrived yet; hold the title in the pending entry until
  // it does (or until the entry expires and is flushed anyway).
  info.set_title(title);
}
236 | |
// Records the extracted body text for |url|. If the page has a pending entry
// in |recent_changes_|, the body is merged into it (and the entry committed
// if the title already arrived); otherwise the body is indexed against the
// most recent visit, using the URL table's title.
void TextDatabaseManager::AddPageContents(const GURL& url,
                                          const string16& body) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This means that the page
    // took more than kExpirationSeconds to load. Often, this will be the result
    // of a very slow iframe or other resource on the page that makes us think
    // it's still loading.
    //
    // As a fallback, set the most recent visit's contents using the input, and
    // use the last set title in the URL table as the title to index.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    // Use the title from the URL row as the title for the indexing.
    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                url_row.title(), body);
    return;
  }

  PageInfo& info = found->second;
  if (info.has_title()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                info.title(), body);
    recent_changes_.Erase(found);
    return;
  }

  // The title hasn't arrived yet; hold the body in the pending entry until
  // it does (or until the entry expires and is flushed anyway).
  info.set_body(body);
}
272 | |
// Writes one page's (title, body) into the full-text database for
// |visit_time|'s month, un-indexing any previously indexed visits for the
// same URL first. |visit_id| may be 0, in which case the visit database row
// is not updated. Returns true on success.
bool TextDatabaseManager::AddPageData(const GURL& url,
                                      URLID url_id,
                                      VisitID visit_id,
                                      Time visit_time,
                                      const string16& title,
                                      const string16& body) {
  // Creates the month's database file if it doesn't exist yet.
  TextDatabase* db = GetDBForTime(visit_time, true);
  if (!db)
    return false;

  TimeTicks beginning_time = TimeTicks::Now();

  // First delete any recently-indexed data for this page. This will delete
  // anything in the main database, but we don't bother looking through the
  // archived database.
  VisitVector visits;
  visit_database_->GetIndexedVisitsForURL(url_id, &visits);
  for (size_t i = 0; i < visits.size(); i++) {
    visits[i].is_indexed = false;
    visit_database_->UpdateVisitRow(visits[i]);
    // NULL change set: callers of AddPageData don't track these deletions.
    DeletePageData(visits[i].visit_time, url, NULL);
  }

  if (visit_id) {
    // We're supposed to update the visit database, so load the visit.
    VisitRow row;
    if (!visit_database_->GetRowForVisit(visit_id, &row)) {
      // This situation can occur if Chrome's history is in the process of
      // being updated, and then the browsing history is deleted before all
      // updates have been completely performed. In this case, a stale update
      // to the database is attempted, leading to the warning below.
      DLOG(WARNING) << "Could not find requested visit #" << visit_id;
      return false;
    }

    DCHECK(visit_time == row.visit_time);

    // Update the visit database to reference our addition.
    row.is_indexed = true;
    if (!visit_database_->UpdateVisitRow(row))
      return false;
  }

  // Now index the data.
  std::string url_str = URLDatabase::GURLToDatabaseURL(url);
  bool success = db->AddPageData(visit_time, url_str,
                                 ConvertStringForIndexer(title),
                                 ConvertStringForIndexer(body));

  UMA_HISTOGRAM_TIMES("History.AddFTSData",
                      TimeTicks::Now() - beginning_time);

  // Mirror the raw (uncollapsed) content to the publisher, if any.
  if (history_publisher_)
    history_publisher_->PublishPageContent(visit_time, url, title, body);

  return success;
}
330 | |
331 void TextDatabaseManager::DeletePageData(Time time, const GURL& url, | |
332 ChangeSet* change_set) { | |
333 TextDatabase::DBIdent db_ident = TimeToID(time); | |
334 | |
335 // We want to open the database for writing, but only if it exists. To | |
336 // achieve this, we check whether it exists by saying we're not going to | |
337 // write to it (avoiding the autocreation code normally called when writing) | |
338 // and then access it for writing only if it succeeds. | |
339 TextDatabase* db = GetDB(db_ident, false); | |
340 if (!db) | |
341 return; | |
342 db = GetDB(db_ident, true); | |
343 | |
344 if (change_set) | |
345 change_set->Add(db_ident); | |
346 | |
347 db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url)); | |
348 } | |
349 | |
// Removes uncommitted (in-memory) page entries whose visit times fall in
// [begin, end). A null |begin| means "since forever", a null |end| means
// "until now". If |restrict_urls| is non-empty, only entries for those URLs
// are removed.
void TextDatabaseManager::DeleteFromUncommitted(
    const std::set<GURL>& restrict_urls, Time begin, Time end) {
  // First find the beginning of the range to delete. Recall that the list
  // has the most recent item at the beginning. There won't normally be very
  // many items, so a brute-force search is fine.
  RecentChangeList::iterator cur = recent_changes_.begin();
  if (!end.is_null()) {
    // Walk from the beginning of the list backwards in time to find the newest
    // entry that should be deleted.
    while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
      ++cur;
  }

  // Now delete all visits up to the oldest one we were supposed to delete.
  // Note that if begin is_null, it will be less than or equal to any other
  // time.
  if (restrict_urls.empty()) {
    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
      cur = recent_changes_.Erase(cur);
  } else {
    // Same walk, but only erase entries whose URL is in the restricted set.
    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
      if (restrict_urls.find(cur->first) != restrict_urls.end())
        cur = recent_changes_.Erase(cur);
      else
        ++cur;
    }
  }
}
378 | |
// Removes uncommitted entries whose visit time exactly matches one of
// |times|. Uses a linear merge over the two reverse-chronological sequences,
// so the whole pass is O(|recent_changes_| + |times|).
void TextDatabaseManager::DeleteFromUncommittedForTimes(
    const std::vector<base::Time>& times) {
  // |times| must be in reverse chronological order, i.e. each member
  // must be earlier than or the same as the one before it.
  DCHECK(
      std::adjacent_find(
          times.begin(), times.end(), std::less<base::Time>()) ==
      times.end());

  // Both |recent_changes_| and |times| are in reverse chronological order.
  RecentChangeList::iterator it = recent_changes_.begin();
  std::vector<base::Time>::const_iterator time_it = times.begin();
  while (it != recent_changes_.end() && time_it != times.end()) {
    base::Time visit_time = it->second.visit_time();
    if (visit_time == *time_it) {
      // Exact match: drop the pending entry. Don't advance |time_it| in case
      // several entries share the same visit time.
      it = recent_changes_.Erase(it);
    } else if (visit_time < *time_it) {
      ++time_it;
    } else /* if (visit_time > *time_it) */ {
      ++it;
    }
  }
}
402 | |
403 void TextDatabaseManager::DeleteAll() { | |
404 DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction."; | |
405 | |
406 InitDBList(); | |
407 | |
408 // Delete uncommitted entries. | |
409 recent_changes_.Clear(); | |
410 | |
411 // Close all open databases. | |
412 db_cache_.Clear(); | |
413 | |
414 // Now go through and delete all the files. | |
415 for (DBIdentSet::iterator i = present_databases_.begin(); | |
416 i != present_databases_.end(); ++i) { | |
417 base::FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i)); | |
418 file_util::Delete(file_name, false); | |
419 } | |
420 } | |
421 | |
422 void TextDatabaseManager::OptimizeChangedDatabases( | |
423 const ChangeSet& change_set) { | |
424 for (ChangeSet::DBSet::const_iterator i = | |
425 change_set.changed_databases_.begin(); | |
426 i != change_set.changed_databases_.end(); ++i) { | |
427 // We want to open the database for writing, but only if it exists. To | |
428 // achieve this, we check whether it exists by saying we're not going to | |
429 // write to it (avoiding the autocreation code normally called when writing) | |
430 // and then access it for writing only if it succeeds. | |
431 TextDatabase* db = GetDB(*i, false); | |
432 if (!db) | |
433 continue; | |
434 db = GetDB(*i, true); | |
435 if (!db) | |
436 continue; // The file may have changed or something. | |
437 db->Optimize(); | |
438 } | |
439 } | |
440 | |
// Runs the full-text query over the per-month databases, newest first,
// filling |results| until options.EffectiveMaxCount() is reached or the time
// range is exhausted. |first_time_searched| receives the earliest time
// actually covered by the search (options.begin_time if everything was
// searched, or the time of the oldest result returned when we stopped early).
void TextDatabaseManager::GetTextMatches(
    const string16& query,
    const QueryOptions& options,
    std::vector<TextDatabase::Match>* results,
    Time* first_time_searched) {
  results->clear();

  // Default: we covered the whole requested range.
  *first_time_searched = options.begin_time;

  InitDBList();
  if (present_databases_.empty())
    return;  // Nothing to search.

  // Get the query into the proper format for the individual DBs.
  string16 fts_query16;
  query_parser_.ParseQuery(query, &fts_query16);
  std::string fts_query = UTF16ToUTF8(fts_query16);

  // Need a copy of the options so we can modify the max count for each call
  // to the individual databases.
  QueryOptions cur_options(options);

  // Compute the minimum and maximum values for the identifiers that could
  // encompass the input time range. A null bound falls back to the
  // oldest/newest database present on disk.
  TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
      *present_databases_.begin() :
      TimeToID(options.begin_time);
  TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
      *present_databases_.rbegin() :
      TimeToID(options.end_time);

  // Iterate over the databases from the most recent backwards.
  TextDatabase::URLSet found_urls;
  for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
       i != present_databases_.rend();
       ++i) {
    // TODO(brettw) allow canceling the query in the middle.
    // if (canceled_or_something)
    //   break;

    // This code is stupid, we just loop until we find the correct starting
    // time range rather than search in an intelligent way. Users will have a
    // few dozen files at most, so this should not be an issue.
    if (*i > max_ident)
      continue;  // Haven't gotten to the time range yet.
    if (*i < min_ident)
      break;  // Covered all the time range.

    // Read-only open; skip databases that fail to open.
    TextDatabase* cur_db = GetDB(*i, false);
    if (!cur_db)
      continue;

    // Adjust the max count according to how many results we've already got.
    if (options.max_count) {
      cur_options.max_count = options.max_count -
          static_cast<int>(results->size());
    }

    bool has_more_results = cur_db->GetTextMatches(
        fts_query, cur_options, results, &found_urls);

    DCHECK(static_cast<int>(results->size()) <= options.EffectiveMaxCount());

    if (has_more_results ||
        static_cast<int>(results->size()) == options.EffectiveMaxCount()) {
      // Since the search proceeds backwards in time, the last result we have
      // gives the first time searched.
      *first_time_searched = results->back().time;
      break;
    }
  }
}
513 | |
// Test-only accessor: number of pages waiting in memory to be committed.
size_t TextDatabaseManager::GetUncommittedEntryCountForTest() const {
  return recent_changes_.size();
}
517 | |
// Returns the database for |id|, opening (and, if |for_writing|, creating)
// it as needed and caching the connection. Returns NULL if the database
// can't be initialized. While a transaction is active, any database handed
// out for writing is enrolled in |open_transactions_| so CommitTransaction()
// can commit it, and cache eviction is deferred until the transaction ends.
TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
                                         bool for_writing) {
  DBCache::iterator found_db = db_cache_.Get(id);
  if (found_db != db_cache_.end()) {
    if (transaction_nesting_ && for_writing &&
        open_transactions_.find(id) == open_transactions_.end()) {
      // If we currently have an open transaction, that database is not yet
      // part of the transaction, and the database will be written to, it needs
      // to be part of our transaction.
      found_db->second->BeginTransaction();
      open_transactions_.insert(id);
    }
    return found_db->second;
  }

  // Need to make the database.
  TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
  if (!new_db->Init()) {
    delete new_db;
    return NULL;
  }
  // The cache owns the TextDatabase from here on.
  db_cache_.Put(id, new_db);
  present_databases_.insert(id);

  if (transaction_nesting_ && for_writing) {
    // If we currently have an open transaction and the new database will be
    // written to, it needs to be part of our transaction.
    new_db->BeginTransaction();
    open_transactions_.insert(id);
  }

  // When no transaction is open, allow this new one to kick out an old one.
  if (!transaction_nesting_)
    db_cache_.ShrinkToSize(kCacheDBSize);

  return new_db;
}
555 | |
// Convenience wrapper: returns the database for the month containing |time|.
TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
                                                bool create_if_necessary) {
  return GetDB(TimeToID(time), create_if_necessary);
}
560 | |
// (Re)schedules a FlushOldChanges() run kExpirationSeconds from now.
void TextDatabaseManager::ScheduleFlushOldChanges() {
  // Invalidate outstanding weak pointers first so any previously posted
  // flush task is effectively canceled — only one flush is ever pending.
  weak_factory_.InvalidateWeakPtrs();
  base::MessageLoop::current()->PostDelayedTask(
      FROM_HERE,
      base::Bind(&TextDatabaseManager::FlushOldChanges,
                 weak_factory_.GetWeakPtr()),
      base::TimeDelta::FromSeconds(kExpirationSeconds));
}
569 | |
// Timer callback: flush everything that has expired as of now.
void TextDatabaseManager::FlushOldChanges() {
  FlushOldChangesForTime(TimeTicks::Now());
}
573 | |
// Commits every pending entry that has expired as of |now| (oldest first),
// then schedules the next flush. |now| is a parameter so tests can inject a
// time.
void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
  // The end of the list is the oldest, so we just start from there committing
  // things until we get something too new.
  RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
  while (i != recent_changes_.rend() && i->second.Expired(now)) {
    // Commit with whatever title/body we have (possibly empty).
    AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
                i->second.visit_time(), i->second.title(), i->second.body());
    i = recent_changes_.Erase(i);
  }

  ScheduleFlushOldChanges();
}
586 | |
587 } // namespace history | |
OLD | NEW |