Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(608)

Side by Side Diff: chrome/browser/history/text_database_manager.cc

Issue 16951015: Remove TextDatabase from the history service. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@replace_fts
Patch Set: Created 7 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/history/text_database_manager.h"
6
7 #include <algorithm>
8 #include <functional>
9
10 #include "base/bind.h"
11 #include "base/compiler_specific.h"
12 #include "base/file_util.h"
13 #include "base/files/file_enumerator.h"
14 #include "base/logging.h"
15 #include "base/message_loop.h"
16 #include "base/metrics/histogram.h"
17 #include "base/strings/string_util.h"
18 #include "base/strings/utf_string_conversions.h"
19 #include "chrome/browser/history/history_publisher.h"
20 #include "chrome/browser/history/visit_database.h"
21
22 using base::Time;
23 using base::TimeDelta;
24 using base::TimeTicks;
25
26 namespace history {
27
28 namespace {
29
30 // The number of database files we will be attached to at once.
31 const int kCacheDBSize = 5;
32
33 std::string ConvertStringForIndexer(const string16& input) {
34 // TODO(evanm): other transformations here?
35 return UTF16ToUTF8(CollapseWhitespace(input, false));
36 }
37
38 // Data older than this will be committed to the full text index even if we
39 // haven't gotten a title and/or body.
40 const int kExpirationSeconds = 20;
41
42 } // namespace
43
44 // TextDatabaseManager::ChangeSet ----------------------------------------------
45
46 TextDatabaseManager::ChangeSet::ChangeSet() {}
47
48 TextDatabaseManager::ChangeSet::~ChangeSet() {}
49
50 // TextDatabaseManager::PageInfo -----------------------------------------------
51
52 TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
53 VisitID visit_id,
54 Time visit_time)
55 : url_id_(url_id),
56 visit_id_(visit_id),
57 visit_time_(visit_time) {
58 added_time_ = TimeTicks::Now();
59 }
60
61 TextDatabaseManager::PageInfo::~PageInfo() {}
62
63 void TextDatabaseManager::PageInfo::set_title(const string16& ttl) {
64 if (ttl.empty()) // Make the title nonempty when we set it for EverybodySet.
65 title_ = ASCIIToUTF16(" ");
66 else
67 title_ = ttl;
68 }
69
70 void TextDatabaseManager::PageInfo::set_body(const string16& bdy) {
71 if (bdy.empty()) // Make the body nonempty when we set it for EverybodySet.
72 body_ = ASCIIToUTF16(" ");
73 else
74 body_ = bdy;
75 }
76
77 bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const {
78 return now - added_time_ > base::TimeDelta::FromSeconds(kExpirationSeconds);
79 }
80
81 // TextDatabaseManager ---------------------------------------------------------
82
83 TextDatabaseManager::TextDatabaseManager(const base::FilePath& dir,
84 URLDatabase* url_database,
85 VisitDatabase* visit_database)
86 : dir_(dir),
87 url_database_(url_database),
88 visit_database_(visit_database),
89 recent_changes_(RecentChangeList::NO_AUTO_EVICT),
90 transaction_nesting_(0),
91 db_cache_(DBCache::NO_AUTO_EVICT),
92 present_databases_loaded_(false),
93 weak_factory_(this),
94 history_publisher_(NULL) {
95 }
96
97 TextDatabaseManager::~TextDatabaseManager() {
98 if (transaction_nesting_)
99 CommitTransaction();
100 }
101
102 // static
103 TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
104 Time::Exploded exploded;
105 time.UTCExplode(&exploded);
106
107 // We combine the month and year into a 6-digit number (200801 for
108 // January, 2008). The month is 1-based.
109 return exploded.year * 100 + exploded.month;
110 }
111
112 // static
113 Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) {
114 Time::Exploded exploded;
115 memset(&exploded, 0, sizeof(Time::Exploded));
116 exploded.year = id / 100;
117 exploded.month = id % 100;
118 return Time::FromUTCExploded(exploded);
119 }
120
121 bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
122 history_publisher_ = history_publisher;
123
124 // Start checking recent changes and committing them.
125 ScheduleFlushOldChanges();
126 return true;
127 }
128
129 void TextDatabaseManager::BeginTransaction() {
130 transaction_nesting_++;
131 }
132
133 void TextDatabaseManager::CommitTransaction() {
134 DCHECK(transaction_nesting_);
135 transaction_nesting_--;
136 if (transaction_nesting_)
137 return; // Still more nesting of transactions before committing.
138
139 // Commit all databases with open transactions on them.
140 for (DBIdentSet::const_iterator i = open_transactions_.begin();
141 i != open_transactions_.end(); ++i) {
142 DBCache::iterator iter = db_cache_.Get(*i);
143 if (iter == db_cache_.end()) {
144 NOTREACHED() << "All open transactions should be cached.";
145 continue;
146 }
147 iter->second->CommitTransaction();
148 }
149 open_transactions_.clear();
150
151 // Now that the transaction is over, we can expire old connections.
152 db_cache_.ShrinkToSize(kCacheDBSize);
153 }
154
155 void TextDatabaseManager::InitDBList() {
156 if (present_databases_loaded_)
157 return;
158
159 present_databases_loaded_ = true;
160
161 // Find files on disk matching our pattern so we can quickly test for them.
162 base::FilePath::StringType filepattern(TextDatabase::file_base());
163 filepattern.append(FILE_PATH_LITERAL("*"));
164 base::FileEnumerator enumerator(
165 dir_, false, base::FileEnumerator::FILES, filepattern);
166 base::FilePath cur_file;
167 while (!(cur_file = enumerator.Next()).empty()) {
168 // Convert to the number representing this file.
169 TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file);
170 if (id) // Will be 0 on error.
171 present_databases_.insert(id);
172 }
173 }
174
175 void TextDatabaseManager::AddPageURL(const GURL& url,
176 URLID url_id,
177 VisitID visit_id,
178 Time time) {
179 // Delete any existing page info.
180 RecentChangeList::iterator found = recent_changes_.Peek(url);
181 if (found != recent_changes_.end())
182 recent_changes_.Erase(found);
183
184 // Just save this info for later. We will save it when it expires or when all
185 // the data is complete.
186 recent_changes_.Put(url, PageInfo(url_id, visit_id, time));
187 }
188
189 void TextDatabaseManager::AddPageTitle(const GURL& url,
190 const string16& title) {
191 RecentChangeList::iterator found = recent_changes_.Peek(url);
192 if (found == recent_changes_.end()) {
193 // This page is not in our cache of recent pages. This is very much an edge
194 // case as normally a title will come in <20 seconds after the page commits,
195 // and WebContents will avoid spamming us with >1 title per page. However,
196 // it could come up if your connection is unhappy, and we don't want to
197 // miss anything.
198 //
199 // To solve this problem, we'll just associate the most recent visit with
200 // the new title and index that using the regular code path.
201 URLRow url_row;
202 if (!url_database_->GetRowForURL(url, &url_row))
203 return; // URL is unknown, give up.
204 VisitRow visit;
205 if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
206 return; // No recent visit, give up.
207
208 if (visit.is_indexed) {
209 // If this page was already indexed, we could have a body that came in
210 // first and we don't want to overwrite it. We could go query for the
211 // current body, or have a special setter for only the title, but this is
212 // not worth it for this edge case.
213 //
214 // It will be almost impossible for the title to take longer than
215 // kExpirationSeconds yet we got a body in less than that time, since
216 // the title should always come in first.
217 return;
218 }
219
220 AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
221 title, string16());
222 return; // We don't know about this page, give up.
223 }
224
225 PageInfo& info = found->second;
226 if (info.has_body()) {
227 // This info is complete, write to the database.
228 AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
229 title, info.body());
230 recent_changes_.Erase(found);
231 return;
232 }
233
234 info.set_title(title);
235 }
236
237 void TextDatabaseManager::AddPageContents(const GURL& url,
238 const string16& body) {
239 RecentChangeList::iterator found = recent_changes_.Peek(url);
240 if (found == recent_changes_.end()) {
241 // This page is not in our cache of recent pages. This means that the page
242 // took more than kExpirationSeconds to load. Often, this will be the result
243 // of a very slow iframe or other resource on the page that makes us think
244 // it's still loading.
245 //
246 // As a fallback, set the most recent visit's contents using the input, and
247 // use the last set title in the URL table as the title to index.
248 URLRow url_row;
249 if (!url_database_->GetRowForURL(url, &url_row))
250 return; // URL is unknown, give up.
251 VisitRow visit;
252 if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
253 return; // No recent visit, give up.
254
255 // Use the title from the URL row as the title for the indexing.
256 AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
257 url_row.title(), body);
258 return;
259 }
260
261 PageInfo& info = found->second;
262 if (info.has_title()) {
263 // This info is complete, write to the database.
264 AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
265 info.title(), body);
266 recent_changes_.Erase(found);
267 return;
268 }
269
270 info.set_body(body);
271 }
272
273 bool TextDatabaseManager::AddPageData(const GURL& url,
274 URLID url_id,
275 VisitID visit_id,
276 Time visit_time,
277 const string16& title,
278 const string16& body) {
279 TextDatabase* db = GetDBForTime(visit_time, true);
280 if (!db)
281 return false;
282
283 TimeTicks beginning_time = TimeTicks::Now();
284
285 // First delete any recently-indexed data for this page. This will delete
286 // anything in the main database, but we don't bother looking through the
287 // archived database.
288 VisitVector visits;
289 visit_database_->GetIndexedVisitsForURL(url_id, &visits);
290 for (size_t i = 0; i < visits.size(); i++) {
291 visits[i].is_indexed = false;
292 visit_database_->UpdateVisitRow(visits[i]);
293 DeletePageData(visits[i].visit_time, url, NULL);
294 }
295
296 if (visit_id) {
297 // We're supposed to update the visit database, so load the visit.
298 VisitRow row;
299 if (!visit_database_->GetRowForVisit(visit_id, &row)) {
300 // This situation can occur if Chrome's history is in the process of
301 // being updated, and then the browsing history is deleted before all
302 // updates have been completely performed. In this case, a stale update
303 // to the database is attempted, leading to the warning below.
304 DLOG(WARNING) << "Could not find requested visit #" << visit_id;
305 return false;
306 }
307
308 DCHECK(visit_time == row.visit_time);
309
310 // Update the visit database to reference our addition.
311 row.is_indexed = true;
312 if (!visit_database_->UpdateVisitRow(row))
313 return false;
314 }
315
316 // Now index the data.
317 std::string url_str = URLDatabase::GURLToDatabaseURL(url);
318 bool success = db->AddPageData(visit_time, url_str,
319 ConvertStringForIndexer(title),
320 ConvertStringForIndexer(body));
321
322 UMA_HISTOGRAM_TIMES("History.AddFTSData",
323 TimeTicks::Now() - beginning_time);
324
325 if (history_publisher_)
326 history_publisher_->PublishPageContent(visit_time, url, title, body);
327
328 return success;
329 }
330
331 void TextDatabaseManager::DeletePageData(Time time, const GURL& url,
332 ChangeSet* change_set) {
333 TextDatabase::DBIdent db_ident = TimeToID(time);
334
335 // We want to open the database for writing, but only if it exists. To
336 // achieve this, we check whether it exists by saying we're not going to
337 // write to it (avoiding the autocreation code normally called when writing)
338 // and then access it for writing only if it succeeds.
339 TextDatabase* db = GetDB(db_ident, false);
340 if (!db)
341 return;
342 db = GetDB(db_ident, true);
343
344 if (change_set)
345 change_set->Add(db_ident);
346
347 db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url));
348 }
349
350 void TextDatabaseManager::DeleteFromUncommitted(
351 const std::set<GURL>& restrict_urls, Time begin, Time end) {
352 // First find the beginning of the range to delete. Recall that the list
353 // has the most recent item at the beginning. There won't normally be very
354 // many items, so a brute-force search is fine.
355 RecentChangeList::iterator cur = recent_changes_.begin();
356 if (!end.is_null()) {
357 // Walk from the beginning of the list backwards in time to find the newest
358 // entry that should be deleted.
359 while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
360 ++cur;
361 }
362
363 // Now delete all visits up to the oldest one we were supposed to delete.
364 // Note that if begin is_null, it will be less than or equal to any other
365 // time.
366 if (restrict_urls.empty()) {
367 while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
368 cur = recent_changes_.Erase(cur);
369 } else {
370 while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
371 if (restrict_urls.find(cur->first) != restrict_urls.end())
372 cur = recent_changes_.Erase(cur);
373 else
374 ++cur;
375 }
376 }
377 }
378
379 void TextDatabaseManager::DeleteFromUncommittedForTimes(
380 const std::vector<base::Time>& times) {
381 // |times| must be in reverse chronological order, i.e. each member
382 // must be earlier than or the same as the one before it.
383 DCHECK(
384 std::adjacent_find(
385 times.begin(), times.end(), std::less<base::Time>()) ==
386 times.end());
387
388 // Both |recent_changes_| and |times| are in reverse chronological order.
389 RecentChangeList::iterator it = recent_changes_.begin();
390 std::vector<base::Time>::const_iterator time_it = times.begin();
391 while (it != recent_changes_.end() && time_it != times.end()) {
392 base::Time visit_time = it->second.visit_time();
393 if (visit_time == *time_it) {
394 it = recent_changes_.Erase(it);
395 } else if (visit_time < *time_it) {
396 ++time_it;
397 } else /* if (visit_time > *time_it) */ {
398 ++it;
399 }
400 }
401 }
402
403 void TextDatabaseManager::DeleteAll() {
404 DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction.";
405
406 InitDBList();
407
408 // Delete uncommitted entries.
409 recent_changes_.Clear();
410
411 // Close all open databases.
412 db_cache_.Clear();
413
414 // Now go through and delete all the files.
415 for (DBIdentSet::iterator i = present_databases_.begin();
416 i != present_databases_.end(); ++i) {
417 base::FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i));
418 file_util::Delete(file_name, false);
419 }
420 }
421
422 void TextDatabaseManager::OptimizeChangedDatabases(
423 const ChangeSet& change_set) {
424 for (ChangeSet::DBSet::const_iterator i =
425 change_set.changed_databases_.begin();
426 i != change_set.changed_databases_.end(); ++i) {
427 // We want to open the database for writing, but only if it exists. To
428 // achieve this, we check whether it exists by saying we're not going to
429 // write to it (avoiding the autocreation code normally called when writing)
430 // and then access it for writing only if it succeeds.
431 TextDatabase* db = GetDB(*i, false);
432 if (!db)
433 continue;
434 db = GetDB(*i, true);
435 if (!db)
436 continue; // The file may have changed or something.
437 db->Optimize();
438 }
439 }
440
441 void TextDatabaseManager::GetTextMatches(
442 const string16& query,
443 const QueryOptions& options,
444 std::vector<TextDatabase::Match>* results,
445 Time* first_time_searched) {
446 results->clear();
447
448 *first_time_searched = options.begin_time;
449
450 InitDBList();
451 if (present_databases_.empty())
452 return; // Nothing to search.
453
454 // Get the query into the proper format for the individual DBs.
455 string16 fts_query16;
456 query_parser_.ParseQuery(query, &fts_query16);
457 std::string fts_query = UTF16ToUTF8(fts_query16);
458
459 // Need a copy of the options so we can modify the max count for each call
460 // to the individual databases.
461 QueryOptions cur_options(options);
462
463 // Compute the minimum and maximum values for the identifiers that could
464 // encompass the input time range.
465 TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
466 *present_databases_.begin() :
467 TimeToID(options.begin_time);
468 TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
469 *present_databases_.rbegin() :
470 TimeToID(options.end_time);
471
472 // Iterate over the databases from the most recent backwards.
473 TextDatabase::URLSet found_urls;
474 for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
475 i != present_databases_.rend();
476 ++i) {
477 // TODO(brettw) allow canceling the query in the middle.
478 // if (canceled_or_something)
479 // break;
480
481 // This code is stupid, we just loop until we find the correct starting
482 // time range rather than search in an intelligent way. Users will have a
483 // few dozen files at most, so this should not be an issue.
484 if (*i > max_ident)
485 continue; // Haven't gotten to the time range yet.
486 if (*i < min_ident)
487 break; // Covered all the time range.
488
489 TextDatabase* cur_db = GetDB(*i, false);
490 if (!cur_db)
491 continue;
492
493 // Adjust the max count according to how many results we've already got.
494 if (options.max_count) {
495 cur_options.max_count = options.max_count -
496 static_cast<int>(results->size());
497 }
498
499 bool has_more_results = cur_db->GetTextMatches(
500 fts_query, cur_options, results, &found_urls);
501
502 DCHECK(static_cast<int>(results->size()) <= options.EffectiveMaxCount());
503
504 if (has_more_results ||
505 static_cast<int>(results->size()) == options.EffectiveMaxCount()) {
506 // Since the search proceeds backwards in time, the last result we have
507 // gives the first time searched.
508 *first_time_searched = results->back().time;
509 break;
510 }
511 }
512 }
513
514 size_t TextDatabaseManager::GetUncommittedEntryCountForTest() const {
515 return recent_changes_.size();
516 }
517
518 TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
519 bool for_writing) {
520 DBCache::iterator found_db = db_cache_.Get(id);
521 if (found_db != db_cache_.end()) {
522 if (transaction_nesting_ && for_writing &&
523 open_transactions_.find(id) == open_transactions_.end()) {
524 // If we currently have an open transaction, that database is not yet
525 // part of the transaction, and the database will be written to, it needs
526 // to be part of our transaction.
527 found_db->second->BeginTransaction();
528 open_transactions_.insert(id);
529 }
530 return found_db->second;
531 }
532
533 // Need to make the database.
534 TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
535 if (!new_db->Init()) {
536 delete new_db;
537 return NULL;
538 }
539 db_cache_.Put(id, new_db);
540 present_databases_.insert(id);
541
542 if (transaction_nesting_ && for_writing) {
543 // If we currently have an open transaction and the new database will be
544 // written to, it needs to be part of our transaction.
545 new_db->BeginTransaction();
546 open_transactions_.insert(id);
547 }
548
549 // When no transaction is open, allow this new one to kick out an old one.
550 if (!transaction_nesting_)
551 db_cache_.ShrinkToSize(kCacheDBSize);
552
553 return new_db;
554 }
555
556 TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
557 bool create_if_necessary) {
558 return GetDB(TimeToID(time), create_if_necessary);
559 }
560
561 void TextDatabaseManager::ScheduleFlushOldChanges() {
562 weak_factory_.InvalidateWeakPtrs();
563 base::MessageLoop::current()->PostDelayedTask(
564 FROM_HERE,
565 base::Bind(&TextDatabaseManager::FlushOldChanges,
566 weak_factory_.GetWeakPtr()),
567 base::TimeDelta::FromSeconds(kExpirationSeconds));
568 }
569
570 void TextDatabaseManager::FlushOldChanges() {
571 FlushOldChangesForTime(TimeTicks::Now());
572 }
573
574 void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
575 // The end of the list is the oldest, so we just start from there committing
576 // things until we get something too new.
577 RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
578 while (i != recent_changes_.rend() && i->second.Expired(now)) {
579 AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
580 i->second.visit_time(), i->second.title(), i->second.body());
581 i = recent_changes_.Erase(i);
582 }
583
584 ScheduleFlushOldChanges();
585 }
586
587 } // namespace history
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698