OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/history/text_database_manager.h" | |
6 | |
7 #include <algorithm> | |
8 #include <functional> | |
9 | |
10 #include "base/bind.h" | |
11 #include "base/compiler_specific.h" | |
12 #include "base/file_util.h" | |
13 #include "base/files/file_enumerator.h" | |
14 #include "base/logging.h" | |
15 #include "base/message_loop.h" | |
16 #include "base/metrics/histogram.h" | |
17 #include "base/strings/string_util.h" | |
18 #include "base/strings/utf_string_conversions.h" | |
19 #include "chrome/browser/history/history_publisher.h" | |
20 #include "chrome/browser/history/visit_database.h" | |
21 | |
22 using base::Time; | |
23 using base::TimeDelta; | |
24 using base::TimeTicks; | |
25 | |
26 namespace history { | |
27 | |
28 namespace { | |
29 | |
30 // The number of database files we will be attached to at once. | |
31 const int kCacheDBSize = 5; | |
32 | |
33 std::string ConvertStringForIndexer(const string16& input) { | |
34 // TODO(evanm): other transformations here? | |
35 return UTF16ToUTF8(CollapseWhitespace(input, false)); | |
36 } | |
37 | |
38 // Data older than this will be committed to the full text index even if we | |
39 // haven't gotten a title and/or body. | |
40 const int kExpirationSeconds = 20; | |
41 | |
42 } // namespace | |
43 | |
44 // TextDatabaseManager::ChangeSet ---------------------------------------------- | |
45 | |
// ChangeSet members are out-of-line so the (implicit) set operations are not
// inlined into every caller.
TextDatabaseManager::ChangeSet::ChangeSet() {}

TextDatabaseManager::ChangeSet::~ChangeSet() {}
49 | |
50 // TextDatabaseManager::PageInfo ----------------------------------------------- | |
51 | |
// A PageInfo tracks one not-yet-committed page. |added_time_| records when it
// entered the cache so Expired() can decide when to flush it anyway.
TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
                                        VisitID visit_id,
                                        Time visit_time)
    : url_id_(url_id),
      visit_id_(visit_id),
      visit_time_(visit_time) {
  // Wall-clock-independent timestamp used only for expiration bookkeeping.
  added_time_ = TimeTicks::Now();
}

TextDatabaseManager::PageInfo::~PageInfo() {}
62 | |
63 void TextDatabaseManager::PageInfo::set_title(const string16& ttl) { | |
64 if (ttl.empty()) // Make the title nonempty when we set it for EverybodySet. | |
65 title_ = ASCIIToUTF16(" "); | |
66 else | |
67 title_ = ttl; | |
68 } | |
69 | |
70 void TextDatabaseManager::PageInfo::set_body(const string16& bdy) { | |
71 if (bdy.empty()) // Make the body nonempty when we set it for EverybodySet. | |
72 body_ = ASCIIToUTF16(" "); | |
73 else | |
74 body_ = bdy; | |
75 } | |
76 | |
77 bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const { | |
78 return now - added_time_ > base::TimeDelta::FromSeconds(kExpirationSeconds); | |
79 } | |
80 | |
81 // TextDatabaseManager --------------------------------------------------------- | |
82 | |
// |url_database| and |visit_database| are raw pointers supplied by the
// caller; this class does not delete them (presumably they outlive us —
// owned by the history backend that also owns this manager; verify there).
// Both MRU caches use NO_AUTO_EVICT: eviction happens only via explicit
// ShrinkToSize()/Erase() calls elsewhere in this file.
TextDatabaseManager::TextDatabaseManager(const base::FilePath& dir,
                                         URLDatabase* url_database,
                                         VisitDatabase* visit_database)
    : dir_(dir),
      url_database_(url_database),
      visit_database_(visit_database),
      recent_changes_(RecentChangeList::NO_AUTO_EVICT),
      transaction_nesting_(0),
      db_cache_(DBCache::NO_AUTO_EVICT),
      present_databases_loaded_(false),
      weak_factory_(this),
      history_publisher_(NULL) {
}
96 | |
97 TextDatabaseManager::~TextDatabaseManager() { | |
98 if (transaction_nesting_) | |
99 CommitTransaction(); | |
100 } | |
101 | |
// static
// Maps a visit time to the identifier of the per-month database file that
// stores it.
TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
  Time::Exploded exploded;
  time.UTCExplode(&exploded);

  // We combine the month and year into a 6-digit number (200801 for
  // January, 2008). The month is 1-based. UTC keeps the time->file mapping
  // stable regardless of the local timezone.
  return exploded.year * 100 + exploded.month;
}
111 | |
112 // static | |
113 Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) { | |
114 Time::Exploded exploded; | |
115 memset(&exploded, 0, sizeof(Time::Exploded)); | |
116 exploded.year = id / 100; | |
117 exploded.month = id % 100; | |
118 return Time::FromUTCExploded(exploded); | |
119 } | |
120 | |
// Stores the (possibly NULL) publisher and kicks off the periodic flush of
// uncommitted page data. Always returns true.
bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
  history_publisher_ = history_publisher;

  // Start checking recent changes and committing them.
  ScheduleFlushOldChanges();
  return true;
}
128 | |
129 void TextDatabaseManager::BeginTransaction() { | |
130 transaction_nesting_++; | |
131 } | |
132 | |
133 void TextDatabaseManager::CommitTransaction() { | |
134 DCHECK(transaction_nesting_); | |
135 transaction_nesting_--; | |
136 if (transaction_nesting_) | |
137 return; // Still more nesting of transactions before committing. | |
138 | |
139 // Commit all databases with open transactions on them. | |
140 for (DBIdentSet::const_iterator i = open_transactions_.begin(); | |
141 i != open_transactions_.end(); ++i) { | |
142 DBCache::iterator iter = db_cache_.Get(*i); | |
143 if (iter == db_cache_.end()) { | |
144 NOTREACHED() << "All open transactions should be cached."; | |
145 continue; | |
146 } | |
147 iter->second->CommitTransaction(); | |
148 } | |
149 open_transactions_.clear(); | |
150 | |
151 // Now that the transaction is over, we can expire old connections. | |
152 db_cache_.ShrinkToSize(kCacheDBSize); | |
153 } | |
154 | |
155 void TextDatabaseManager::InitDBList() { | |
156 if (present_databases_loaded_) | |
157 return; | |
158 | |
159 present_databases_loaded_ = true; | |
160 | |
161 // Find files on disk matching our pattern so we can quickly test for them. | |
162 base::FilePath::StringType filepattern(TextDatabase::file_base()); | |
163 filepattern.append(FILE_PATH_LITERAL("*")); | |
164 base::FileEnumerator enumerator( | |
165 dir_, false, base::FileEnumerator::FILES, filepattern); | |
166 base::FilePath cur_file; | |
167 while (!(cur_file = enumerator.Next()).empty()) { | |
168 // Convert to the number representing this file. | |
169 TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file); | |
170 if (id) // Will be 0 on error. | |
171 present_databases_.insert(id); | |
172 } | |
173 } | |
174 | |
175 void TextDatabaseManager::AddPageURL(const GURL& url, | |
176 URLID url_id, | |
177 VisitID visit_id, | |
178 Time time) { | |
179 // Delete any existing page info. | |
180 RecentChangeList::iterator found = recent_changes_.Peek(url); | |
181 if (found != recent_changes_.end()) | |
182 recent_changes_.Erase(found); | |
183 | |
184 // Just save this info for later. We will save it when it expires or when all | |
185 // the data is complete. | |
186 recent_changes_.Put(url, PageInfo(url_id, visit_id, time)); | |
187 } | |
188 | |
// Records the title for |url|. If the page has a pending entry in
// |recent_changes_|, the title is merged into it (and the entry committed if
// the body already arrived); otherwise the title is indexed against the most
// recent visit as a fallback.
void TextDatabaseManager::AddPageTitle(const GURL& url,
                                       const string16& title) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This is very much an edge
    // case as normally a title will come in <20 seconds after the page commits,
    // and WebContents will avoid spamming us with >1 title per page. However,
    // it could come up if your connection is unhappy, and we don't want to
    // miss anything.
    //
    // To solve this problem, we'll just associate the most recent visit with
    // the new title and index that using the regular code path.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    if (visit.is_indexed) {
      // If this page was already indexed, we could have a body that came in
      // first and we don't want to overwrite it. We could go query for the
      // current body, or have a special setter for only the title, but this is
      // not worth it for this edge case.
      //
      // It will be almost impossible for the title to take longer than
      // kExpirationSeconds yet we got a body in less than that time, since
      // the title should always come in first.
      return;
    }

    // Index the title against the most recent visit with an empty body.
    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                title, string16());
    return;
  }

  PageInfo& info = found->second;
  if (info.has_body()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                title, info.body());
    recent_changes_.Erase(found);
    return;
  }

  // The body hasn't arrived yet; hold the title in the pending entry until
  // it does (or until the entry expires and is flushed anyway).
  info.set_title(title);
}
236 | |
// Records the extracted body text for |url|. If the page has a pending entry
// in |recent_changes_|, the body is merged into it (and the entry committed
// if the title already arrived); otherwise the body is indexed against the
// most recent visit, using the URL table's title.
void TextDatabaseManager::AddPageContents(const GURL& url,
                                          const string16& body) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This means that the page
    // took more than kExpirationSeconds to load. Often, this will be the result
    // of a very slow iframe or other resource on the page that makes us think
    // it's still loading.
    //
    // As a fallback, set the most recent visit's contents using the input, and
    // use the last set title in the URL table as the title to index.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    // Use the title from the URL row as the title for the indexing.
    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                url_row.title(), body);
    return;
  }

  PageInfo& info = found->second;
  if (info.has_title()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                info.title(), body);
    recent_changes_.Erase(found);
    return;
  }

  // The title hasn't arrived yet; hold the body in the pending entry until
  // it does (or until the entry expires and is flushed anyway).
  info.set_body(body);
}
272 | |
// Writes one page's (title, body) into the full-text database for
// |visit_time|'s month, un-indexing any previously indexed visits for the
// same URL first. |visit_id| may be 0, in which case the visit database row
// is not updated. Returns true on success.
bool TextDatabaseManager::AddPageData(const GURL& url,
                                      URLID url_id,
                                      VisitID visit_id,
                                      Time visit_time,
                                      const string16& title,
                                      const string16& body) {
  // Creates the month's database file if it doesn't exist yet.
  TextDatabase* db = GetDBForTime(visit_time, true);
  if (!db)
    return false;

  TimeTicks beginning_time = TimeTicks::Now();

  // First delete any recently-indexed data for this page. This will delete
  // anything in the main database, but we don't bother looking through the
  // archived database.
  VisitVector visits;
  visit_database_->GetIndexedVisitsForURL(url_id, &visits);
  for (size_t i = 0; i < visits.size(); i++) {
    visits[i].is_indexed = false;
    visit_database_->UpdateVisitRow(visits[i]);
    // NULL change set: callers of AddPageData don't track these deletions.
    DeletePageData(visits[i].visit_time, url, NULL);
  }

  if (visit_id) {
    // We're supposed to update the visit database, so load the visit.
    VisitRow row;
    if (!visit_database_->GetRowForVisit(visit_id, &row)) {
      // This situation can occur if Chrome's history is in the process of
      // being updated, and then the browsing history is deleted before all
      // updates have been completely performed. In this case, a stale update
      // to the database is attempted, leading to the warning below.
      DLOG(WARNING) << "Could not find requested visit #" << visit_id;
      return false;
    }

    DCHECK(visit_time == row.visit_time);

    // Update the visit database to reference our addition.
    row.is_indexed = true;
    if (!visit_database_->UpdateVisitRow(row))
      return false;
  }

  // Now index the data.
  std::string url_str = URLDatabase::GURLToDatabaseURL(url);
  bool success = db->AddPageData(visit_time, url_str,
                                 ConvertStringForIndexer(title),
                                 ConvertStringForIndexer(body));

  UMA_HISTOGRAM_TIMES("History.AddFTSData",
                      TimeTicks::Now() - beginning_time);

  // Mirror the raw (uncollapsed) content to the publisher, if any.
  if (history_publisher_)
    history_publisher_->PublishPageContent(visit_time, url, title, body);

  return success;
}
330 | |
331 void TextDatabaseManager::DeletePageData(Time time, const GURL& url, | |
332 ChangeSet* change_set) { | |
333 TextDatabase::DBIdent db_ident = TimeToID(time); | |
334 | |
335 // We want to open the database for writing, but only if it exists. To | |
336 // achieve this, we check whether it exists by saying we're not going to | |
337 // write to it (avoiding the autocreation code normally called when writing) | |
338 // and then access it for writing only if it succeeds. | |
339 TextDatabase* db = GetDB(db_ident, false); | |
340 if (!db) | |
341 return; | |
342 db = GetDB(db_ident, true); | |
343 | |
344 if (change_set) | |
345 change_set->Add(db_ident); | |
346 | |
347 db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url)); | |
348 } | |
349 | |
// Removes uncommitted (in-memory) page entries whose visit times fall in
// [begin, end). A null |begin| means "since forever", a null |end| means
// "until now". If |restrict_urls| is non-empty, only entries for those URLs
// are removed.
void TextDatabaseManager::DeleteFromUncommitted(
    const std::set<GURL>& restrict_urls, Time begin, Time end) {
  // First find the beginning of the range to delete. Recall that the list
  // has the most recent item at the beginning. There won't normally be very
  // many items, so a brute-force search is fine.
  RecentChangeList::iterator cur = recent_changes_.begin();
  if (!end.is_null()) {
    // Walk from the beginning of the list backwards in time to find the newest
    // entry that should be deleted.
    while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
      ++cur;
  }

  // Now delete all visits up to the oldest one we were supposed to delete.
  // Note that if begin is_null, it will be less than or equal to any other
  // time.
  if (restrict_urls.empty()) {
    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
      cur = recent_changes_.Erase(cur);
  } else {
    // Same walk, but only erase entries whose URL is in the restricted set.
    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
      if (restrict_urls.find(cur->first) != restrict_urls.end())
        cur = recent_changes_.Erase(cur);
      else
        ++cur;
    }
  }
}
378 | |
// Removes uncommitted entries whose visit time exactly matches one of
// |times|. Uses a linear merge over the two reverse-chronological sequences,
// so the whole pass is O(|recent_changes_| + |times|).
void TextDatabaseManager::DeleteFromUncommittedForTimes(
    const std::vector<base::Time>& times) {
  // |times| must be in reverse chronological order, i.e. each member
  // must be earlier than or the same as the one before it.
  DCHECK(
      std::adjacent_find(
          times.begin(), times.end(), std::less<base::Time>()) ==
      times.end());

  // Both |recent_changes_| and |times| are in reverse chronological order.
  RecentChangeList::iterator it = recent_changes_.begin();
  std::vector<base::Time>::const_iterator time_it = times.begin();
  while (it != recent_changes_.end() && time_it != times.end()) {
    base::Time visit_time = it->second.visit_time();
    if (visit_time == *time_it) {
      // Exact match: drop the pending entry. Don't advance |time_it| in case
      // several entries share the same visit time.
      it = recent_changes_.Erase(it);
    } else if (visit_time < *time_it) {
      ++time_it;
    } else /* if (visit_time > *time_it) */ {
      ++it;
    }
  }
}
402 | |
403 void TextDatabaseManager::DeleteAll() { | |
404 DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction."; | |
405 | |
406 InitDBList(); | |
407 | |
408 // Delete uncommitted entries. | |
409 recent_changes_.Clear(); | |
410 | |
411 // Close all open databases. | |
412 db_cache_.Clear(); | |
413 | |
414 // Now go through and delete all the files. | |
415 for (DBIdentSet::iterator i = present_databases_.begin(); | |
416 i != present_databases_.end(); ++i) { | |
417 base::FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i)); | |
418 file_util::Delete(file_name, false); | |
419 } | |
420 } | |
421 | |
422 void TextDatabaseManager::OptimizeChangedDatabases( | |
423 const ChangeSet& change_set) { | |
424 for (ChangeSet::DBSet::const_iterator i = | |
425 change_set.changed_databases_.begin(); | |
426 i != change_set.changed_databases_.end(); ++i) { | |
427 // We want to open the database for writing, but only if it exists. To | |
428 // achieve this, we check whether it exists by saying we're not going to | |
429 // write to it (avoiding the autocreation code normally called when writing) | |
430 // and then access it for writing only if it succeeds. | |
431 TextDatabase* db = GetDB(*i, false); | |
432 if (!db) | |
433 continue; | |
434 db = GetDB(*i, true); | |
435 if (!db) | |
436 continue; // The file may have changed or something. | |
437 db->Optimize(); | |
438 } | |
439 } | |
440 | |
// Runs the full-text query over the per-month databases, newest first,
// filling |results| until options.EffectiveMaxCount() is reached or the time
// range is exhausted. |first_time_searched| receives the earliest time
// actually covered by the search (options.begin_time if everything was
// searched, or the time of the oldest result returned when we stopped early).
void TextDatabaseManager::GetTextMatches(
    const string16& query,
    const QueryOptions& options,
    std::vector<TextDatabase::Match>* results,
    Time* first_time_searched) {
  results->clear();

  // Default: we covered the whole requested range.
  *first_time_searched = options.begin_time;

  InitDBList();
  if (present_databases_.empty())
    return;  // Nothing to search.

  // Get the query into the proper format for the individual DBs.
  string16 fts_query16;
  query_parser_.ParseQuery(query, &fts_query16);
  std::string fts_query = UTF16ToUTF8(fts_query16);

  // Need a copy of the options so we can modify the max count for each call
  // to the individual databases.
  QueryOptions cur_options(options);

  // Compute the minimum and maximum values for the identifiers that could
  // encompass the input time range. A null bound falls back to the
  // oldest/newest database present on disk.
  TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
      *present_databases_.begin() :
      TimeToID(options.begin_time);
  TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
      *present_databases_.rbegin() :
      TimeToID(options.end_time);

  // Iterate over the databases from the most recent backwards.
  TextDatabase::URLSet found_urls;
  for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
       i != present_databases_.rend();
       ++i) {
    // TODO(brettw) allow canceling the query in the middle.
    // if (canceled_or_something)
    //   break;

    // This code is stupid, we just loop until we find the correct starting
    // time range rather than search in an intelligent way. Users will have a
    // few dozen files at most, so this should not be an issue.
    if (*i > max_ident)
      continue;  // Haven't gotten to the time range yet.
    if (*i < min_ident)
      break;  // Covered all the time range.

    // Read-only open; skip databases that fail to open.
    TextDatabase* cur_db = GetDB(*i, false);
    if (!cur_db)
      continue;

    // Adjust the max count according to how many results we've already got.
    if (options.max_count) {
      cur_options.max_count = options.max_count -
          static_cast<int>(results->size());
    }

    bool has_more_results = cur_db->GetTextMatches(
        fts_query, cur_options, results, &found_urls);

    DCHECK(static_cast<int>(results->size()) <= options.EffectiveMaxCount());

    if (has_more_results ||
        static_cast<int>(results->size()) == options.EffectiveMaxCount()) {
      // Since the search proceeds backwards in time, the last result we have
      // gives the first time searched.
      *first_time_searched = results->back().time;
      break;
    }
  }
}
513 | |
// Test-only accessor: number of pages waiting in memory to be committed.
size_t TextDatabaseManager::GetUncommittedEntryCountForTest() const {
  return recent_changes_.size();
}
517 | |
// Returns the database for |id|, opening (and, if |for_writing|, creating)
// it as needed and caching the connection. Returns NULL if the database
// can't be initialized. While a transaction is active, any database handed
// out for writing is enrolled in |open_transactions_| so CommitTransaction()
// can commit it, and cache eviction is deferred until the transaction ends.
TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
                                         bool for_writing) {
  DBCache::iterator found_db = db_cache_.Get(id);
  if (found_db != db_cache_.end()) {
    if (transaction_nesting_ && for_writing &&
        open_transactions_.find(id) == open_transactions_.end()) {
      // If we currently have an open transaction, that database is not yet
      // part of the transaction, and the database will be written to, it needs
      // to be part of our transaction.
      found_db->second->BeginTransaction();
      open_transactions_.insert(id);
    }
    return found_db->second;
  }

  // Need to make the database.
  TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
  if (!new_db->Init()) {
    delete new_db;
    return NULL;
  }
  // The cache owns the TextDatabase from here on.
  db_cache_.Put(id, new_db);
  present_databases_.insert(id);

  if (transaction_nesting_ && for_writing) {
    // If we currently have an open transaction and the new database will be
    // written to, it needs to be part of our transaction.
    new_db->BeginTransaction();
    open_transactions_.insert(id);
  }

  // When no transaction is open, allow this new one to kick out an old one.
  if (!transaction_nesting_)
    db_cache_.ShrinkToSize(kCacheDBSize);

  return new_db;
}
555 | |
// Convenience wrapper: returns the database for the month containing |time|.
TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
                                                bool create_if_necessary) {
  return GetDB(TimeToID(time), create_if_necessary);
}
560 | |
// (Re)schedules a FlushOldChanges() run kExpirationSeconds from now.
void TextDatabaseManager::ScheduleFlushOldChanges() {
  // Invalidate outstanding weak pointers first so any previously posted
  // flush task is effectively canceled — only one flush is ever pending.
  weak_factory_.InvalidateWeakPtrs();
  base::MessageLoop::current()->PostDelayedTask(
      FROM_HERE,
      base::Bind(&TextDatabaseManager::FlushOldChanges,
                 weak_factory_.GetWeakPtr()),
      base::TimeDelta::FromSeconds(kExpirationSeconds));
}
569 | |
// Timer callback: flush everything that has expired as of now.
void TextDatabaseManager::FlushOldChanges() {
  FlushOldChangesForTime(TimeTicks::Now());
}
573 | |
// Commits every pending entry that has expired as of |now| (oldest first),
// then schedules the next flush. |now| is a parameter so tests can inject a
// time.
void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
  // The end of the list is the oldest, so we just start from there committing
  // things until we get something too new.
  RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
  while (i != recent_changes_.rend() && i->second.Expired(now)) {
    // Commit with whatever title/body we have (possibly empty).
    AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
                i->second.visit_time(), i->second.title(), i->second.body());
    i = recent_changes_.Erase(i);
  }

  ScheduleFlushOldChanges();
}
586 | |
587 } // namespace history | |
OLD | NEW |