| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include <limits> | |
| 6 #include <set> | |
| 7 #include <string> | |
| 8 | |
| 9 #include "chrome/browser/history/text_database.h" | |
| 10 | |
| 11 #include "base/file_util.h" | |
| 12 #include "base/logging.h" | |
| 13 #include "base/metrics/histogram.h" | |
| 14 #include "base/strings/string_number_conversions.h" | |
| 15 #include "base/strings/stringprintf.h" | |
| 16 #include "base/strings/utf_string_conversions.h" | |
| 17 #include "chrome/browser/diagnostics/sqlite_diagnostics.h" | |
| 18 #include "sql/statement.h" | |
| 19 #include "sql/transaction.h" | |
| 20 | |
| 21 // There are two tables in each database, one full-text search (FTS) table which | |
| 22 // indexes the contents and title of the pages. The other is a regular SQLITE | |
| 23 // table which contains non-indexed information about the page. All columns of | |
| 24 // a FTS table are indexed using the text search algorithm, which isn't what we | |
| 25 // want for things like times. If this were in the FTS table, there would be | |
| 26 // different words in the index for each time number. | |
| 27 // | |
| 28 // "pages" FTS table: | |
| 29 // url URL of the page so searches will match the URL. | |
| 30 // title Title of the page. | |
| 31 // body Body of the page. | |
| 32 // | |
| 33 // "info" regular table: | |
| 34 // time Time the corresponding FTS entry was visited. | |
| 35 // | |
| 36 // We do joins across these two tables by using their internal rowids, which we | |
| 37 // keep in sync between the two tables. The internal rowid is the only part of | |
| 38 // an FTS table that is indexed like a normal table, and the index over it is | |
| 39 // free since sqlite always indexes the internal rowid. | |
| 40 | |
| 41 namespace history { | |
| 42 | |
| 43 namespace { | |
| 44 | |
| 45 // Version 1 uses FTS2 for index files. | |
| 46 // Version 2 uses FTS3. | |
| 47 static const int kCurrentVersionNumber = 2; | |
| 48 static const int kCompatibleVersionNumber = 2; | |
| 49 | |
| 50 // Snippet computation relies on the index of the columns in the original | |
| 51 // create statement. These are the 0-based indices (as strings) of the | |
| 52 // corresponding columns. | |
| 53 const char kTitleColumnIndex[] = "1"; | |
| 54 const char kBodyColumnIndex[] = "2"; | |
| 55 | |
| 56 // The string prepended to the database identifier to generate the filename. | |
| 57 const base::FilePath::CharType kFilePrefix[] = | |
| 58 FILE_PATH_LITERAL("History Index "); | |
| 59 | |
| 60 } // namespace | |
| 61 | |
| 62 TextDatabase::Match::Match() {} | |
| 63 | |
| 64 TextDatabase::Match::~Match() {} | |
| 65 | |
| 66 TextDatabase::TextDatabase(const base::FilePath& path, | |
| 67 DBIdent id, | |
| 68 bool allow_create) | |
| 69 : path_(path), | |
| 70 ident_(id), | |
| 71 allow_create_(allow_create) { | |
| 72 // Compute the file name. | |
| 73 file_name_ = path_.Append(IDToFileName(ident_)); | |
| 74 } | |
| 75 | |
| 76 TextDatabase::~TextDatabase() { | |
| 77 } | |
| 78 | |
| 79 // static | |
| 80 const base::FilePath::CharType* TextDatabase::file_base() { | |
| 81 return kFilePrefix; | |
| 82 } | |
| 83 | |
| 84 // static | |
| 85 base::FilePath TextDatabase::IDToFileName(DBIdent id) { | |
| 86 // Identifiers are intended to be a combination of the year and month, for | |
| 87 // example, 200801 for January 2008. We convert this to | |
| 88 // "History Index 2008-01". However, we don't make assumptions about this | |
| 89 // scheme: the caller should assign IDs as it feels fit with the knowledge | |
| 90 // that they will apppear on disk in this form. | |
| 91 base::FilePath::StringType filename(file_base()); | |
| 92 base::StringAppendF(&filename, FILE_PATH_LITERAL("%d-%02d"), | |
| 93 id / 100, id % 100); | |
| 94 return base::FilePath(filename); | |
| 95 } | |
| 96 | |
| 97 // static | |
| 98 TextDatabase::DBIdent TextDatabase::FileNameToID( | |
| 99 const base::FilePath& file_path) { | |
| 100 base::FilePath::StringType file_name = file_path.BaseName().value(); | |
| 101 | |
| 102 // We don't actually check the prefix here. Since the file system could | |
| 103 // be case insensitive in ways we can't predict (NTFS), checking could | |
| 104 // potentially be the wrong thing to do. Instead, we just look for a suffix. | |
| 105 static const size_t kIDStringLength = 7; // Room for "xxxx-xx". | |
| 106 if (file_name.length() < kIDStringLength) | |
| 107 return 0; | |
| 108 const base::FilePath::StringType suffix( | |
| 109 &file_name[file_name.length() - kIDStringLength]); | |
| 110 | |
| 111 if (suffix.length() != kIDStringLength || | |
| 112 suffix[4] != FILE_PATH_LITERAL('-')) { | |
| 113 return 0; | |
| 114 } | |
| 115 | |
| 116 // TODO: Once StringPiece supports a templated interface over the | |
| 117 // underlying string type, use it here instead of substr, since that | |
| 118 // will avoid needless string copies. StringPiece cannot be used | |
| 119 // right now because base::FilePath::StringType could use either 8 or 16 bit | |
| 120 // characters, depending on the OS. | |
| 121 int year, month; | |
| 122 base::StringToInt(suffix.substr(0, 4), &year); | |
| 123 base::StringToInt(suffix.substr(5, 2), &month); | |
| 124 | |
| 125 return year * 100 + month; | |
| 126 } | |
| 127 | |
| 128 bool TextDatabase::Init() { | |
| 129 // Make sure, if we're not allowed to create the file, that it exists. | |
| 130 if (!allow_create_) { | |
| 131 if (!file_util::PathExists(file_name_)) | |
| 132 return false; | |
| 133 } | |
| 134 | |
| 135 db_.set_histogram_tag("Text"); | |
| 136 | |
| 137 // Set the database page size to something a little larger to give us | |
| 138 // better performance (we're typically seek rather than bandwidth limited). | |
| 139 // This only has an effect before any tables have been created, otherwise | |
| 140 // this is a NOP. Must be a power of 2 and a max of 8192. | |
| 141 db_.set_page_size(4096); | |
| 142 | |
| 143 // The default cache size is 2000 which give >8MB of data. Since we will often | |
| 144 // have 2-3 of these objects, each with their own 8MB, this adds up very fast. | |
| 145 // We therefore reduce the size so when there are multiple objects, we're not | |
| 146 // too big. | |
| 147 db_.set_cache_size(512); | |
| 148 | |
| 149 // Run the database in exclusive mode. Nobody else should be accessing the | |
| 150 // database while we're running, and this will give somewhat improved perf. | |
| 151 db_.set_exclusive_locking(); | |
| 152 | |
| 153 // Attach the database to our index file. | |
| 154 if (!db_.Open(file_name_)) | |
| 155 return false; | |
| 156 | |
| 157 // Meta table tracking version information. | |
| 158 if (!meta_table_.Init(&db_, kCurrentVersionNumber, kCompatibleVersionNumber)) | |
| 159 return false; | |
| 160 if (meta_table_.GetCompatibleVersionNumber() > kCurrentVersionNumber) { | |
| 161 // This version is too new. We don't bother notifying the user on this | |
| 162 // error, and just fail to use the file. Normally if they have version skew, | |
| 163 // they will get it for the main history file and it won't be necessary | |
| 164 // here. If that's not the case, since this is only indexed data, it's | |
| 165 // probably better to just not give FTS results than strange errors when | |
| 166 // everything else is working OK. | |
| 167 LOG(WARNING) << "Text database is too new."; | |
| 168 return false; | |
| 169 } | |
| 170 | |
| 171 return CreateTables(); | |
| 172 } | |
| 173 | |
| 174 void TextDatabase::BeginTransaction() { | |
| 175 db_.BeginTransaction(); | |
| 176 } | |
| 177 | |
| 178 void TextDatabase::CommitTransaction() { | |
| 179 db_.CommitTransaction(); | |
| 180 } | |
| 181 | |
| 182 bool TextDatabase::CreateTables() { | |
| 183 // FTS table of page contents. | |
| 184 if (!db_.DoesTableExist("pages")) { | |
| 185 if (!db_.Execute("CREATE VIRTUAL TABLE pages USING fts3(" | |
| 186 "TOKENIZE icu," | |
| 187 "url LONGVARCHAR," | |
| 188 "title LONGVARCHAR," | |
| 189 "body LONGVARCHAR)")) | |
| 190 return false; | |
| 191 } | |
| 192 | |
| 193 // Non-FTS table containing URLs and times so we can efficiently find them | |
| 194 // using a regular index (all FTS columns are special and are treated as | |
| 195 // full-text-search, which is not what we want when retrieving this data). | |
| 196 if (!db_.DoesTableExist("info")) { | |
| 197 // Note that there is no point in creating an index over time. Since | |
| 198 // we must always query the entire FTS table (it can not efficiently do | |
| 199 // subsets), we will always end up doing that first, and joining the info | |
| 200 // table off of that. | |
| 201 if (!db_.Execute("CREATE TABLE info(time INTEGER NOT NULL)")) | |
| 202 return false; | |
| 203 } | |
| 204 | |
| 205 // Create the index. | |
| 206 return db_.Execute("CREATE INDEX IF NOT EXISTS info_time ON info(time)"); | |
| 207 } | |
| 208 | |
| 209 bool TextDatabase::AddPageData(base::Time time, | |
| 210 const std::string& url, | |
| 211 const std::string& title, | |
| 212 const std::string& contents) { | |
| 213 sql::Transaction committer(&db_); | |
| 214 if (!committer.Begin()) | |
| 215 return false; | |
| 216 | |
| 217 // Add to the pages table. | |
| 218 sql::Statement add_to_pages(db_.GetCachedStatement(SQL_FROM_HERE, | |
| 219 "INSERT INTO pages (url, title, body) VALUES (?,?,?)")); | |
| 220 add_to_pages.BindString(0, url); | |
| 221 add_to_pages.BindString(1, title); | |
| 222 add_to_pages.BindString(2, contents); | |
| 223 if (!add_to_pages.Run()) | |
| 224 return false; | |
| 225 | |
| 226 int64 rowid = db_.GetLastInsertRowId(); | |
| 227 | |
| 228 // Add to the info table with the same rowid. | |
| 229 sql::Statement add_to_info(db_.GetCachedStatement(SQL_FROM_HERE, | |
| 230 "INSERT INTO info (rowid, time) VALUES (?,?)")); | |
| 231 add_to_info.BindInt64(0, rowid); | |
| 232 add_to_info.BindInt64(1, time.ToInternalValue()); | |
| 233 | |
| 234 if (!add_to_info.Run()) | |
| 235 return false; | |
| 236 | |
| 237 return committer.Commit(); | |
| 238 } | |
| 239 | |
| 240 void TextDatabase::DeletePageData(base::Time time, const std::string& url) { | |
| 241 // First get all rows that match. Selecing on time (which has an index) allows | |
| 242 // us to avoid brute-force searches on the full-text-index table (there will | |
| 243 // generally be only one match per time). | |
| 244 sql::Statement select_ids(db_.GetCachedStatement(SQL_FROM_HERE, | |
| 245 "SELECT info.rowid " | |
| 246 "FROM info JOIN pages ON info.rowid = pages.rowid " | |
| 247 "WHERE info.time=? AND pages.url=?")); | |
| 248 select_ids.BindInt64(0, time.ToInternalValue()); | |
| 249 select_ids.BindString(1, url); | |
| 250 | |
| 251 std::set<int64> rows_to_delete; | |
| 252 while (select_ids.Step()) | |
| 253 rows_to_delete.insert(select_ids.ColumnInt64(0)); | |
| 254 | |
| 255 // Delete from the pages table. | |
| 256 sql::Statement delete_page(db_.GetCachedStatement(SQL_FROM_HERE, | |
| 257 "DELETE FROM pages WHERE rowid=?")); | |
| 258 | |
| 259 for (std::set<int64>::const_iterator i = rows_to_delete.begin(); | |
| 260 i != rows_to_delete.end(); ++i) { | |
| 261 delete_page.BindInt64(0, *i); | |
| 262 if (!delete_page.Run()) | |
| 263 return; | |
| 264 delete_page.Reset(true); | |
| 265 } | |
| 266 | |
| 267 // Delete from the info table. | |
| 268 sql::Statement delete_info(db_.GetCachedStatement(SQL_FROM_HERE, | |
| 269 "DELETE FROM info WHERE rowid=?")); | |
| 270 | |
| 271 for (std::set<int64>::const_iterator i = rows_to_delete.begin(); | |
| 272 i != rows_to_delete.end(); ++i) { | |
| 273 delete_info.BindInt64(0, *i); | |
| 274 if (!delete_info.Run()) | |
| 275 return; | |
| 276 delete_info.Reset(true); | |
| 277 } | |
| 278 } | |
| 279 | |
| 280 void TextDatabase::Optimize() { | |
| 281 sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE, | |
| 282 "SELECT OPTIMIZE(pages) FROM pages LIMIT 1")); | |
| 283 statement.Run(); | |
| 284 } | |
| 285 | |
| 286 bool TextDatabase::GetTextMatches(const std::string& query, | |
| 287 const QueryOptions& options, | |
| 288 std::vector<Match>* results, | |
| 289 URLSet* found_urls) { | |
| 290 std::string sql = "SELECT url, title, time, offsets(pages), body FROM pages " | |
| 291 "LEFT OUTER JOIN info ON pages.rowid = info.rowid WHERE "; | |
| 292 sql += options.body_only ? "body " : "pages "; | |
| 293 sql += "MATCH ? AND time >= ? AND time < ? "; | |
| 294 // Times may not be unique, so also sort by rowid to ensure a stable order. | |
| 295 sql += "ORDER BY time DESC, info.rowid DESC"; | |
| 296 | |
| 297 // Generate unique IDs for the two possible variations of the statement, | |
| 298 // so they don't share the same cached prepared statement. | |
| 299 sql::StatementID body_only_id = SQL_FROM_HERE; | |
| 300 sql::StatementID pages_id = SQL_FROM_HERE; | |
| 301 | |
| 302 sql::Statement statement(db_.GetCachedStatement( | |
| 303 (options.body_only ? body_only_id : pages_id), sql.c_str())); | |
| 304 | |
| 305 statement.BindString(0, query); | |
| 306 statement.BindInt64(1, options.EffectiveBeginTime()); | |
| 307 statement.BindInt64(2, options.EffectiveEndTime()); | |
| 308 | |
| 309 // |results| may not be initially empty, so keep track of how many were added | |
| 310 // by this call. | |
| 311 int result_count = 0; | |
| 312 | |
| 313 while (statement.Step()) { | |
| 314 // TODO(brettw) allow canceling the query in the middle. | |
| 315 // if (canceled_or_something) | |
| 316 // break; | |
| 317 | |
| 318 GURL url(statement.ColumnString(0)); | |
| 319 URLSet::const_iterator found_url = found_urls->find(url); | |
| 320 if (found_url != found_urls->end()) | |
| 321 continue; // Don't add this duplicate. | |
| 322 | |
| 323 if (++result_count > options.EffectiveMaxCount()) | |
| 324 break; | |
| 325 | |
| 326 // Fill the results into the vector (avoid copying the URL with Swap()). | |
| 327 results->resize(results->size() + 1); | |
| 328 Match& match = results->at(results->size() - 1); | |
| 329 match.url.Swap(&url); | |
| 330 | |
| 331 match.title = statement.ColumnString16(1); | |
| 332 match.time = base::Time::FromInternalValue(statement.ColumnInt64(2)); | |
| 333 | |
| 334 // Extract any matches in the title. | |
| 335 std::string offsets_str = statement.ColumnString(3); | |
| 336 Snippet::ExtractMatchPositions(offsets_str, kTitleColumnIndex, | |
| 337 &match.title_match_positions); | |
| 338 Snippet::ConvertMatchPositionsToWide(statement.ColumnString(1), | |
| 339 &match.title_match_positions); | |
| 340 | |
| 341 // Extract the matches in the body. | |
| 342 Snippet::MatchPositions match_positions; | |
| 343 Snippet::ExtractMatchPositions(offsets_str, kBodyColumnIndex, | |
| 344 &match_positions); | |
| 345 | |
| 346 // Compute the snippet based on those matches. | |
| 347 std::string body = statement.ColumnString(4); | |
| 348 match.snippet.ComputeSnippet(match_positions, body); | |
| 349 } | |
| 350 statement.Reset(true); | |
| 351 return result_count > options.EffectiveMaxCount(); | |
| 352 } | |
| 353 | |
| 354 } // namespace history | |
| OLD | NEW |