Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(217)

Side by Side Diff: chrome/browser/history/text_database.cc

Issue 16951015: Remove TextDatabase from the history service. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@replace_fts
Patch Set: Created 7 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <limits>
6 #include <set>
7 #include <string>
8
9 #include "chrome/browser/history/text_database.h"
10
11 #include "base/file_util.h"
12 #include "base/logging.h"
13 #include "base/metrics/histogram.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "base/strings/stringprintf.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "chrome/browser/diagnostics/sqlite_diagnostics.h"
18 #include "sql/statement.h"
19 #include "sql/transaction.h"
20
21 // There are two tables in each database, one full-text search (FTS) table which
22 // indexes the contents and title of the pages. The other is a regular SQLITE
23 // table which contains non-indexed information about the page. All columns of
24 // a FTS table are indexed using the text search algorithm, which isn't what we
25 // want for things like times. If this were in the FTS table, there would be
26 // different words in the index for each time number.
27 //
28 // "pages" FTS table:
29 // url URL of the page so searches will match the URL.
30 // title Title of the page.
31 // body Body of the page.
32 //
33 // "info" regular table:
34 // time Time the corresponding FTS entry was visited.
35 //
36 // We do joins across these two tables by using their internal rowids, which we
37 // keep in sync between the two tables. The internal rowid is the only part of
38 // an FTS table that is indexed like a normal table, and the index over it is
39 // free since sqlite always indexes the internal rowid.
40
41 namespace history {
42
43 namespace {
44
45 // Version 1 uses FTS2 for index files.
46 // Version 2 uses FTS3.
47 static const int kCurrentVersionNumber = 2;
48 static const int kCompatibleVersionNumber = 2;
49
50 // Snippet computation relies on the index of the columns in the original
51 // create statement. These are the 0-based indices (as strings) of the
52 // corresponding columns.
53 const char kTitleColumnIndex[] = "1";
54 const char kBodyColumnIndex[] = "2";
55
56 // The string prepended to the database identifier to generate the filename.
57 const base::FilePath::CharType kFilePrefix[] =
58 FILE_PATH_LITERAL("History Index ");
59
60 } // namespace
61
62 TextDatabase::Match::Match() {}
63
64 TextDatabase::Match::~Match() {}
65
66 TextDatabase::TextDatabase(const base::FilePath& path,
67 DBIdent id,
68 bool allow_create)
69 : path_(path),
70 ident_(id),
71 allow_create_(allow_create) {
72 // Compute the file name.
73 file_name_ = path_.Append(IDToFileName(ident_));
74 }
75
76 TextDatabase::~TextDatabase() {
77 }
78
79 // static
80 const base::FilePath::CharType* TextDatabase::file_base() {
81 return kFilePrefix;
82 }
83
84 // static
85 base::FilePath TextDatabase::IDToFileName(DBIdent id) {
86 // Identifiers are intended to be a combination of the year and month, for
87 // example, 200801 for January 2008. We convert this to
88 // "History Index 2008-01". However, we don't make assumptions about this
89 // scheme: the caller should assign IDs as it feels fit with the knowledge
90 // that they will apppear on disk in this form.
91 base::FilePath::StringType filename(file_base());
92 base::StringAppendF(&filename, FILE_PATH_LITERAL("%d-%02d"),
93 id / 100, id % 100);
94 return base::FilePath(filename);
95 }
96
97 // static
98 TextDatabase::DBIdent TextDatabase::FileNameToID(
99 const base::FilePath& file_path) {
100 base::FilePath::StringType file_name = file_path.BaseName().value();
101
102 // We don't actually check the prefix here. Since the file system could
103 // be case insensitive in ways we can't predict (NTFS), checking could
104 // potentially be the wrong thing to do. Instead, we just look for a suffix.
105 static const size_t kIDStringLength = 7; // Room for "xxxx-xx".
106 if (file_name.length() < kIDStringLength)
107 return 0;
108 const base::FilePath::StringType suffix(
109 &file_name[file_name.length() - kIDStringLength]);
110
111 if (suffix.length() != kIDStringLength ||
112 suffix[4] != FILE_PATH_LITERAL('-')) {
113 return 0;
114 }
115
116 // TODO: Once StringPiece supports a templated interface over the
117 // underlying string type, use it here instead of substr, since that
118 // will avoid needless string copies. StringPiece cannot be used
119 // right now because base::FilePath::StringType could use either 8 or 16 bit
120 // characters, depending on the OS.
121 int year, month;
122 base::StringToInt(suffix.substr(0, 4), &year);
123 base::StringToInt(suffix.substr(5, 2), &month);
124
125 return year * 100 + month;
126 }
127
128 bool TextDatabase::Init() {
129 // Make sure, if we're not allowed to create the file, that it exists.
130 if (!allow_create_) {
131 if (!file_util::PathExists(file_name_))
132 return false;
133 }
134
135 db_.set_histogram_tag("Text");
136
137 // Set the database page size to something a little larger to give us
138 // better performance (we're typically seek rather than bandwidth limited).
139 // This only has an effect before any tables have been created, otherwise
140 // this is a NOP. Must be a power of 2 and a max of 8192.
141 db_.set_page_size(4096);
142
143 // The default cache size is 2000 which give >8MB of data. Since we will often
144 // have 2-3 of these objects, each with their own 8MB, this adds up very fast.
145 // We therefore reduce the size so when there are multiple objects, we're not
146 // too big.
147 db_.set_cache_size(512);
148
149 // Run the database in exclusive mode. Nobody else should be accessing the
150 // database while we're running, and this will give somewhat improved perf.
151 db_.set_exclusive_locking();
152
153 // Attach the database to our index file.
154 if (!db_.Open(file_name_))
155 return false;
156
157 // Meta table tracking version information.
158 if (!meta_table_.Init(&db_, kCurrentVersionNumber, kCompatibleVersionNumber))
159 return false;
160 if (meta_table_.GetCompatibleVersionNumber() > kCurrentVersionNumber) {
161 // This version is too new. We don't bother notifying the user on this
162 // error, and just fail to use the file. Normally if they have version skew,
163 // they will get it for the main history file and it won't be necessary
164 // here. If that's not the case, since this is only indexed data, it's
165 // probably better to just not give FTS results than strange errors when
166 // everything else is working OK.
167 LOG(WARNING) << "Text database is too new.";
168 return false;
169 }
170
171 return CreateTables();
172 }
173
174 void TextDatabase::BeginTransaction() {
175 db_.BeginTransaction();
176 }
177
178 void TextDatabase::CommitTransaction() {
179 db_.CommitTransaction();
180 }
181
182 bool TextDatabase::CreateTables() {
183 // FTS table of page contents.
184 if (!db_.DoesTableExist("pages")) {
185 if (!db_.Execute("CREATE VIRTUAL TABLE pages USING fts3("
186 "TOKENIZE icu,"
187 "url LONGVARCHAR,"
188 "title LONGVARCHAR,"
189 "body LONGVARCHAR)"))
190 return false;
191 }
192
193 // Non-FTS table containing URLs and times so we can efficiently find them
194 // using a regular index (all FTS columns are special and are treated as
195 // full-text-search, which is not what we want when retrieving this data).
196 if (!db_.DoesTableExist("info")) {
197 // Note that there is no point in creating an index over time. Since
198 // we must always query the entire FTS table (it can not efficiently do
199 // subsets), we will always end up doing that first, and joining the info
200 // table off of that.
201 if (!db_.Execute("CREATE TABLE info(time INTEGER NOT NULL)"))
202 return false;
203 }
204
205 // Create the index.
206 return db_.Execute("CREATE INDEX IF NOT EXISTS info_time ON info(time)");
207 }
208
209 bool TextDatabase::AddPageData(base::Time time,
210 const std::string& url,
211 const std::string& title,
212 const std::string& contents) {
213 sql::Transaction committer(&db_);
214 if (!committer.Begin())
215 return false;
216
217 // Add to the pages table.
218 sql::Statement add_to_pages(db_.GetCachedStatement(SQL_FROM_HERE,
219 "INSERT INTO pages (url, title, body) VALUES (?,?,?)"));
220 add_to_pages.BindString(0, url);
221 add_to_pages.BindString(1, title);
222 add_to_pages.BindString(2, contents);
223 if (!add_to_pages.Run())
224 return false;
225
226 int64 rowid = db_.GetLastInsertRowId();
227
228 // Add to the info table with the same rowid.
229 sql::Statement add_to_info(db_.GetCachedStatement(SQL_FROM_HERE,
230 "INSERT INTO info (rowid, time) VALUES (?,?)"));
231 add_to_info.BindInt64(0, rowid);
232 add_to_info.BindInt64(1, time.ToInternalValue());
233
234 if (!add_to_info.Run())
235 return false;
236
237 return committer.Commit();
238 }
239
240 void TextDatabase::DeletePageData(base::Time time, const std::string& url) {
241 // First get all rows that match. Selecing on time (which has an index) allows
242 // us to avoid brute-force searches on the full-text-index table (there will
243 // generally be only one match per time).
244 sql::Statement select_ids(db_.GetCachedStatement(SQL_FROM_HERE,
245 "SELECT info.rowid "
246 "FROM info JOIN pages ON info.rowid = pages.rowid "
247 "WHERE info.time=? AND pages.url=?"));
248 select_ids.BindInt64(0, time.ToInternalValue());
249 select_ids.BindString(1, url);
250
251 std::set<int64> rows_to_delete;
252 while (select_ids.Step())
253 rows_to_delete.insert(select_ids.ColumnInt64(0));
254
255 // Delete from the pages table.
256 sql::Statement delete_page(db_.GetCachedStatement(SQL_FROM_HERE,
257 "DELETE FROM pages WHERE rowid=?"));
258
259 for (std::set<int64>::const_iterator i = rows_to_delete.begin();
260 i != rows_to_delete.end(); ++i) {
261 delete_page.BindInt64(0, *i);
262 if (!delete_page.Run())
263 return;
264 delete_page.Reset(true);
265 }
266
267 // Delete from the info table.
268 sql::Statement delete_info(db_.GetCachedStatement(SQL_FROM_HERE,
269 "DELETE FROM info WHERE rowid=?"));
270
271 for (std::set<int64>::const_iterator i = rows_to_delete.begin();
272 i != rows_to_delete.end(); ++i) {
273 delete_info.BindInt64(0, *i);
274 if (!delete_info.Run())
275 return;
276 delete_info.Reset(true);
277 }
278 }
279
280 void TextDatabase::Optimize() {
281 sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE,
282 "SELECT OPTIMIZE(pages) FROM pages LIMIT 1"));
283 statement.Run();
284 }
285
286 bool TextDatabase::GetTextMatches(const std::string& query,
287 const QueryOptions& options,
288 std::vector<Match>* results,
289 URLSet* found_urls) {
290 std::string sql = "SELECT url, title, time, offsets(pages), body FROM pages "
291 "LEFT OUTER JOIN info ON pages.rowid = info.rowid WHERE ";
292 sql += options.body_only ? "body " : "pages ";
293 sql += "MATCH ? AND time >= ? AND time < ? ";
294 // Times may not be unique, so also sort by rowid to ensure a stable order.
295 sql += "ORDER BY time DESC, info.rowid DESC";
296
297 // Generate unique IDs for the two possible variations of the statement,
298 // so they don't share the same cached prepared statement.
299 sql::StatementID body_only_id = SQL_FROM_HERE;
300 sql::StatementID pages_id = SQL_FROM_HERE;
301
302 sql::Statement statement(db_.GetCachedStatement(
303 (options.body_only ? body_only_id : pages_id), sql.c_str()));
304
305 statement.BindString(0, query);
306 statement.BindInt64(1, options.EffectiveBeginTime());
307 statement.BindInt64(2, options.EffectiveEndTime());
308
309 // |results| may not be initially empty, so keep track of how many were added
310 // by this call.
311 int result_count = 0;
312
313 while (statement.Step()) {
314 // TODO(brettw) allow canceling the query in the middle.
315 // if (canceled_or_something)
316 // break;
317
318 GURL url(statement.ColumnString(0));
319 URLSet::const_iterator found_url = found_urls->find(url);
320 if (found_url != found_urls->end())
321 continue; // Don't add this duplicate.
322
323 if (++result_count > options.EffectiveMaxCount())
324 break;
325
326 // Fill the results into the vector (avoid copying the URL with Swap()).
327 results->resize(results->size() + 1);
328 Match& match = results->at(results->size() - 1);
329 match.url.Swap(&url);
330
331 match.title = statement.ColumnString16(1);
332 match.time = base::Time::FromInternalValue(statement.ColumnInt64(2));
333
334 // Extract any matches in the title.
335 std::string offsets_str = statement.ColumnString(3);
336 Snippet::ExtractMatchPositions(offsets_str, kTitleColumnIndex,
337 &match.title_match_positions);
338 Snippet::ConvertMatchPositionsToWide(statement.ColumnString(1),
339 &match.title_match_positions);
340
341 // Extract the matches in the body.
342 Snippet::MatchPositions match_positions;
343 Snippet::ExtractMatchPositions(offsets_str, kBodyColumnIndex,
344 &match_positions);
345
346 // Compute the snippet based on those matches.
347 std::string body = statement.ColumnString(4);
348 match.snippet.ComputeSnippet(match_positions, body);
349 }
350 statement.Reset(true);
351 return result_count > options.EffectiveMaxCount();
352 }
353
354 } // namespace history
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698