chrome/browser/history/text_database.cc - Issue 16951015: Remove TextDatabase from the history service.

Side by Side Diff: chrome/browser/history/text_database.cc

Issue 16951015: Remove TextDatabase from the history service. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@replace_fts

Patch Set: Created 7 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« chrome/browser/history/history_backend.cc ('K') | « chrome/browser/history/text_database.h ('k') | chrome/browser/history/text_database_manager.h » ('j') | chrome/browser/history/visit_database.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include <limits>

6 #include <set>

7 #include <string>

8

9 #include "chrome/browser/history/text_database.h"

10

11 #include "base/file_util.h"

12 #include "base/logging.h"

13 #include "base/metrics/histogram.h"

14 #include "base/strings/string_number_conversions.h"

15 #include "base/strings/stringprintf.h"

16 #include "base/strings/utf_string_conversions.h"

17 #include "chrome/browser/diagnostics/sqlite_diagnostics.h"

18 #include "sql/statement.h"

19 #include "sql/transaction.h"

20

21 // There are two tables in each database, one full-text search (FTS) table which

22 // indexes the contents and title of the pages. The other is a regular SQLITE

23 // table which contains non-indexed information about the page. All columns of

24 // a FTS table are indexed using the text search algorithm, which isn't what we

25 // want for things like times. If this were in the FTS table, there would be

26 // different words in the index for each time number.

27 //

28 // "pages" FTS table:

29 // url URL of the page so searches will match the URL.

30 // title Title of the page.

31 // body Body of the page.

32 //

33 // "info" regular table:

34 // time Time the corresponding FTS entry was visited.

35 //

36 // We do joins across these two tables by using their internal rowids, which we

37 // keep in sync between the two tables. The internal rowid is the only part of

38 // an FTS table that is indexed like a normal table, and the index over it is

39 // free since sqlite always indexes the internal rowid.

40

41 namespace history {

42

43 namespace {

44

45 // Version 1 uses FTS2 for index files.

46 // Version 2 uses FTS3.

47 static const int kCurrentVersionNumber = 2;

48 static const int kCompatibleVersionNumber = 2;

49

50 // Snippet computation relies on the index of the columns in the original

51 // create statement. These are the 0-based indices (as strings) of the

52 // corresponding columns.

53 const char kTitleColumnIndex[] = "1";

54 const char kBodyColumnIndex[] = "2";

55

56 // The string prepended to the database identifier to generate the filename.

57 const base::FilePath::CharType kFilePrefix[] =

58 FILE_PATH_LITERAL("History Index ");

59

60 } // namespace

61

62 TextDatabase::Match::Match() {}

63

64 TextDatabase::Match::~Match() {}

65

66 TextDatabase::TextDatabase(const base::FilePath& path,

67 DBIdent id,

68 bool allow_create)

69 : path_(path),

70 ident_(id),

71 allow_create_(allow_create) {

72 // Compute the file name.

73 file_name_ = path_.Append(IDToFileName(ident_));

74 }

75

76 TextDatabase::~TextDatabase() {

77 }

78

79 // static

80 const base::FilePath::CharType* TextDatabase::file_base() {

81 return kFilePrefix;

82 }

83

84 // static

85 base::FilePath TextDatabase::IDToFileName(DBIdent id) {

86 // Identifiers are intended to be a combination of the year and month, for

87 // example, 200801 for January 2008. We convert this to

88 // "History Index 2008-01". However, we don't make assumptions about this

89 // scheme: the caller should assign IDs as it feels fit with the knowledge

90 // that they will apppear on disk in this form.

91 base::FilePath::StringType filename(file_base());

92 base::StringAppendF(&filename, FILE_PATH_LITERAL("%d-%02d"),

93 id / 100, id % 100);

94 return base::FilePath(filename);

95 }

96

97 // static

98 TextDatabase::DBIdent TextDatabase::FileNameToID(

99 const base::FilePath& file_path) {

100 base::FilePath::StringType file_name = file_path.BaseName().value();

101

102 // We don't actually check the prefix here. Since the file system could

103 // be case insensitive in ways we can't predict (NTFS), checking could

104 // potentially be the wrong thing to do. Instead, we just look for a suffix.

105 static const size_t kIDStringLength = 7; // Room for "xxxx-xx".

106 if (file_name.length() < kIDStringLength)

107 return 0;

108 const base::FilePath::StringType suffix(

109 &file_name[file_name.length() - kIDStringLength]);

110

111 if (suffix.length() != kIDStringLength \|\|

112 suffix[4] != FILE_PATH_LITERAL('-')) {

113 return 0;

114 }

115

116 // TODO: Once StringPiece supports a templated interface over the

117 // underlying string type, use it here instead of substr, since that

118 // will avoid needless string copies. StringPiece cannot be used

119 // right now because base::FilePath::StringType could use either 8 or 16 bit

120 // characters, depending on the OS.

121 int year, month;

122 base::StringToInt(suffix.substr(0, 4), &year);

123 base::StringToInt(suffix.substr(5, 2), &month);

124

125 return year * 100 + month;

126 }

127

128 bool TextDatabase::Init() {

129 // Make sure, if we're not allowed to create the file, that it exists.

130 if (!allow_create_) {

131 if (!file_util::PathExists(file_name_))

132 return false;

133 }

134

135 db_.set_histogram_tag("Text");

136

137 // Set the database page size to something a little larger to give us

138 // better performance (we're typically seek rather than bandwidth limited).

139 // This only has an effect before any tables have been created, otherwise

140 // this is a NOP. Must be a power of 2 and a max of 8192.

141 db_.set_page_size(4096);

142

143 // The default cache size is 2000 which give >8MB of data. Since we will often

144 // have 2-3 of these objects, each with their own 8MB, this adds up very fast.

145 // We therefore reduce the size so when there are multiple objects, we're not

146 // too big.

147 db_.set_cache_size(512);

148

149 // Run the database in exclusive mode. Nobody else should be accessing the

150 // database while we're running, and this will give somewhat improved perf.

151 db_.set_exclusive_locking();

152

153 // Attach the database to our index file.

154 if (!db_.Open(file_name_))

155 return false;

156

157 // Meta table tracking version information.

158 if (!meta_table_.Init(&db_, kCurrentVersionNumber, kCompatibleVersionNumber))

159 return false;

160 if (meta_table_.GetCompatibleVersionNumber() > kCurrentVersionNumber) {

161 // This version is too new. We don't bother notifying the user on this

162 // error, and just fail to use the file. Normally if they have version skew,

163 // they will get it for the main history file and it won't be necessary

164 // here. If that's not the case, since this is only indexed data, it's

165 // probably better to just not give FTS results than strange errors when

166 // everything else is working OK.

167 LOG(WARNING) << "Text database is too new.";

168 return false;

169 }

170

171 return CreateTables();

172 }

173

174 void TextDatabase::BeginTransaction() {

175 db_.BeginTransaction();

176 }

177

178 void TextDatabase::CommitTransaction() {

179 db_.CommitTransaction();

180 }

181

182 bool TextDatabase::CreateTables() {

183 // FTS table of page contents.

184 if (!db_.DoesTableExist("pages")) {

185 if (!db_.Execute("CREATE VIRTUAL TABLE pages USING fts3("

186 "TOKENIZE icu,"

187 "url LONGVARCHAR,"

188 "title LONGVARCHAR,"

189 "body LONGVARCHAR)"))

190 return false;

191 }

192

193 // Non-FTS table containing URLs and times so we can efficiently find them

194 // using a regular index (all FTS columns are special and are treated as

195 // full-text-search, which is not what we want when retrieving this data).

196 if (!db_.DoesTableExist("info")) {

197 // Note that there is no point in creating an index over time. Since

198 // we must always query the entire FTS table (it can not efficiently do

199 // subsets), we will always end up doing that first, and joining the info

200 // table off of that.

201 if (!db_.Execute("CREATE TABLE info(time INTEGER NOT NULL)"))

202 return false;

203 }

204

205 // Create the index.

206 return db_.Execute("CREATE INDEX IF NOT EXISTS info_time ON info(time)");

207 }

208

209 bool TextDatabase::AddPageData(base::Time time,

210 const std::string& url,

211 const std::string& title,

212 const std::string& contents) {

213 sql::Transaction committer(&db_);

214 if (!committer.Begin())

215 return false;

216

217 // Add to the pages table.

218 sql::Statement add_to_pages(db_.GetCachedStatement(SQL_FROM_HERE,

219 "INSERT INTO pages (url, title, body) VALUES (?,?,?)"));

220 add_to_pages.BindString(0, url);

221 add_to_pages.BindString(1, title);

222 add_to_pages.BindString(2, contents);

223 if (!add_to_pages.Run())

224 return false;

225

226 int64 rowid = db_.GetLastInsertRowId();

227

228 // Add to the info table with the same rowid.

229 sql::Statement add_to_info(db_.GetCachedStatement(SQL_FROM_HERE,

230 "INSERT INTO info (rowid, time) VALUES (?,?)"));

231 add_to_info.BindInt64(0, rowid);

232 add_to_info.BindInt64(1, time.ToInternalValue());

233

234 if (!add_to_info.Run())

235 return false;

236

237 return committer.Commit();

238 }

239

240 void TextDatabase::DeletePageData(base::Time time, const std::string& url) {

241 // First get all rows that match. Selecing on time (which has an index) allows

242 // us to avoid brute-force searches on the full-text-index table (there will

243 // generally be only one match per time).

244 sql::Statement select_ids(db_.GetCachedStatement(SQL_FROM_HERE,

245 "SELECT info.rowid "

246 "FROM info JOIN pages ON info.rowid = pages.rowid "

247 "WHERE info.time=? AND pages.url=?"));

248 select_ids.BindInt64(0, time.ToInternalValue());

249 select_ids.BindString(1, url);

250

251 std::set<int64> rows_to_delete;

252 while (select_ids.Step())

253 rows_to_delete.insert(select_ids.ColumnInt64(0));

254

255 // Delete from the pages table.

256 sql::Statement delete_page(db_.GetCachedStatement(SQL_FROM_HERE,

257 "DELETE FROM pages WHERE rowid=?"));

258

259 for (std::set<int64>::const_iterator i = rows_to_delete.begin();

260 i != rows_to_delete.end(); ++i) {

261 delete_page.BindInt64(0, *i);

262 if (!delete_page.Run())

263 return;

264 delete_page.Reset(true);

265 }

266

267 // Delete from the info table.

268 sql::Statement delete_info(db_.GetCachedStatement(SQL_FROM_HERE,

269 "DELETE FROM info WHERE rowid=?"));

270

271 for (std::set<int64>::const_iterator i = rows_to_delete.begin();

272 i != rows_to_delete.end(); ++i) {

273 delete_info.BindInt64(0, *i);

274 if (!delete_info.Run())

275 return;

276 delete_info.Reset(true);

277 }

278 }

279

280 void TextDatabase::Optimize() {

281 sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE,

282 "SELECT OPTIMIZE(pages) FROM pages LIMIT 1"));

283 statement.Run();

284 }

285

286 bool TextDatabase::GetTextMatches(const std::string& query,

287 const QueryOptions& options,

288 std::vector<Match>* results,

289 URLSet* found_urls) {

290 std::string sql = "SELECT url, title, time, offsets(pages), body FROM pages "

291 "LEFT OUTER JOIN info ON pages.rowid = info.rowid WHERE ";

292 sql += options.body_only ? "body " : "pages ";

293 sql += "MATCH ? AND time >= ? AND time < ? ";

294 // Times may not be unique, so also sort by rowid to ensure a stable order.

295 sql += "ORDER BY time DESC, info.rowid DESC";

296

297 // Generate unique IDs for the two possible variations of the statement,

298 // so they don't share the same cached prepared statement.

299 sql::StatementID body_only_id = SQL_FROM_HERE;

300 sql::StatementID pages_id = SQL_FROM_HERE;

301

302 sql::Statement statement(db_.GetCachedStatement(

303 (options.body_only ? body_only_id : pages_id), sql.c_str()));

304

305 statement.BindString(0, query);

306 statement.BindInt64(1, options.EffectiveBeginTime());

307 statement.BindInt64(2, options.EffectiveEndTime());

308

309 // \|results\| may not be initially empty, so keep track of how many were added

310 // by this call.

311 int result_count = 0;

312

313 while (statement.Step()) {

314 // TODO(brettw) allow canceling the query in the middle.

315 // if (canceled_or_something)

316 // break;

317

318 GURL url(statement.ColumnString(0));

319 URLSet::const_iterator found_url = found_urls->find(url);

320 if (found_url != found_urls->end())

321 continue; // Don't add this duplicate.

322

323 if (++result_count > options.EffectiveMaxCount())

324 break;

325

326 // Fill the results into the vector (avoid copying the URL with Swap()).

327 results->resize(results->size() + 1);

328 Match& match = results->at(results->size() - 1);

329 match.url.Swap(&url);

330

331 match.title = statement.ColumnString16(1);

332 match.time = base::Time::FromInternalValue(statement.ColumnInt64(2));

333

334 // Extract any matches in the title.

335 std::string offsets_str = statement.ColumnString(3);

336 Snippet::ExtractMatchPositions(offsets_str, kTitleColumnIndex,

337 &match.title_match_positions);

338 Snippet::ConvertMatchPositionsToWide(statement.ColumnString(1),

339 &match.title_match_positions);

340

341 // Extract the matches in the body.

342 Snippet::MatchPositions match_positions;

343 Snippet::ExtractMatchPositions(offsets_str, kBodyColumnIndex,

344 &match_positions);

345

346 // Compute the snippet based on those matches.

347 std::string body = statement.ColumnString(4);

348 match.snippet.ComputeSnippet(match_positions, body);

349 }

350 statement.Reset(true);

351 return result_count > options.EffectiveMaxCount();

352 }

353

354 } // namespace history

OLD	NEW