Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(371)

Side by Side Diff: chrome/browser/extensions/activity_log/counting_policy.cc

Issue 21646004: Compressed activity log database storage (Closed) Base URL: http://git.chromium.org/chromium/src.git@refactor-cleanups
Patch Set: Factor out dropping of obsolete tables and use in all policies Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // A policy for storing activity log data to a database that performs
6 // aggregation to reduce the size of the database. The database layout is
7 // nearly the same as FullStreamUIPolicy, which stores a complete log, with a
8 // few changes:
9 // - a "count" column is added to track how many log records were merged
10 // together into this row
11 // - the "time" column measures the most recent time that the current row was
12 // updated
13 // When writing a record, if a row already exists where all other columns
14 // (extension_id, action_type, api_name, args, urls, etc.) all match, and the
15 // previous time falls within today (the current time), then the count field on
16 // the old row is incremented. Otherwise, a new row is written.
17 //
18 // For many text columns, repeated strings are compressed by moving string
19 // storage to a separate table ("string_ids") and storing only an identifier in
20 // the logging table. For example, if the api_name_x column contained the
21 // value 4 and the string_ids table contained a row with primary key 4 and
22 // value 'tabs.query', then the api_name field should be taken to have the
23 // value 'tabs.query'. Each column ending with "_x" is compressed in this way.
24 // All lookups are to the string_ids table, except for the page_url_x and
25 // arg_url_x columns, which are converted via the url_ids table (this
26 // separation of URL values is to help simplify history clearing).
27 //
28 // The activitylog_uncompressed view allows for simpler reading of the activity
29 // log contents with identifiers already translated to string values.
30
31 #include "chrome/browser/extensions/activity_log/counting_policy.h"
32
33 #include <map>
34 #include <string>
35 #include <vector>
36
37 #include "base/callback.h"
38 #include "base/files/file_path.h"
39 #include "base/json/json_reader.h"
40 #include "base/json/json_string_value_serializer.h"
41 #include "base/strings/string_util.h"
42 #include "base/strings/stringprintf.h"
43 #include "chrome/common/chrome_constants.h"
44
45 using content::BrowserThread;
46
47 namespace {
48
49 // Delay between cleaning passes (to delete old action records) through the
50 // database.
51 const int kCleaningDelayInHours = 12;
52
53 // We should log the arguments to these API calls. Be careful when
54 // constructing this whitelist to not keep arguments that might compromise
55 // privacy by logging too much data to the activity log.
56 //
57 // TODO(mvrable): The contents of this whitelist should be reviewed and
58 // expanded as needed.
59 const char* kAlwaysLog[] = {"extension.connect", "extension.sendMessage",
60 "tabs.executeScript", "tabs.insertCSS"};
61
62 // Columns in the main database table. See the file-level comment for a
63 // discussion of how data is stored and the meanings of the _x columns.
64 const char* kTableContentFields[] = {
65 "count", "extension_id_x", "time", "action_type", "api_name_x", "args_x",
66 "page_url_x", "page_title_x", "arg_url_x", "other_x"};
67 const char* kTableFieldTypes[] = {
68 "INTEGER NOT NULL DEFAULT 1", "INTEGER NOT NULL", "INTEGER", "INTEGER",
69 "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
70 "INTEGER"};
71
72 // Miscellaneous SQL commands for initializing the database; these should be
73 // idempotent.
74 static const char kPolicyMiscSetup[] =
75 // The activitylog_uncompressed view performs string lookups for simpler
76 // access to the log data.
77 "DROP VIEW IF EXISTS activitylog_uncompressed;\n"
78 "CREATE VIEW activitylog_uncompressed AS\n"
79 "SELECT count,\n"
80 " x1.value AS extension_id,\n"
81 " time,\n"
82 " action_type,\n"
83 " x2.value AS api_name,\n"
84 " x3.value AS args,\n"
85 " x4.value AS page_url,\n"
86 " x5.value AS page_title,\n"
87 " x6.value AS arg_url,\n"
88 " x7.value AS other\n"
89 "FROM activitylog_compressed\n"
90 " LEFT JOIN string_ids AS x1 ON (x1.id = extension_id_x)\n"
91 " LEFT JOIN string_ids AS x2 ON (x2.id = api_name_x)\n"
92 " LEFT JOIN string_ids AS x3 ON (x3.id = args_x)\n"
93 " LEFT JOIN url_ids AS x4 ON (x4.id = page_url_x)\n"
94 " LEFT JOIN string_ids AS x5 ON (x5.id = page_title_x)\n"
95 " LEFT JOIN url_ids AS x6 ON (x6.id = arg_url_x)\n"
96 " LEFT JOIN string_ids AS x7 ON (x7.id = other_x);\n"
97 // An index on all fields except count and time: all the fields that aren't
98 // changed when incrementing a count. This should accelerate finding the
99 // rows to update (at worst several rows will need to be checked to find
100 // the one in the right time range).
101 "CREATE INDEX IF NOT EXISTS activitylog_compressed_index\n"
102 "ON activitylog_compressed(extension_id_x, action_type, api_name_x,\n"
103 " args_x, page_url_x, page_title_x, arg_url_x, other_x)";
104
105 // SQL statements to clean old, unused entries out of the string and URL id
106 // tables.
107 static const char kStringTableCleanup[] =
108 "DELETE FROM string_ids WHERE id NOT IN\n"
109 "(SELECT extension_id_x FROM activitylog_compressed\n"
110 " UNION SELECT api_name_x FROM activitylog_compressed\n"
111 " UNION SELECT args_x FROM activitylog_compressed\n"
112 " UNION SELECT page_title_x FROM activitylog_compressed\n"
113 " UNION SELECT other_x FROM activitylog_compressed)";
114 static const char kUrlTableCleanup[] =
115 "DELETE FROM url_ids WHERE id NOT IN\n"
116 "(SELECT page_url_x FROM activitylog_compressed\n"
117 " UNION SELECT arg_url_x FROM activitylog_compressed)";
118
119 } // namespace
120
121 namespace extensions {
122
123 // A specialized Action subclass which is used to represent an action read from
124 // the database with a corresponding count.
125 class CountedAction : public Action {
126 public:
127 CountedAction(const std::string& extension_id,
128 const base::Time& time,
129 const ActionType action_type,
130 const std::string& api_name)
131 : Action(extension_id, time, action_type, api_name) {}
132
133 // Number of merged records for this action.
134 int count() const { return count_; }
135 void set_count(int count) { count_ = count; }
136
137 virtual std::string PrintForDebug() const OVERRIDE;
138
139 protected:
140 virtual ~CountedAction() {}
141
142 private:
143 int count_;
144 };
145
146 std::string CountedAction::PrintForDebug() const {
147 return base::StringPrintf(
148 "%s COUNT=%d", Action::PrintForDebug().c_str(), count());
149 }
150
151 const char* CountingPolicy::kTableName = "activitylog_compressed";
152 const char* CountingPolicy::kReadViewName = "activitylog_uncompressed";
153
154 CountingPolicy::CountingPolicy(Profile* profile)
155 : ActivityLogDatabasePolicy(
156 profile,
157 base::FilePath(chrome::kExtensionActivityLogFilename)),
158 string_table_("string_ids"),
159 url_table_("url_ids"),
160 closing_(false),
161 retention_time_(base::TimeDelta::FromHours(60)) {
162 for (size_t i = 0; i < arraysize(kAlwaysLog); i++) {
163 api_arg_whitelist_.insert(kAlwaysLog[i]);
164 }
165 }
166
167 CountingPolicy::~CountingPolicy() {}
168
169 bool CountingPolicy::InitDatabase(sql::Connection* db) {
170 if (!Util::DropObsoleteTables(db))
171 return false;
172
173 if (!string_table_.Initialize(db))
174 return false;
175 if (!url_table_.Initialize(db))
176 return false;
177
178 // Create the unified activity log entry table.
179 if (!ActivityDatabase::InitializeTable(db,
180 kTableName,
181 kTableContentFields,
182 kTableFieldTypes,
183 arraysize(kTableContentFields)))
184 return false;
185
186 // Create a view for easily accessing the uncompressed form of the data, and
187 // any necessary indexes if needed.
188 return db->Execute(kPolicyMiscSetup);
189 }
190
191 void CountingPolicy::ProcessAction(scoped_refptr<Action> action) {
192 ScheduleAndForget(this, &CountingPolicy::QueueAction, action);
193 }
194
195 void CountingPolicy::QueueAction(scoped_refptr<Action> action) {
196 if (activity_database()->is_db_valid()) {
197 action = action->Clone();
198 Util::SanitizeAction(action);
199 Util::StripArguments(api_arg_whitelist_, action);
200 queued_actions_.push_back(action);
201 activity_database()->AdviseFlush(queued_actions_.size());
202 }
203 }
204
205 bool CountingPolicy::FlushDatabase(sql::Connection* db) {
206 // Columns that must match exactly for database rows to be coalesced.
207 static const char* matched_columns[] = {
208 "extension_id_x", "action_type", "api_name_x", "args_x", "page_url_x",
209 "page_title_x", "arg_url_x", "other_x"};
210 Action::ActionVector queue;
211 queue.swap(queued_actions_);
212
213 // Whether to clean old records out of the activity log database. Do this
214 // much less frequently than database flushes since it is expensive, but
215 // always check on the first database flush (since there might be a large
216 // amount of data to clear).
217 //
218 // We do not clean old old database records on shutdown, for both performance
219 // and correctness reasons. We don't want to slow down shutting down the
220 // browser. Additionally, in unit tests the call to Now() may invoke a mock
221 // clock which has already been freed by the time the flush on close
222 // happens--so we can't call Now() when the policy is shutting down.
223 bool clean_database = false;
224 if (!closing_) {
felt 2013/08/07 01:09:42 is closing_ touched on both the DB and UI threads?
mvrable 2013/08/07 17:01:19 You're right, it is. Thinking about this some mor
felt 2013/08/08 02:08:50 Thanks, I like the updated version better.
225 clean_database = (last_database_cleaning_time_.is_null() ||
226 Now() - last_database_cleaning_time_ >
227 base::TimeDelta::FromHours(kCleaningDelayInHours));
228 }
229
230 if (queue.empty() && !clean_database)
231 return true;
232
233 sql::Transaction transaction(db);
234 if (!transaction.Begin())
235 return false;
236
237 std::string insert_str =
238 "INSERT INTO " + std::string(kTableName) + "(count, time";
239 std::string update_str =
240 "UPDATE " + std::string(kTableName) +
241 " SET count = count + 1, time = max(?, time)"
242 " WHERE time >= ? AND time < ?";
243
244 for (size_t i = 0; i < arraysize(matched_columns); i++) {
245 insert_str =
246 base::StringPrintf("%s, %s", insert_str.c_str(), matched_columns[i]);
247 update_str = base::StringPrintf(
248 "%s AND %s IS ?", update_str.c_str(), matched_columns[i]);
249 }
250 insert_str += ") VALUES (1, ?";
251 for (size_t i = 0; i < arraysize(matched_columns); i++) {
252 insert_str += ", ?";
253 }
254 insert_str += ")";
255
256 Action::ActionVector::size_type i;
257 for (i = 0; i != queue.size(); ++i) {
258 const Action& action = *queue[i];
259
260 base::Time day_start = action.time().LocalMidnight();
261 base::Time next_day = Util::AddDays(day_start, 1);
262
263 // The contents in values must match up with fields in matched_columns. A
264 // value of -1 is used to encode a null database value.
265 int64 id;
266 std::vector<int64> matched_values;
267
268 if (!string_table_.StringToInt(db, action.extension_id(), &id))
269 return false;
270 matched_values.push_back(id);
271
272 matched_values.push_back(static_cast<int>(action.action_type()));
273
274 if (!string_table_.StringToInt(db, action.api_name(), &id))
275 return false;
276 matched_values.push_back(id);
277
278 if (action.args()) {
279 std::string args = Util::Serialize(action.args());
280 // TODO(mvrable): For now, truncate long argument lists. This is a
281 // workaround for excessively-long values coming from DOM logging. When
282 // the V8ValueConverter is fixed to return more reasonable values, we can
283 // drop the truncation.
284 if (args.length() > 10000) {
285 args = "[\"<too_large>\"]";
286 }
287 if (!string_table_.StringToInt(db, args, &id))
288 return false;
289 matched_values.push_back(id);
290 } else {
291 matched_values.push_back(-1);
292 }
293
294 std::string page_url_string =
295 Util::UrlToString(action.page_url(), action.page_incognito());
296 if (!page_url_string.empty()) {
297 if (!url_table_.StringToInt(db, page_url_string, &id))
298 return false;
299 matched_values.push_back(id);
300 } else {
301 matched_values.push_back(-1);
302 }
303
304 // TODO(mvrable): Create a title_table_?
305 if (!action.page_title().empty()) {
306 if (!string_table_.StringToInt(db, action.page_title(), &id))
307 return false;
308 matched_values.push_back(id);
309 } else {
310 matched_values.push_back(-1);
311 }
312
313 std::string arg_url_string =
314 Util::UrlToString(action.arg_url(), action.arg_incognito());
315 if (!arg_url_string.empty()) {
316 if (!url_table_.StringToInt(db, arg_url_string, &id))
317 return false;
318 matched_values.push_back(id);
319 } else {
320 matched_values.push_back(-1);
321 }
322
323 if (action.other()) {
324 if (!string_table_.StringToInt(db, Util::Serialize(action.other()), &id))
325 return false;
326 matched_values.push_back(id);
327 } else {
328 matched_values.push_back(-1);
329 }
330
331 // Assume there is an existing row for this action, and try to update the
332 // count.
333 sql::Statement update_statement(db->GetCachedStatement(
334 sql::StatementID(SQL_FROM_HERE), update_str.c_str()));
335 update_statement.BindInt64(0, action.time().ToInternalValue());
336 update_statement.BindInt64(1, day_start.ToInternalValue());
337 update_statement.BindInt64(2, next_day.ToInternalValue());
338 for (size_t j = 0; j < matched_values.size(); j++) {
339 if (matched_values[j] == -1)
340 update_statement.BindNull(j + 3);
341 else
342 update_statement.BindInt64(j + 3, matched_values[j]);
343 }
344 if (!update_statement.Run())
345 return false;
346
347 // Check if the update succeeded (was the count of updated rows non-zero)?
348 // If it failed because no matching row existed, fall back to inserting a
349 // new record.
350 if (db->GetLastChangeCount() > 0) {
351 if (db->GetLastChangeCount() > 1) {
352 LOG(WARNING) << "Found and updated multiple rows in the activity log "
353 << "database; counts may be off!";
354 }
355 continue;
356 }
357 sql::Statement insert_statement(db->GetCachedStatement(
358 sql::StatementID(SQL_FROM_HERE), insert_str.c_str()));
359 insert_statement.BindInt64(0, action.time().ToInternalValue());
360 for (size_t j = 0; j < matched_values.size(); j++) {
361 if (matched_values[j] == -1)
362 update_statement.BindNull(j + 1);
363 else
364 insert_statement.BindInt64(j + 1, matched_values[j]);
365 }
366 if (!insert_statement.Run())
367 return false;
368 }
369
370 if (clean_database) {
371 LOG(ERROR) << "Cleaning database";
372 base::Time cutoff = (Now() - retention_time()).LocalMidnight();
373 if (!CleanOlderThan(db, cutoff))
374 return false;
375 last_database_cleaning_time_ = Now();
376 }
377
378 if (!transaction.Commit())
379 return false;
380
381 return true;
382 }
383
384 scoped_ptr<Action::ActionVector> CountingPolicy::DoReadData(
385 const std::string& extension_id,
386 const int days_ago) {
387 // Ensure data is flushed to the database first so that we query over all
388 // data.
389 activity_database()->AdviseFlush(ActivityDatabase::kFlushImmediately);
390
391 DCHECK_GE(days_ago, 0);
392 scoped_ptr<Action::ActionVector> actions(new Action::ActionVector());
393
394 sql::Connection* db = GetDatabaseConnection();
395 if (!db) {
396 return actions.Pass();
397 }
398
399 int64 early_bound;
400 int64 late_bound;
401 Util::ComputeDatabaseTimeBounds(Now(), days_ago, &early_bound, &late_bound);
402 std::string query_str = base::StringPrintf(
403 "SELECT time, action_type, api_name, args, page_url, page_title, "
404 "arg_url, other, count "
405 "FROM %s WHERE extension_id=? AND time>? AND time<=? "
406 "ORDER BY time DESC",
407 kReadViewName);
408 sql::Statement query(db->GetCachedStatement(SQL_FROM_HERE,
409 query_str.c_str()));
410 query.BindString(0, extension_id);
411 query.BindInt64(1, early_bound);
412 query.BindInt64(2, late_bound);
413
414 while (query.is_valid() && query.Step()) {
415 scoped_refptr<CountedAction> action =
416 new CountedAction(extension_id,
417 base::Time::FromInternalValue(query.ColumnInt64(0)),
418 static_cast<Action::ActionType>(query.ColumnInt(1)),
419 query.ColumnString(2));
420
421 if (query.ColumnType(3) != sql::COLUMN_TYPE_NULL) {
422 scoped_ptr<Value> parsed_value(
423 base::JSONReader::Read(query.ColumnString(3)));
424 if (parsed_value && parsed_value->IsType(Value::TYPE_LIST)) {
425 action->set_args(
426 make_scoped_ptr(static_cast<ListValue*>(parsed_value.release())));
427 } else {
428 LOG(WARNING) << "Unable to parse args: '" << query.ColumnString(3)
429 << "'";
430 }
431 }
432
433 GURL page_url(query.ColumnString(4));
434 action->set_page_url(page_url);
435
436 action->set_page_title(query.ColumnString(5));
437
438 GURL arg_url(query.ColumnString(6));
439 action->set_arg_url(arg_url);
440
441 if (query.ColumnType(7) != sql::COLUMN_TYPE_NULL) {
442 scoped_ptr<Value> parsed_value(
443 base::JSONReader::Read(query.ColumnString(7)));
444 if (parsed_value && parsed_value->IsType(Value::TYPE_DICTIONARY)) {
445 action->set_other(make_scoped_ptr(
446 static_cast<DictionaryValue*>(parsed_value.release())));
447 } else {
448 LOG(WARNING) << "Unable to parse other: '" << query.ColumnString(7)
449 << "'";
450 }
451 }
452
453 action->set_count(query.ColumnInt(8));
454
455 actions->push_back(action);
456 }
457
458 return actions.Pass();
459 }
460
461 void CountingPolicy::ReadData(
462 const std::string& extension_id,
463 const int day,
464 const base::Callback<void(scoped_ptr<Action::ActionVector>)>& callback) {
465 BrowserThread::PostTaskAndReplyWithResult(
466 BrowserThread::DB,
467 FROM_HERE,
468 base::Bind(&CountingPolicy::DoReadData,
469 base::Unretained(this),
470 extension_id,
471 day),
472 callback);
473 }
474
475 void CountingPolicy::OnDatabaseFailure() {
476 queued_actions_.clear();
477 }
478
479 void CountingPolicy::OnDatabaseClose() {
480 delete this;
481 }
482
483 // Cleans old records from the activity log database.
484 bool CountingPolicy::CleanOlderThan(sql::Connection* db,
485 const base::Time& cutoff) {
486 std::string clean_statement =
487 "DELETE FROM " + std::string(kTableName) + " WHERE time < ?";
488 sql::Statement cleaner(db->GetCachedStatement(sql::StatementID(SQL_FROM_HERE),
489 clean_statement.c_str()));
490 cleaner.BindInt64(0, cutoff.ToInternalValue());
491 if (!cleaner.Run())
492 return false;
493 return CleanStringTables(db);
494 }
495
496 // Cleans unused interned strings from the database. This should be run after
497 // deleting rows from the main log table to clean out stale values.
498 bool CountingPolicy::CleanStringTables(sql::Connection* db) {
499 sql::Statement cleaner1(db->GetCachedStatement(
500 sql::StatementID(SQL_FROM_HERE), kStringTableCleanup));
501 if (!cleaner1.Run())
502 return false;
503 if (db->GetLastChangeCount() > 0)
504 string_table_.ClearCache();
505
506 sql::Statement cleaner2(db->GetCachedStatement(
507 sql::StatementID(SQL_FROM_HERE), kUrlTableCleanup));
508 if (!cleaner2.Run())
509 return false;
510 if (db->GetLastChangeCount() > 0)
511 url_table_.ClearCache();
512
513 return true;
514 }
515
516 void CountingPolicy::Close() {
517 closing_ = true;
518
519 // The policy object should have never been created if there's no DB thread.
520 DCHECK(BrowserThread::IsMessageLoopValid(BrowserThread::DB));
521 ScheduleAndForget(activity_database(), &ActivityDatabase::Close);
522 }
523
524 } // namespace extensions
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698