Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(43)

Unified Diff: chrome/browser/extensions/activity_log/counting_policy.cc

Issue 21646004: Compressed activity log database storage (Closed) Base URL: http://git.chromium.org/chromium/src.git@refactor-cleanups
Patch Set: Factor out dropping of obsolete tables and use in all policies Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/extensions/activity_log/counting_policy.cc
diff --git a/chrome/browser/extensions/activity_log/counting_policy.cc b/chrome/browser/extensions/activity_log/counting_policy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..306498312c89c43e876d90fdb9d56e3cfaeb5e23
--- /dev/null
+++ b/chrome/browser/extensions/activity_log/counting_policy.cc
@@ -0,0 +1,524 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// A policy for storing activity log data to a database that performs
+// aggregation to reduce the size of the database. The database layout is
+// nearly the same as FullStreamUIPolicy, which stores a complete log, with a
+// few changes:
+// - a "count" column is added to track how many log records were merged
+// together into this row
+// - the "time" column measures the most recent time that the current row was
+// updated
+// When writing a record, if a row already exists where all other columns
+// (extension_id, action_type, api_name, args, urls, etc.) all match, and the
+// previous time falls within today (the current time), then the count field on
+// the old row is incremented. Otherwise, a new row is written.
+//
+// For many text columns, repeated strings are compressed by moving string
+// storage to a separate table ("string_ids") and storing only an identifier in
+// the logging table. For example, if the api_name_x column contained the
+// value 4 and the string_ids table contained a row with primary key 4 and
+// value 'tabs.query', then the api_name field should be taken to have the
+// value 'tabs.query'. Each column ending with "_x" is compressed in this way.
+// All lookups are to the string_ids table, except for the page_url_x and
+// arg_url_x columns, which are converted via the url_ids table (this
+// separation of URL values is to help simplify history clearing).
+//
+// The activitylog_uncompressed view allows for simpler reading of the activity
+// log contents with identifiers already translated to string values.
+
+#include "chrome/browser/extensions/activity_log/counting_policy.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "base/callback.h"
+#include "base/files/file_path.h"
+#include "base/json/json_reader.h"
+#include "base/json/json_string_value_serializer.h"
+#include "base/strings/string_util.h"
+#include "base/strings/stringprintf.h"
+#include "chrome/common/chrome_constants.h"
+
+using content::BrowserThread;
+
+namespace {
+
+// Delay between cleaning passes (to delete old action records) through the
+// database.
+const int kCleaningDelayInHours = 12;
+
+// We should log the arguments to these API calls. Be careful when
+// constructing this whitelist to not keep arguments that might compromise
+// privacy by logging too much data to the activity log.
+//
+// TODO(mvrable): The contents of this whitelist should be reviewed and
+// expanded as needed.
+const char* kAlwaysLog[] = {"extension.connect", "extension.sendMessage",
+ "tabs.executeScript", "tabs.insertCSS"};
+
+// Columns in the main database table. See the file-level comment for a
+// discussion of how data is stored and the meanings of the _x columns.
+const char* kTableContentFields[] = {
+ "count", "extension_id_x", "time", "action_type", "api_name_x", "args_x",
+ "page_url_x", "page_title_x", "arg_url_x", "other_x"};
+const char* kTableFieldTypes[] = {
+ "INTEGER NOT NULL DEFAULT 1", "INTEGER NOT NULL", "INTEGER", "INTEGER",
+ "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
+ "INTEGER"};
+
+// Miscellaneous SQL commands for initializing the database; these should be
+// idempotent.
+static const char kPolicyMiscSetup[] =
+ // The activitylog_uncompressed view performs string lookups for simpler
+ // access to the log data.
+ "DROP VIEW IF EXISTS activitylog_uncompressed;\n"
+ "CREATE VIEW activitylog_uncompressed AS\n"
+ "SELECT count,\n"
+ " x1.value AS extension_id,\n"
+ " time,\n"
+ " action_type,\n"
+ " x2.value AS api_name,\n"
+ " x3.value AS args,\n"
+ " x4.value AS page_url,\n"
+ " x5.value AS page_title,\n"
+ " x6.value AS arg_url,\n"
+ " x7.value AS other\n"
+ "FROM activitylog_compressed\n"
+ " LEFT JOIN string_ids AS x1 ON (x1.id = extension_id_x)\n"
+ " LEFT JOIN string_ids AS x2 ON (x2.id = api_name_x)\n"
+ " LEFT JOIN string_ids AS x3 ON (x3.id = args_x)\n"
+ " LEFT JOIN url_ids AS x4 ON (x4.id = page_url_x)\n"
+ " LEFT JOIN string_ids AS x5 ON (x5.id = page_title_x)\n"
+ " LEFT JOIN url_ids AS x6 ON (x6.id = arg_url_x)\n"
+ " LEFT JOIN string_ids AS x7 ON (x7.id = other_x);\n"
+ // An index on all fields except count and time: all the fields that aren't
+ // changed when incrementing a count. This should accelerate finding the
+ // rows to update (at worst several rows will need to be checked to find
+ // the one in the right time range).
+ "CREATE INDEX IF NOT EXISTS activitylog_compressed_index\n"
+ "ON activitylog_compressed(extension_id_x, action_type, api_name_x,\n"
+ " args_x, page_url_x, page_title_x, arg_url_x, other_x)";
+
+// SQL statements to clean old, unused entries out of the string and URL id
+// tables.
+static const char kStringTableCleanup[] =
+ "DELETE FROM string_ids WHERE id NOT IN\n"
+ "(SELECT extension_id_x FROM activitylog_compressed\n"
+ " UNION SELECT api_name_x FROM activitylog_compressed\n"
+ " UNION SELECT args_x FROM activitylog_compressed\n"
+ " UNION SELECT page_title_x FROM activitylog_compressed\n"
+ " UNION SELECT other_x FROM activitylog_compressed)";
+static const char kUrlTableCleanup[] =
+ "DELETE FROM url_ids WHERE id NOT IN\n"
+ "(SELECT page_url_x FROM activitylog_compressed\n"
+ " UNION SELECT arg_url_x FROM activitylog_compressed)";
+
+} // namespace
+
+namespace extensions {
+
+// A specialized Action subclass which is used to represent an action read from
+// the database with a corresponding count.
+class CountedAction : public Action {
+ public:
+ CountedAction(const std::string& extension_id,
+ const base::Time& time,
+ const ActionType action_type,
+ const std::string& api_name)
+ : Action(extension_id, time, action_type, api_name) {}
+
+ // Number of merged records for this action.
+ int count() const { return count_; }
+ void set_count(int count) { count_ = count; }
+
+ virtual std::string PrintForDebug() const OVERRIDE;
+
+ protected:
+ virtual ~CountedAction() {}
+
+ private:
+ int count_;
+};
+
+std::string CountedAction::PrintForDebug() const {
+ return base::StringPrintf(
+ "%s COUNT=%d", Action::PrintForDebug().c_str(), count());
+}
+
+const char* CountingPolicy::kTableName = "activitylog_compressed";
+const char* CountingPolicy::kReadViewName = "activitylog_uncompressed";
+
+CountingPolicy::CountingPolicy(Profile* profile)
+ : ActivityLogDatabasePolicy(
+ profile,
+ base::FilePath(chrome::kExtensionActivityLogFilename)),
+ string_table_("string_ids"),
+ url_table_("url_ids"),
+ closing_(false),
+ retention_time_(base::TimeDelta::FromHours(60)) {
+ for (size_t i = 0; i < arraysize(kAlwaysLog); i++) {
+ api_arg_whitelist_.insert(kAlwaysLog[i]);
+ }
+}
+
+CountingPolicy::~CountingPolicy() {}
+
+bool CountingPolicy::InitDatabase(sql::Connection* db) {
+ if (!Util::DropObsoleteTables(db))
+ return false;
+
+ if (!string_table_.Initialize(db))
+ return false;
+ if (!url_table_.Initialize(db))
+ return false;
+
+ // Create the unified activity log entry table.
+ if (!ActivityDatabase::InitializeTable(db,
+ kTableName,
+ kTableContentFields,
+ kTableFieldTypes,
+ arraysize(kTableContentFields)))
+ return false;
+
+ // Create a view for easily accessing the uncompressed form of the data, and
+ // any necessary indexes if needed.
+ return db->Execute(kPolicyMiscSetup);
+}
+
+void CountingPolicy::ProcessAction(scoped_refptr<Action> action) {
+ ScheduleAndForget(this, &CountingPolicy::QueueAction, action);
+}
+
+void CountingPolicy::QueueAction(scoped_refptr<Action> action) {
+ if (activity_database()->is_db_valid()) {
+ action = action->Clone();
+ Util::SanitizeAction(action);
+ Util::StripArguments(api_arg_whitelist_, action);
+ queued_actions_.push_back(action);
+ activity_database()->AdviseFlush(queued_actions_.size());
+ }
+}
+
+bool CountingPolicy::FlushDatabase(sql::Connection* db) {
+ // Columns that must match exactly for database rows to be coalesced.
+ static const char* matched_columns[] = {
+ "extension_id_x", "action_type", "api_name_x", "args_x", "page_url_x",
+ "page_title_x", "arg_url_x", "other_x"};
+ Action::ActionVector queue;
+ queue.swap(queued_actions_);
+
+ // Whether to clean old records out of the activity log database. Do this
+ // much less frequently than database flushes since it is expensive, but
+ // always check on the first database flush (since there might be a large
+ // amount of data to clear).
+ //
+ // We do not clean old old database records on shutdown, for both performance
+ // and correctness reasons. We don't want to slow down shutting down the
+ // browser. Additionally, in unit tests the call to Now() may invoke a mock
+ // clock which has already been freed by the time the flush on close
+ // happens--so we can't call Now() when the policy is shutting down.
+ bool clean_database = false;
+ if (!closing_) {
felt 2013/08/07 01:09:42 is closing_ touched on both the DB and UI threads?
mvrable 2013/08/07 17:01:19 You're right, it is. Thinking about this some mor
felt 2013/08/08 02:08:50 Thanks, I like the updated version better.
+ clean_database = (last_database_cleaning_time_.is_null() ||
+ Now() - last_database_cleaning_time_ >
+ base::TimeDelta::FromHours(kCleaningDelayInHours));
+ }
+
+ if (queue.empty() && !clean_database)
+ return true;
+
+ sql::Transaction transaction(db);
+ if (!transaction.Begin())
+ return false;
+
+ std::string insert_str =
+ "INSERT INTO " + std::string(kTableName) + "(count, time";
+ std::string update_str =
+ "UPDATE " + std::string(kTableName) +
+ " SET count = count + 1, time = max(?, time)"
+ " WHERE time >= ? AND time < ?";
+
+ for (size_t i = 0; i < arraysize(matched_columns); i++) {
+ insert_str =
+ base::StringPrintf("%s, %s", insert_str.c_str(), matched_columns[i]);
+ update_str = base::StringPrintf(
+ "%s AND %s IS ?", update_str.c_str(), matched_columns[i]);
+ }
+ insert_str += ") VALUES (1, ?";
+ for (size_t i = 0; i < arraysize(matched_columns); i++) {
+ insert_str += ", ?";
+ }
+ insert_str += ")";
+
+ Action::ActionVector::size_type i;
+ for (i = 0; i != queue.size(); ++i) {
+ const Action& action = *queue[i];
+
+ base::Time day_start = action.time().LocalMidnight();
+ base::Time next_day = Util::AddDays(day_start, 1);
+
+ // The contents in values must match up with fields in matched_columns. A
+ // value of -1 is used to encode a null database value.
+ int64 id;
+ std::vector<int64> matched_values;
+
+ if (!string_table_.StringToInt(db, action.extension_id(), &id))
+ return false;
+ matched_values.push_back(id);
+
+ matched_values.push_back(static_cast<int>(action.action_type()));
+
+ if (!string_table_.StringToInt(db, action.api_name(), &id))
+ return false;
+ matched_values.push_back(id);
+
+ if (action.args()) {
+ std::string args = Util::Serialize(action.args());
+ // TODO(mvrable): For now, truncate long argument lists. This is a
+ // workaround for excessively-long values coming from DOM logging. When
+ // the V8ValueConverter is fixed to return more reasonable values, we can
+ // drop the truncation.
+ if (args.length() > 10000) {
+ args = "[\"<too_large>\"]";
+ }
+ if (!string_table_.StringToInt(db, args, &id))
+ return false;
+ matched_values.push_back(id);
+ } else {
+ matched_values.push_back(-1);
+ }
+
+ std::string page_url_string =
+ Util::UrlToString(action.page_url(), action.page_incognito());
+ if (!page_url_string.empty()) {
+ if (!url_table_.StringToInt(db, page_url_string, &id))
+ return false;
+ matched_values.push_back(id);
+ } else {
+ matched_values.push_back(-1);
+ }
+
+ // TODO(mvrable): Create a title_table_?
+ if (!action.page_title().empty()) {
+ if (!string_table_.StringToInt(db, action.page_title(), &id))
+ return false;
+ matched_values.push_back(id);
+ } else {
+ matched_values.push_back(-1);
+ }
+
+ std::string arg_url_string =
+ Util::UrlToString(action.arg_url(), action.arg_incognito());
+ if (!arg_url_string.empty()) {
+ if (!url_table_.StringToInt(db, arg_url_string, &id))
+ return false;
+ matched_values.push_back(id);
+ } else {
+ matched_values.push_back(-1);
+ }
+
+ if (action.other()) {
+ if (!string_table_.StringToInt(db, Util::Serialize(action.other()), &id))
+ return false;
+ matched_values.push_back(id);
+ } else {
+ matched_values.push_back(-1);
+ }
+
+ // Assume there is an existing row for this action, and try to update the
+ // count.
+ sql::Statement update_statement(db->GetCachedStatement(
+ sql::StatementID(SQL_FROM_HERE), update_str.c_str()));
+ update_statement.BindInt64(0, action.time().ToInternalValue());
+ update_statement.BindInt64(1, day_start.ToInternalValue());
+ update_statement.BindInt64(2, next_day.ToInternalValue());
+ for (size_t j = 0; j < matched_values.size(); j++) {
+ if (matched_values[j] == -1)
+ update_statement.BindNull(j + 3);
+ else
+ update_statement.BindInt64(j + 3, matched_values[j]);
+ }
+ if (!update_statement.Run())
+ return false;
+
+ // Check if the update succeeded (was the count of updated rows non-zero)?
+ // If it failed because no matching row existed, fall back to inserting a
+ // new record.
+ if (db->GetLastChangeCount() > 0) {
+ if (db->GetLastChangeCount() > 1) {
+ LOG(WARNING) << "Found and updated multiple rows in the activity log "
+ << "database; counts may be off!";
+ }
+ continue;
+ }
+ sql::Statement insert_statement(db->GetCachedStatement(
+ sql::StatementID(SQL_FROM_HERE), insert_str.c_str()));
+ insert_statement.BindInt64(0, action.time().ToInternalValue());
+ for (size_t j = 0; j < matched_values.size(); j++) {
+ if (matched_values[j] == -1)
+ update_statement.BindNull(j + 1);
+ else
+ insert_statement.BindInt64(j + 1, matched_values[j]);
+ }
+ if (!insert_statement.Run())
+ return false;
+ }
+
+ if (clean_database) {
+ LOG(ERROR) << "Cleaning database";
+ base::Time cutoff = (Now() - retention_time()).LocalMidnight();
+ if (!CleanOlderThan(db, cutoff))
+ return false;
+ last_database_cleaning_time_ = Now();
+ }
+
+ if (!transaction.Commit())
+ return false;
+
+ return true;
+}
+
+scoped_ptr<Action::ActionVector> CountingPolicy::DoReadData(
+ const std::string& extension_id,
+ const int days_ago) {
+ // Ensure data is flushed to the database first so that we query over all
+ // data.
+ activity_database()->AdviseFlush(ActivityDatabase::kFlushImmediately);
+
+ DCHECK_GE(days_ago, 0);
+ scoped_ptr<Action::ActionVector> actions(new Action::ActionVector());
+
+ sql::Connection* db = GetDatabaseConnection();
+ if (!db) {
+ return actions.Pass();
+ }
+
+ int64 early_bound;
+ int64 late_bound;
+ Util::ComputeDatabaseTimeBounds(Now(), days_ago, &early_bound, &late_bound);
+ std::string query_str = base::StringPrintf(
+ "SELECT time, action_type, api_name, args, page_url, page_title, "
+ "arg_url, other, count "
+ "FROM %s WHERE extension_id=? AND time>? AND time<=? "
+ "ORDER BY time DESC",
+ kReadViewName);
+ sql::Statement query(db->GetCachedStatement(SQL_FROM_HERE,
+ query_str.c_str()));
+ query.BindString(0, extension_id);
+ query.BindInt64(1, early_bound);
+ query.BindInt64(2, late_bound);
+
+ while (query.is_valid() && query.Step()) {
+ scoped_refptr<CountedAction> action =
+ new CountedAction(extension_id,
+ base::Time::FromInternalValue(query.ColumnInt64(0)),
+ static_cast<Action::ActionType>(query.ColumnInt(1)),
+ query.ColumnString(2));
+
+ if (query.ColumnType(3) != sql::COLUMN_TYPE_NULL) {
+ scoped_ptr<Value> parsed_value(
+ base::JSONReader::Read(query.ColumnString(3)));
+ if (parsed_value && parsed_value->IsType(Value::TYPE_LIST)) {
+ action->set_args(
+ make_scoped_ptr(static_cast<ListValue*>(parsed_value.release())));
+ } else {
+ LOG(WARNING) << "Unable to parse args: '" << query.ColumnString(3)
+ << "'";
+ }
+ }
+
+ GURL page_url(query.ColumnString(4));
+ action->set_page_url(page_url);
+
+ action->set_page_title(query.ColumnString(5));
+
+ GURL arg_url(query.ColumnString(6));
+ action->set_arg_url(arg_url);
+
+ if (query.ColumnType(7) != sql::COLUMN_TYPE_NULL) {
+ scoped_ptr<Value> parsed_value(
+ base::JSONReader::Read(query.ColumnString(7)));
+ if (parsed_value && parsed_value->IsType(Value::TYPE_DICTIONARY)) {
+ action->set_other(make_scoped_ptr(
+ static_cast<DictionaryValue*>(parsed_value.release())));
+ } else {
+ LOG(WARNING) << "Unable to parse other: '" << query.ColumnString(7)
+ << "'";
+ }
+ }
+
+ action->set_count(query.ColumnInt(8));
+
+ actions->push_back(action);
+ }
+
+ return actions.Pass();
+}
+
+void CountingPolicy::ReadData(
+ const std::string& extension_id,
+ const int day,
+ const base::Callback<void(scoped_ptr<Action::ActionVector>)>& callback) {
+ BrowserThread::PostTaskAndReplyWithResult(
+ BrowserThread::DB,
+ FROM_HERE,
+ base::Bind(&CountingPolicy::DoReadData,
+ base::Unretained(this),
+ extension_id,
+ day),
+ callback);
+}
+
+void CountingPolicy::OnDatabaseFailure() {
+ queued_actions_.clear();
+}
+
+void CountingPolicy::OnDatabaseClose() {
+ delete this;
+}
+
+// Cleans old records from the activity log database.
+bool CountingPolicy::CleanOlderThan(sql::Connection* db,
+ const base::Time& cutoff) {
+ std::string clean_statement =
+ "DELETE FROM " + std::string(kTableName) + " WHERE time < ?";
+ sql::Statement cleaner(db->GetCachedStatement(sql::StatementID(SQL_FROM_HERE),
+ clean_statement.c_str()));
+ cleaner.BindInt64(0, cutoff.ToInternalValue());
+ if (!cleaner.Run())
+ return false;
+ return CleanStringTables(db);
+}
+
+// Cleans unused interned strings from the database. This should be run after
+// deleting rows from the main log table to clean out stale values.
+bool CountingPolicy::CleanStringTables(sql::Connection* db) {
+ sql::Statement cleaner1(db->GetCachedStatement(
+ sql::StatementID(SQL_FROM_HERE), kStringTableCleanup));
+ if (!cleaner1.Run())
+ return false;
+ if (db->GetLastChangeCount() > 0)
+ string_table_.ClearCache();
+
+ sql::Statement cleaner2(db->GetCachedStatement(
+ sql::StatementID(SQL_FROM_HERE), kUrlTableCleanup));
+ if (!cleaner2.Run())
+ return false;
+ if (db->GetLastChangeCount() > 0)
+ url_table_.ClearCache();
+
+ return true;
+}
+
+void CountingPolicy::Close() {
+ closing_ = true;
+
+ // The policy object should have never been created if there's no DB thread.
+ DCHECK(BrowserThread::IsMessageLoopValid(BrowserThread::DB));
+ ScheduleAndForget(activity_database(), &ActivityDatabase::Close);
+}
+
+} // namespace extensions

Powered by Google App Engine
This is Rietveld 408576698