Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(698)

Side by Side Diff: components/browser_watcher/postmortem_report_collector.cc

Issue 2715903003: Bound the impact of system instability on chrome instability. (Closed)
Patch Set: Address Siggi's comments Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/browser_watcher/postmortem_report_collector.h" 5 #include "components/browser_watcher/postmortem_report_collector.h"
6 6
7 #include <utility> 7 #include <utility>
8 8
9 #include "base/debug/activity_analyzer.h" 9 #include "base/debug/activity_analyzer.h"
10 #include "base/files/file_enumerator.h" 10 #include "base/files/file_enumerator.h"
(...skipping 18 matching lines...) Expand all
29 using base::debug::ActivityUserData; 29 using base::debug::ActivityUserData;
30 using base::debug::GlobalActivityAnalyzer; 30 using base::debug::GlobalActivityAnalyzer;
31 using base::debug::GlobalActivityTracker; 31 using base::debug::GlobalActivityTracker;
32 using base::debug::ThreadActivityAnalyzer; 32 using base::debug::ThreadActivityAnalyzer;
33 using crashpad::CrashReportDatabase; 33 using crashpad::CrashReportDatabase;
34 34
35 namespace { 35 namespace {
36 36
37 const char kFieldTrialKeyPrefix[] = "FieldTrial."; 37 const char kFieldTrialKeyPrefix[] = "FieldTrial.";
38 38
39 // DO NOT CHANGE VALUES. This is logged persistently in a histogram.
40 enum SystemSessionAnalysisStatus {
41 SYSTEM_SESSION_ANALYSIS_SUCCESS = 0,
42 SYSTEM_SESSION_ANALYSIS_NO_TIMESTAMP = 1,
43 SYSTEM_SESSION_ANALYSIS_NO_ANALYZER = 2,
44 SYSTEM_SESSION_ANALYSIS_FAILED = 3,
45 SYSTEM_SESSION_ANALYSIS_OUTSIDE_RANGE = 4,
46 SYSTEM_SESSION_ANALYSIS_STATUS_MAX = 5
47 };
48
39 // Collects stability user data from the recorded format to the collected 49 // Collects stability user data from the recorded format to the collected
40 // format. 50 // format.
41 void CollectUserData( 51 void CollectUserData(
42 const ActivityUserData::Snapshot& recorded_map, 52 const ActivityUserData::Snapshot& recorded_map,
43 google::protobuf::Map<std::string, TypedValue>* collected_map, 53 google::protobuf::Map<std::string, TypedValue>* collected_map,
44 StabilityReport* report) { 54 StabilityReport* report) {
45 DCHECK(collected_map); 55 DCHECK(collected_map);
46 56
47 for (const auto& name_and_value : recorded_map) { 57 for (const auto& name_and_value : recorded_map) {
48 const std::string& key = name_and_value.first; 58 const std::string& key = name_and_value.first;
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
108 break; 118 break;
109 case ActivityUserData::UNSIGNED_VALUE: 119 case ActivityUserData::UNSIGNED_VALUE:
110 collected_value.set_unsigned_value(recorded_value.GetUint()); 120 collected_value.set_unsigned_value(recorded_value.GetUint());
111 break; 121 break;
112 } 122 }
113 123
114 (*collected_map)[key].Swap(&collected_value); 124 (*collected_map)[key].Swap(&collected_value);
115 } 125 }
116 } 126 }
117 127
128 bool GetStartTimestamp(
129 const google::protobuf::Map<std::string, TypedValue>& global_data,
130 base::Time* time) {
131 DCHECK(time);
132
133 const auto& it = global_data.find(kStabilityStartTimestamp);
134 if (it == global_data.end())
135 return false;
136
137 const TypedValue& value = it->second;
138 if (value.value_case() != TypedValue::kSignedValue)
139 return false;
140
141 *time = base::Time::FromInternalValue(value.signed_value());
142 return true;
143 }
144
118 void CollectModuleInformation( 145 void CollectModuleInformation(
119 const std::vector<GlobalActivityTracker::ModuleInfo>& modules, 146 const std::vector<GlobalActivityTracker::ModuleInfo>& modules,
120 ProcessState* process_state) { 147 ProcessState* process_state) {
121 DCHECK(process_state); 148 DCHECK(process_state);
122 149
123 char code_identifier[17]; 150 char code_identifier[17];
124 char debug_identifier[41]; 151 char debug_identifier[41];
125 152
126 for (const GlobalActivityTracker::ModuleInfo& recorded : modules) { 153 for (const GlobalActivityTracker::ModuleInfo& recorded : modules) {
127 CodeModule* collected = process_state->add_modules(); 154 CodeModule* collected = process_state->add_modules();
(...skipping 18 matching lines...) Expand all
146 collected->set_debug_identifier(debug_identifier); 173 collected->set_debug_identifier(debug_identifier);
147 collected->set_is_unloaded(!recorded.is_loaded); 174 collected->set_is_unloaded(!recorded.is_loaded);
148 } 175 }
149 } 176 }
150 177
151 } // namespace 178 } // namespace
152 179
153 PostmortemReportCollector::PostmortemReportCollector( 180 PostmortemReportCollector::PostmortemReportCollector(
154 const std::string& product_name, 181 const std::string& product_name,
155 const std::string& version_number, 182 const std::string& version_number,
156 const std::string& channel_name) 183 const std::string& channel_name,
184 SystemSessionAnalyzer* analyzer)
157 : product_name_(product_name), 185 : product_name_(product_name),
158 version_number_(version_number), 186 version_number_(version_number),
159 channel_name_(channel_name) {} 187 channel_name_(channel_name),
188 system_session_analyzer_(analyzer) {}
160 189
161 int PostmortemReportCollector::CollectAndSubmitForUpload( 190 PostmortemReportCollector::~PostmortemReportCollector() {}
191
192 int PostmortemReportCollector::CollectAndSubmitAllPendingReports(
162 const base::FilePath& debug_info_dir, 193 const base::FilePath& debug_info_dir,
163 const base::FilePath::StringType& debug_file_pattern, 194 const base::FilePath::StringType& debug_file_pattern,
164 const std::set<base::FilePath>& excluded_debug_files, 195 const std::set<base::FilePath>& excluded_debug_files,
165 crashpad::CrashReportDatabase* report_database) { 196 crashpad::CrashReportDatabase* report_database) {
166 DCHECK_NE(true, debug_info_dir.empty()); 197 DCHECK_NE(true, debug_info_dir.empty());
167 DCHECK_NE(true, debug_file_pattern.empty()); 198 DCHECK_NE(true, debug_file_pattern.empty());
168 DCHECK_NE(nullptr, report_database); 199 DCHECK_NE(nullptr, report_database);
169 200
170 // Collect the list of files to harvest. 201 // Collect the list of files to harvest.
171 std::vector<FilePath> debug_files = GetDebugStateFilePaths( 202 std::vector<FilePath> debug_files = GetDebugStateFilePaths(
172 debug_info_dir, debug_file_pattern, excluded_debug_files); 203 debug_info_dir, debug_file_pattern, excluded_debug_files);
173 204
174 // Determine the crashpad client id. 205 // Determine the crashpad client id.
175 crashpad::UUID client_id; 206 crashpad::UUID client_id;
176 crashpad::Settings* settings = report_database->GetSettings(); 207 crashpad::Settings* settings = report_database->GetSettings();
177 if (settings) { 208 if (settings) {
178 // If GetSettings() or GetClientID() fails client_id will be left at its 209 // If GetSettings() or GetClientID() fails client_id will be left at its
179 // default value, all zeroes, which is appropriate. 210 // default value, all zeroes, which is appropriate.
180 settings->GetClientID(&client_id); 211 settings->GetClientID(&client_id);
181 } 212 }
182 213
183 // Process each stability file. 214 // Process each stability file.
184 int success_cnt = 0; 215 int success_cnt = 0;
185 for (const FilePath& file : debug_files) { 216 for (const FilePath& file : debug_files) {
186 CollectionStatus status = 217 CollectionStatus status =
187 CollectAndSubmit(client_id, file, report_database); 218 CollectAndSubmitOneReport(client_id, file, report_database);
188 // TODO(manzagop): consider making this a stability metric. 219 // TODO(manzagop): consider making this a stability metric.
189 UMA_HISTOGRAM_ENUMERATION("ActivityTracker.Collect.Status", status, 220 UMA_HISTOGRAM_ENUMERATION("ActivityTracker.Collect.Status", status,
190 COLLECTION_STATUS_MAX); 221 COLLECTION_STATUS_MAX);
191 if (status == SUCCESS) 222 if (status == SUCCESS)
192 ++success_cnt; 223 ++success_cnt;
193 } 224 }
194 225
195 return success_cnt; 226 return success_cnt;
196 } 227 }
197 228
(...skipping 10 matching lines...) Expand all
208 debug_file_pattern); 239 debug_file_pattern);
209 FilePath path; 240 FilePath path;
210 for (path = enumerator.Next(); !path.empty(); path = enumerator.Next()) { 241 for (path = enumerator.Next(); !path.empty(); path = enumerator.Next()) {
211 if (excluded_debug_files.find(path) == excluded_debug_files.end()) 242 if (excluded_debug_files.find(path) == excluded_debug_files.end())
212 paths.push_back(path); 243 paths.push_back(path);
213 } 244 }
214 return paths; 245 return paths;
215 } 246 }
216 247
217 PostmortemReportCollector::CollectionStatus 248 PostmortemReportCollector::CollectionStatus
218 PostmortemReportCollector::CollectAndSubmit( 249 PostmortemReportCollector::CollectAndSubmitOneReport(
219 const crashpad::UUID& client_id, 250 const crashpad::UUID& client_id,
220 const FilePath& file, 251 const FilePath& file,
221 crashpad::CrashReportDatabase* report_database) { 252 crashpad::CrashReportDatabase* report_database) {
222 DCHECK_NE(nullptr, report_database); 253 DCHECK_NE(nullptr, report_database);
223 254
224 // Note: the code below involves two notions of report: chrome internal state 255 // Note: the code below involves two notions of report: chrome internal state
225 // reports and the crashpad reports they get wrapped into. 256 // reports and the crashpad reports they get wrapped into.
226 257
227 // Collect the data from the debug file to a proto. Note: a non-empty report 258 // Collect the data from the debug file to a proto. Note: a non-empty report
228 // is interpreted here as an unclean exit. 259 // is interpreted here as an unclean exit.
229 std::unique_ptr<StabilityReport> report_proto; 260 std::unique_ptr<StabilityReport> report_proto;
230 CollectionStatus status = Collect(file, &report_proto); 261 CollectionStatus status = CollectOneReport(file, &report_proto);
231 if (status != SUCCESS) { 262 if (status != SUCCESS) {
232 // The file was empty, or there was an error collecting the data. Detailed 263 // The file was empty, or there was an error collecting the data. Detailed
233 // logging happens within the Collect function. 264 // logging happens within the CollectOneReport function.
234 if (!base::DeleteFile(file, false)) 265 if (!base::DeleteFile(file, false))
235 LOG(ERROR) << "Failed to delete " << file.value(); 266 LOG(ERROR) << "Failed to delete " << file.value();
236 return status; 267 return status;
237 } 268 }
238 DCHECK_NE(nullptr, report_proto.get()); 269 DCHECK_NE(nullptr, report_proto.get());
239 270
240 // Prepare a crashpad report. 271 // Prepare a crashpad report.
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
272 database_status = report_database->FinishedWritingCrashReport( 303 database_status = report_database->FinishedWritingCrashReport(
273 new_report, &unused_report_id); 304 new_report, &unused_report_id);
274 if (database_status != CrashReportDatabase::kNoError) { 305 if (database_status != CrashReportDatabase::kNoError) {
275 LOG(ERROR) << "FinishedWritingCrashReport failed"; 306 LOG(ERROR) << "FinishedWritingCrashReport failed";
276 return FINISHED_WRITING_CRASH_REPORT_FAILED; 307 return FINISHED_WRITING_CRASH_REPORT_FAILED;
277 } 308 }
278 309
279 return SUCCESS; 310 return SUCCESS;
280 } 311 }
281 312
282 PostmortemReportCollector::CollectionStatus PostmortemReportCollector::Collect( 313 PostmortemReportCollector::CollectionStatus
314 PostmortemReportCollector::CollectOneReport(
283 const base::FilePath& debug_state_file, 315 const base::FilePath& debug_state_file,
284 std::unique_ptr<StabilityReport>* report) { 316 std::unique_ptr<StabilityReport>* report) {
285 DCHECK_NE(nullptr, report); 317 DCHECK_NE(nullptr, report);
286 report->reset(); 318 report->reset();
287 319
288 // Create a global analyzer. 320 // Create a global analyzer.
289 std::unique_ptr<GlobalActivityAnalyzer> global_analyzer = 321 std::unique_ptr<GlobalActivityAnalyzer> global_analyzer =
290 GlobalActivityAnalyzer::CreateWithFile(debug_state_file); 322 GlobalActivityAnalyzer::CreateWithFile(debug_state_file);
291 if (!global_analyzer) 323 if (!global_analyzer)
292 return ANALYZER_CREATION_FAILED; 324 return ANALYZER_CREATION_FAILED;
(...skipping 10 matching lines...) Expand all
303 335
304 // Create the report, then flesh it out. 336 // Create the report, then flesh it out.
305 report->reset(new StabilityReport()); 337 report->reset(new StabilityReport());
306 338
307 // Collect log messages. 339 // Collect log messages.
308 for (const std::string& message : log_messages) { 340 for (const std::string& message : log_messages) {
309 (*report)->add_log_messages(message); 341 (*report)->add_log_messages(message);
310 } 342 }
311 343
312 // Collect global user data. 344 // Collect global user data.
313 google::protobuf::Map<std::string, TypedValue>& global_data = 345 CollectUserData(global_data_snapshot, (*report)->mutable_global_data(),
314 *(*report)->mutable_global_data(); 346 report->get());
315 CollectUserData(global_data_snapshot, &global_data, report->get()); 347 SetReporterDetails(report->get());
316 348 RecordSystemShutdownState(report->get());
317 // Add the reporting Chrome's details to the report.
318 global_data[kStabilityReporterChannel].set_string_value(channel_name());
319 #if defined(ARCH_CPU_X86)
320 global_data[kStabilityReporterPlatform].set_string_value(
321 std::string("Win32"));
322 #elif defined(ARCH_CPU_X86_64)
323 global_data[kStabilityReporterPlatform].set_string_value(
324 std::string("Win64"));
325 #endif
326 global_data[kStabilityReporterProduct].set_string_value(product_name());
327 global_data[kStabilityReporterVersion].set_string_value(version_number());
328 349
329 // Collect thread activity data. 350 // Collect thread activity data.
330 // Note: a single process is instrumented. 351 // Note: only the browser process records stability data for now.
331 ProcessState* process_state = (*report)->add_process_states(); 352 ProcessState* process_state = (*report)->add_process_states();
332 for (; thread_analyzer != nullptr; 353 for (; thread_analyzer != nullptr;
333 thread_analyzer = global_analyzer->GetNextAnalyzer()) { 354 thread_analyzer = global_analyzer->GetNextAnalyzer()) {
334 // Only valid analyzers are expected per contract of GetFirstAnalyzer / 355 // Only valid analyzers are expected per contract of GetFirstAnalyzer /
335 // GetNextAnalyzer. 356 // GetNextAnalyzer.
336 DCHECK(thread_analyzer->IsValid()); 357 DCHECK(thread_analyzer->IsValid());
337 358
338 if (!process_state->has_process_id()) { 359 if (!process_state->has_process_id()) {
339 process_state->set_process_id( 360 process_state->set_process_id(
340 thread_analyzer->activity_snapshot().process_id); 361 thread_analyzer->activity_snapshot().process_id);
341 } 362 }
342 DCHECK_EQ(thread_analyzer->activity_snapshot().process_id, 363 DCHECK_EQ(thread_analyzer->activity_snapshot().process_id,
343 process_state->process_id()); 364 process_state->process_id());
344 365
345 ThreadState* thread_state = process_state->add_threads(); 366 ThreadState* thread_state = process_state->add_threads();
346 CollectThread(thread_analyzer->activity_snapshot(), thread_state); 367 CollectThread(thread_analyzer->activity_snapshot(), thread_state);
347 } 368 }
348 369
349 // Collect module information. 370 // Collect module information.
350 CollectModuleInformation(global_analyzer->GetModules(), process_state); 371 CollectModuleInformation(global_analyzer->GetModules(), process_state);
351 372
352 return SUCCESS; 373 return SUCCESS;
353 } 374 }
354 375
376 void PostmortemReportCollector::SetReporterDetails(
377 StabilityReport* report) const {
378 DCHECK(report);
379
380 google::protobuf::Map<std::string, TypedValue>& global_data =
381 *(report->mutable_global_data());
382
383 // Reporter version details. These are useful as the reporter may be of a
384 // different version.
385 global_data[kStabilityReporterChannel].set_string_value(channel_name());
386 #if defined(ARCH_CPU_X86)
387 global_data[kStabilityReporterPlatform].set_string_value(
388 std::string("Win32"));
389 #elif defined(ARCH_CPU_X86_64)
390 global_data[kStabilityReporterPlatform].set_string_value(
391 std::string("Win64"));
392 #endif
393 global_data[kStabilityReporterProduct].set_string_value(product_name());
394 global_data[kStabilityReporterVersion].set_string_value(version_number());
395 }
396
397 void PostmortemReportCollector::RecordSystemShutdownState(
398 StabilityReport* report) const {
399 DCHECK(report);
400
401 // The session state for the stability report, recorded to provided visibility
402 // into whether the system session was clean.
403 SystemState::SessionState session_state = SystemState::UNKNOWN;
404 // The status of the analysis, recorded to provide insight into the success
405 // or failure of the analysis.
406 SystemSessionAnalysisStatus status = SYSTEM_SESSION_ANALYSIS_SUCCESS;
407
408 base::Time time;
409 if (!GetStartTimestamp(report->global_data(), &time)) {
410 status = SYSTEM_SESSION_ANALYSIS_NO_TIMESTAMP;
411 } else if (!system_session_analyzer_) {
412 status = SYSTEM_SESSION_ANALYSIS_NO_ANALYZER;
413 } else {
414 SystemSessionAnalyzer::Status analyzer_status =
415 system_session_analyzer_->IsSessionUnclean(time);
416 switch (analyzer_status) {
417 case SystemSessionAnalyzer::FAILED:
418 status = SYSTEM_SESSION_ANALYSIS_FAILED;
419 break;
420 case SystemSessionAnalyzer::CLEAN:
421 session_state = SystemState::CLEAN;
422 break;
423 case SystemSessionAnalyzer::UNCLEAN:
424 session_state = SystemState::UNCLEAN;
425 break;
426 case SystemSessionAnalyzer::OUTSIDE_RANGE:
427 status = SYSTEM_SESSION_ANALYSIS_OUTSIDE_RANGE;
428 break;
429 }
430 }
431
432 report->mutable_system_state()->set_session_state(session_state);
433 UMA_HISTOGRAM_ENUMERATION(
434 "ActivityTracker.Collect.SystemSessionAnalysisStatus", status,
435 SYSTEM_SESSION_ANALYSIS_STATUS_MAX);
436 }
437
355 void PostmortemReportCollector::CollectThread( 438 void PostmortemReportCollector::CollectThread(
356 const base::debug::ThreadActivityAnalyzer::Snapshot& snapshot, 439 const base::debug::ThreadActivityAnalyzer::Snapshot& snapshot,
357 ThreadState* thread_state) { 440 ThreadState* thread_state) {
358 DCHECK(thread_state); 441 DCHECK(thread_state);
359 442
360 thread_state->set_thread_name(snapshot.thread_name); 443 thread_state->set_thread_name(snapshot.thread_name);
361 thread_state->set_thread_id(snapshot.thread_id); 444 thread_state->set_thread_id(snapshot.thread_id);
362 thread_state->set_activity_count(snapshot.activity_stack_depth); 445 thread_state->set_activity_count(snapshot.activity_stack_depth);
363 446
364 for (size_t i = 0; i < snapshot.activity_stack.size(); ++i) { 447 for (size_t i = 0; i < snapshot.activity_stack.size(); ++i) {
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
404 StabilityReport* report, 487 StabilityReport* report,
405 const crashpad::UUID& client_id, 488 const crashpad::UUID& client_id,
406 const crashpad::UUID& report_id, 489 const crashpad::UUID& report_id,
407 base::PlatformFile minidump_file) { 490 base::PlatformFile minidump_file) {
408 DCHECK(report); 491 DCHECK(report);
409 492
410 return WritePostmortemDump(minidump_file, client_id, report_id, report); 493 return WritePostmortemDump(minidump_file, client_id, report_id, report);
411 } 494 }
412 495
413 } // namespace browser_watcher 496 } // namespace browser_watcher
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698