OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """Performance Test Bisect Tool | 6 """Performance Test Bisect Tool |
7 | 7 |
8 This script bisects a series of changelists using binary search. It starts at | 8 This script bisects a series of changelists using binary search. It starts at |
9 a bad revision where a performance metric has regressed, and asks for a last | 9 a bad revision where a performance metric has regressed, and asks for a last |
10 known-good revision. It will then binary search across this revision range by | 10 known-good revision. It will then binary search across this revision range by |
(...skipping 2323 matching lines...) |
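The docstring above describes a binary search over a revision range bounded by a known-good and a known-bad revision. A minimal sketch of that idea, in plain Python (metric_is_bad is a hypothetical callback standing in for this script's sync/build/test step; it is not part of the patch):

def BisectRange(revisions, metric_is_bad):
  # revisions[0] is the last known-good revision, revisions[-1] the first
  # known-bad one. metric_is_bad(rev) builds and runs the test at rev and
  # returns True if the performance metric has regressed there.
  good, bad = 0, len(revisions) - 1
  while bad - good > 1:
    mid = (good + bad) // 2
    if metric_is_bad(revisions[mid]):
      bad = mid    # regression happened at or before mid
    else:
      good = mid   # regression happened after mid
  return revisions[bad]  # first revision where the metric regressed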
2334 print | 2334 print |
2335 print 'To reproduce locally:' | 2335 print 'To reproduce locally:' |
2336 print '$ ' + self.opts.command | 2336 print '$ ' + self.opts.command |
2337 if bisect_utils.IsTelemetryCommand(self.opts.command): | 2337 if bisect_utils.IsTelemetryCommand(self.opts.command): |
2338 print | 2338 print |
2339 print 'Also consider passing --profiler=list to see available profilers.' | 2339 print 'Also consider passing --profiler=list to see available profilers.' |
2340 | 2340 |
2341 def _PrintOtherRegressions(self, other_regressions, revision_data): | 2341 def _PrintOtherRegressions(self, other_regressions, revision_data): |
2342 print | 2342 print |
2343 print 'Other regressions may have occurred:' | 2343 print 'Other regressions may have occurred:' |
| 2344 print ' %8s %82s %10s' % ('Depot'.center(8, ' '), |
| 2345 'Range'.center(82, ' '), 'Confidence'.center(10, ' ')) |
2344 for regression in other_regressions: | 2346 for regression in other_regressions: |
2345 current_id, previous_id, percent_change, deviations = regression | 2347 current_id, previous_id, confidence = regression |
2346 current_data = revision_data[current_id] | 2348 current_data = revision_data[current_id] |
2347 previous_data = revision_data[previous_id] | 2349 previous_data = revision_data[previous_id] |
2348 | 2350 |
2349 if deviations is None: | 2351 print ' %8s %s..%s %s' % ( |
2350 deviations = 'N/A' | 2352 current_data['depot'], current_id, previous_id, |
2351 else: | 2353 ('%d%%' % confidence).center(10, ' ')) |
2352 deviations = '%.2f' % deviations | |
2353 | |
2354 if percent_change is None: | |
2355 percent_change = 0 | |
2356 | |
2357 print ' %8s %s [%.2f%%, %s x std.dev]' % ( | |
2358 previous_data['depot'], previous_id, 100 * percent_change, deviations) | |
2359 print ' %8s %s' % (current_data['depot'], current_id) | |
2360 print | 2354 print |
2361 | 2355 |
2362 def _PrintStepTime(self, revision_data_sorted): | 2356 def _PrintStepTime(self, revision_data_sorted): |
2363 step_perf_time_avg = 0.0 | 2357 step_perf_time_avg = 0.0 |
2364 step_build_time_avg = 0.0 | 2358 step_build_time_avg = 0.0 |
2365 step_count = 0.0 | 2359 step_count = 0.0 |
2366 for _, current_data in revision_data_sorted: | 2360 for _, current_data in revision_data_sorted: |
2367 step_perf_time_avg += current_data['perf_time'] | 2361 step_perf_time_avg += current_data['perf_time'] |
2368 step_build_time_avg += current_data['build_time'] | 2362 step_build_time_avg += current_data['build_time'] |
2369 step_count += 1 | 2363 step_count += 1 |
2370 if step_count: | 2364 if step_count: |
2371 step_perf_time_avg = step_perf_time_avg / step_count | 2365 step_perf_time_avg = step_perf_time_avg / step_count |
2372 step_build_time_avg = step_build_time_avg / step_count | 2366 step_build_time_avg = step_build_time_avg / step_count |
2373 print | 2367 print |
2374 print 'Average build time : %s' % datetime.timedelta( | 2368 print 'Average build time : %s' % datetime.timedelta( |
2375 seconds=int(step_build_time_avg)) | 2369 seconds=int(step_build_time_avg)) |
2376 print 'Average test time : %s' % datetime.timedelta( | 2370 print 'Average test time : %s' % datetime.timedelta( |
2377 seconds=int(step_perf_time_avg)) | 2371 seconds=int(step_perf_time_avg)) |
2378 | 2372 |
2379 def _PrintWarnings(self): | 2373 def _PrintWarnings(self): |
2380 if not self.warnings: | 2374 if not self.warnings: |
2381 return | 2375 return |
2382 print | 2376 print |
2383 print 'WARNINGS:' | 2377 print 'WARNINGS:' |
2384 for w in self.warnings: | 2378 for w in self.warnings: |
2385 print ' !!! %s' % w | 2379 print ' !!! %s' % w |
2386 | 2380 |
| 2381 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): |
| 2382 other_regressions = [] |
| 2383 previous_values = [] |
| 2384 previous_id = None |
| 2385 for current_id, current_data in revision_data_sorted: |
| 2386 current_values = current_data['value'] |
| 2387 if current_values: |
| 2388 current_values = current_values['values'] |
| 2389 if previous_values: |
| 2390 confidence = self._CalculateConfidence(previous_values, |
| 2391 [current_values]) |
| 2392 mean_of_prev_runs = CalculateTruncatedMean( |
| 2393 sum(previous_values, []), 0) |
| 2394 mean_of_current_runs = CalculateTruncatedMean(current_values, 0) |
| 2395 |
| 2396 # Check that the potential regression is in the same direction as |
| 2397 # the overall regression. If the mean of the previous runs < the |
| 2398 # mean of the current runs, this local regression is in the same |
| 2399 # direction. |
| 2400 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs |
| 2401 is_same_direction = (prev_less_than_current if |
| 2402 bad_greater_than_good else not prev_less_than_current) |
| 2403 |
| 2404 # Only report potential regressions with high confidence. |
| 2405 if is_same_direction and confidence > 50: |
| 2406 other_regressions.append([current_id, previous_id, confidence]) |
| 2407 previous_values.append(current_values) |
| 2408 previous_id = current_id |
| 2409 return other_regressions |
| 2410 |
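To illustrate the direction check in _FindOtherRegressions above: suppose the overall bisect found the metric going up (bad_greater_than_good is True); a local jump is then reported only if it also goes up. A sketch with made-up numbers, using plain arithmetic rather than this script's helpers:

# Made-up run values for two adjacent revisions.
previous_values = [[10.1, 10.0, 10.2]]   # accumulated runs from earlier revisions
current_values = [12.0, 11.8, 12.1]      # runs at the current revision

flat_prev = sum(previous_values, [])
mean_of_prev_runs = sum(flat_prev) / len(flat_prev)                # ~10.1
mean_of_current_runs = sum(current_values) / len(current_values)   # ~12.0

prev_less_than_current = mean_of_prev_runs < mean_of_current_runs  # True
bad_greater_than_good = True   # overall regression made the metric larger
is_same_direction = (prev_less_than_current if bad_greater_than_good
                     else not prev_less_than_current)               # True
# The pair would be reported only if is_same_direction is True and the
# confidence score from _CalculateConfidence is above 50.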
| 2411 def _CalculateConfidence(self, working_means, broken_means): |
| 2412 bounds_working = [] |
| 2413 bounds_broken = [] |
| 2414 for m in working_means: |
| 2415 current_mean = CalculateTruncatedMean(m, 0) |
| 2416 if bounds_working: |
| 2417 bounds_working[0] = min(current_mean, bounds_working[0]) |
| 2418 bounds_working[1] = max(current_mean, bounds_working[1]) |
| 2419 else: |
| 2420 bounds_working = [current_mean, current_mean] |
| 2421 for m in broken_means: |
| 2422 current_mean = CalculateTruncatedMean(m, 0) |
| 2423 if bounds_broken: |
| 2424 bounds_broken[0] = min(current_mean, bounds_broken[0]) |
| 2425 bounds_broken[1] = max(current_mean, bounds_broken[1]) |
| 2426 else: |
| 2427 bounds_broken = [current_mean, current_mean] |
| 2428 dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]), |
| 2429 math.fabs(bounds_broken[0] - bounds_working[1])) |
| 2430 working_mean = sum(working_means, []) |
| 2431 broken_mean = sum(broken_means, []) |
| 2432 len_working_group = CalculateStandardDeviation(working_mean) |
| 2433 len_broken_group = CalculateStandardDeviation(broken_mean) |
| 2434 |
| 2435 confidence = (dist_between_groups / ( |
| 2436 max(0.0001, (len_broken_group + len_working_group )))) |
| 2437 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) |
| 2438 return confidence |
| 2439 |
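In plain terms, _CalculateConfidence above compares how far apart the two groups' per-revision mean ranges are against how noisy the pooled samples are, then clamps the ratio into [0, 100]. A worked example with made-up numbers (not from a real bisect run):

# Per-revision truncated means: working side spans [100.0, 101.0],
# broken side spans [110.0, 111.0].
bounds_working = [100.0, 101.0]
bounds_broken = [110.0, 111.0]
dist_between_groups = min(abs(bounds_broken[1] - bounds_working[0]),
                          abs(bounds_broken[0] - bounds_working[1]))  # 9.0

# Assume the pooled working/broken samples have these standard deviations.
std_working, std_broken = 0.5, 0.7

confidence = dist_between_groups / max(0.0001, std_working + std_broken)  # 7.5
confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)                  # 100
# A gap that dwarfs the noise clamps to 100; overlapping, noisy groups
# drive the score toward 0.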
2387 def _GetResultsDict(self, revision_data, revision_data_sorted): | 2440 def _GetResultsDict(self, revision_data, revision_data_sorted): |
2388 # Find range where it possibly broke. | 2441 # Find range where it possibly broke. |
2389 first_working_revision = None | 2442 first_working_revision = None |
2390 first_working_revision_index = -1 | 2443 first_working_revision_index = -1 |
2391 last_broken_revision = None | 2444 last_broken_revision = None |
2392 last_broken_revision_index = -1 | 2445 last_broken_revision_index = -1 |
2393 | 2446 |
2394 for i in xrange(len(revision_data_sorted)): | 2447 for i in xrange(len(revision_data_sorted)): |
2395 k, v = revision_data_sorted[i] | 2448 k, v = revision_data_sorted[i] |
2396 if v['passed'] == 1: | 2449 if v['passed'] == 1: |
2397 if not first_working_revision: | 2450 if not first_working_revision: |
2398 first_working_revision = k | 2451 first_working_revision = k |
2399 first_working_revision_index = i | 2452 first_working_revision_index = i |
2400 | 2453 |
2401 if not v['passed']: | 2454 if not v['passed']: |
2402 last_broken_revision = k | 2455 last_broken_revision = k |
2403 last_broken_revision_index = i | 2456 last_broken_revision_index = i |
2404 | 2457 |
2405 if last_broken_revision != None and first_working_revision != None: | 2458 if last_broken_revision != None and first_working_revision != None: |
2406 bounds_broken = [revision_data[last_broken_revision]['value']['mean'], | 2459 broken_means = [] |
2407 revision_data[last_broken_revision]['value']['mean']] | |
2408 broken_mean = [] | |
2409 for i in xrange(0, last_broken_revision_index + 1): | 2460 for i in xrange(0, last_broken_revision_index + 1): |
2410 if revision_data_sorted[i][1]['value']: | 2461 if revision_data_sorted[i][1]['value']: |
2411 bounds_broken[0] = min(bounds_broken[0], | 2462 broken_means.append(revision_data_sorted[i][1]['value']['values']) |
2412 revision_data_sorted[i][1]['value']['mean']) | |
2413 bounds_broken[1] = max(bounds_broken[1], | |
2414 revision_data_sorted[i][1]['value']['mean']) | |
2415 broken_mean.extend(revision_data_sorted[i][1]['value']['values']) | |
2416 | 2463 |
2417 bounds_working = [revision_data[first_working_revision]['value']['mean'], | 2464 working_means = [] |
2418 revision_data[first_working_revision]['value']['mean']] | |
2419 working_mean = [] | |
2420 for i in xrange(first_working_revision_index, len(revision_data_sorted)): | 2465 for i in xrange(first_working_revision_index, len(revision_data_sorted)): |
2421 if revision_data_sorted[i][1]['value']: | 2466 if revision_data_sorted[i][1]['value']: |
2422 bounds_working[0] = min(bounds_working[0], | 2467 working_means.append(revision_data_sorted[i][1]['value']['values']) |
2423 revision_data_sorted[i][1]['value']['mean']) | 2468 |
2424 bounds_working[1] = max(bounds_working[1], | 2469 # Flatten the lists to calculate mean of all values. |
2425 revision_data_sorted[i][1]['value']['mean']) | 2470 working_mean = sum(working_means, []) |
2426 working_mean.extend(revision_data_sorted[i][1]['value']['values']) | 2471 broken_mean = sum(broken_means, []) |
2427 | 2472 |
2428 # Calculate the approximate size of the regression | 2473 # Calculate the approximate size of the regression |
2429 mean_of_bad_runs = CalculateTruncatedMean(broken_mean, 0.0) | 2474 mean_of_bad_runs = CalculateTruncatedMean(broken_mean, 0.0) |
2430 mean_of_good_runs = CalculateTruncatedMean(working_mean, 0.0) | 2475 mean_of_good_runs = CalculateTruncatedMean(working_mean, 0.0) |
2431 | 2476 |
2432 regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) / | 2477 regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) / |
2433 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0 | 2478 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0 |
2434 | 2479 |
2435 regression_std_err = math.fabs(CalculatePooledStandardError( | 2480 regression_std_err = math.fabs(CalculatePooledStandardError( |
2436 [working_mean, broken_mean]) / | 2481 [working_mean, broken_mean]) / |
2437 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 | 2482 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
2438 | 2483 |
2439 # Give a "confidence" in the bisect. At the moment we use how distinct the | 2484 # Give a "confidence" in the bisect. At the moment we use how distinct the |
2440 # values are before and after the last broken revision, and how noisy the | 2485 # values are before and after the last broken revision, and how noisy the |
2441 # overall graph is. | 2486 # overall graph is. |
2442 dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]), | 2487 confidence = self._CalculateConfidence(working_means, broken_means) |
2443 math.fabs(bounds_broken[0] - bounds_working[1])) | |
2444 len_working_group = CalculateStandardDeviation(working_mean) | |
2445 len_broken_group = CalculateStandardDeviation(broken_mean) | |
2446 | |
2447 confidence = (dist_between_groups / ( | |
2448 max(0.0001, (len_broken_group + len_working_group )))) | |
2449 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) | |
2450 | 2488 |
2451 culprit_revisions = [] | 2489 culprit_revisions = [] |
2452 | 2490 |
2453 cwd = os.getcwd() | 2491 cwd = os.getcwd() |
2454 self.ChangeToDepotWorkingDirectory( | 2492 self.ChangeToDepotWorkingDirectory( |
2455 revision_data[last_broken_revision]['depot']) | 2493 revision_data[last_broken_revision]['depot']) |
2456 | 2494 |
2457 if revision_data[last_broken_revision]['depot'] == 'cros': | 2495 if revision_data[last_broken_revision]['depot'] == 'cros': |
2458 # Want to get a list of all the commits and what depots they belong | 2496 # Want to get a list of all the commits and what depots they belong |
2459 # to so that we can grab info about each. | 2497 # to so that we can grab info about each. |
(...skipping 30 matching lines...) |
2490 for i in xrange(last_broken_revision_index, len(revision_data_sorted)): | 2528 for i in xrange(last_broken_revision_index, len(revision_data_sorted)): |
2491 k, v = revision_data_sorted[i] | 2529 k, v = revision_data_sorted[i] |
2492 if k == first_working_revision: | 2530 if k == first_working_revision: |
2493 break | 2531 break |
2494 self.ChangeToDepotWorkingDirectory(v['depot']) | 2532 self.ChangeToDepotWorkingDirectory(v['depot']) |
2495 info = self.source_control.QueryRevisionInfo(k) | 2533 info = self.source_control.QueryRevisionInfo(k) |
2496 culprit_revisions.append((k, info, v['depot'])) | 2534 culprit_revisions.append((k, info, v['depot'])) |
2497 os.chdir(cwd) | 2535 os.chdir(cwd) |
2498 | 2536 |
2499 # Check for any other possible regression ranges | 2537 # Check for any other possible regression ranges |
2500 good_std_dev = revision_data[first_working_revision]['value']['std_err'] | 2538 other_regressions = self._FindOtherRegressions(revision_data_sorted, |
2501 good_mean = revision_data[first_working_revision]['value']['mean'] | 2539 mean_of_bad_runs > mean_of_good_runs) |
2502 bad_mean = revision_data[last_broken_revision]['value']['mean'] | |
2503 prev_revision_data = revision_data_sorted[0][1] | |
2504 prev_revision_id = revision_data_sorted[0][0] | |
2505 other_regressions = [] | |
2506 for current_id, current_data in revision_data_sorted: | |
2507 if current_data['value']: | |
2508 prev_mean = prev_revision_data['value']['mean'] | |
2509 cur_mean = current_data['value']['mean'] | |
2510 | |
2511 if good_std_dev: | |
2512 deviations = math.fabs(prev_mean - cur_mean) / good_std_dev | |
2513 else: | |
2514 deviations = None | |
2515 | |
2516 if good_mean: | |
2517 percent_change = (prev_mean - cur_mean) / good_mean | |
2518 | |
2519 # If the "good" valuse are supposed to be higher than the "bad" | |
2520 # values (ie. scores), flip the sign of the percent change so that | |
2521 # a positive value always represents a regression. | |
2522 if bad_mean < good_mean: | |
2523 percent_change *= -1.0 | |
2524 else: | |
2525 percent_change = None | |
2526 | |
2527 if deviations >= 1.5 or percent_change > 0.01: | |
2528 if current_id != first_working_revision: | |
2529 other_regressions.append( | |
2530 [current_id, prev_revision_id, percent_change, deviations]) | |
2531 prev_revision_data = current_data | |
2532 prev_revision_id = current_id | |
2533 | 2540 |
2534 # Check for warnings: | 2541 # Check for warnings: |
2535 if len(culprit_revisions) > 1: | 2542 if len(culprit_revisions) > 1: |
2536 self.warnings.append('Due to build errors, regression range could ' | 2543 self.warnings.append('Due to build errors, regression range could ' |
2537 'not be narrowed down to a single commit.') | 2544 'not be narrowed down to a single commit.') |
2538 if self.opts.repeat_test_count == 1: | 2545 if self.opts.repeat_test_count == 1: |
2539 self.warnings.append('Tests were only set to run once. This may ' | 2546 self.warnings.append('Tests were only set to run once. This may ' |
2540 'be insufficient to get meaningful results.') | 2547 'be insufficient to get meaningful results.') |
2541 if confidence < 100: | 2548 if confidence < 100: |
2542 if confidence: | 2549 if confidence: |
(...skipping 423 matching lines...) |
2966 # The perf dashboard scrapes the "results" step in order to comment on | 2973 # The perf dashboard scrapes the "results" step in order to comment on |
2967 # bugs. If you change this, please update the perf dashboard as well. | 2974 # bugs. If you change this, please update the perf dashboard as well. |
2968 bisect_utils.OutputAnnotationStepStart('Results') | 2975 bisect_utils.OutputAnnotationStepStart('Results') |
2969 print 'Error: %s' % e.message | 2976 print 'Error: %s' % e.message |
2970 if opts.output_buildbot_annotations: | 2977 if opts.output_buildbot_annotations: |
2971 bisect_utils.OutputAnnotationStepClosed() | 2978 bisect_utils.OutputAnnotationStepClosed() |
2972 return 1 | 2979 return 1 |
2973 | 2980 |
2974 if __name__ == '__main__': | 2981 if __name__ == '__main__': |
2975 sys.exit(main()) | 2982 sys.exit(main()) |