Chromium Code Reviews
Diff: tools/bisect-perf-regression.py

Issue 122563003: Refactor calculation of "other regressions" output. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: . Created 6 years, 11 months ago
 #!/usr/bin/env python
 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Performance Test Bisect Tool

 This script bisects a series of changelists using binary search. It starts at
 a bad revision where a performance metric has regressed, and asks for a last
 known-good revision. It will then binary search across this revision range by
(...skipping 2323 matching lines...)
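An editorial sketch, not part of the patch: the binary search the docstring describes, in minimal Python 2. `revisions` is assumed ordered from known-good to known-bad, and `is_good` is a hypothetical stand-in for syncing, building, and running the performance test at one revision.

def bisect(revisions, is_good):
  # revisions[0] is known-good, revisions[-1] is known-bad.
  good, bad = 0, len(revisions) - 1
  while bad - good > 1:
    mid = (good + bad) / 2            # Python 2 integer division
    if is_good(revisions[mid]):
      good = mid                      # regression lies after mid
    else:
      bad = mid                       # regression lies at or before mid
  return revisions[bad]               # first bad revision found

# Example: the metric regresses at revision 'r6'.
print bisect(['r1', 'r2', 'r5', 'r6', 'r9'], lambda r: r < 'r6')  # 'r6'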
     print
     print 'To reproduce locally:'
     print '$ ' + self.opts.command
     if bisect_utils.IsTelemetryCommand(self.opts.command):
       print
       print 'Also consider passing --profiler=list to see available profilers.'

   def _PrintOtherRegressions(self, other_regressions, revision_data):
     print
     print 'Other regressions may have occurred:'
+    print ' %8s %82s %10s' % ('Depot'.center(8, ' '),
+        'Range'.center(82, ' '), 'Confidence'.center(10, ' '))
     for regression in other_regressions:
-      current_id, previous_id, percent_change, deviations = regression
+      current_id, previous_id, confidence = regression
       current_data = revision_data[current_id]
       previous_data = revision_data[previous_id]

-      if deviations is None:
-        deviations = 'N/A'
-      else:
-        deviations = '%.2f' % deviations
-
-      if percent_change is None:
-        percent_change = 0
-
-      print ' %8s %s [%.2f%%, %s x std.dev]' % (
-          previous_data['depot'], previous_id, 100 * percent_change, deviations)
-      print ' %8s %s' % (current_data['depot'], current_id)
+      print ' %8s %s..%s %s' % (
+          current_data['depot'], current_id, previous_id,
+          ('%d%%' % confidence).center(10, ' '))
     print

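For illustration only, not part of the patch: the new header and row formatting above can be exercised standalone. The depot name, revision ids, and 95% confidence below are made up.

print ' %8s %82s %10s' % ('Depot'.center(8, ' '),
    'Range'.center(82, ' '), 'Confidence'.center(10, ' '))
print ' %8s %s..%s %s' % (
    'chromium', 'a1b2c3d', 'f6e5d4c',
    ('%d%%' % 95).center(10, ' '))

Each regression now prints as a single depot/range/confidence row rather than the old two-line percent-change and std-dev form.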
   def _PrintStepTime(self, revision_data_sorted):
     step_perf_time_avg = 0.0
     step_build_time_avg = 0.0
     step_count = 0.0
     for _, current_data in revision_data_sorted:
       step_perf_time_avg += current_data['perf_time']
       step_build_time_avg += current_data['build_time']
       step_count += 1
     if step_count:
       step_perf_time_avg = step_perf_time_avg / step_count
       step_build_time_avg = step_build_time_avg / step_count
     print
     print 'Average build time : %s' % datetime.timedelta(
         seconds=int(step_build_time_avg))
     print 'Average test time : %s' % datetime.timedelta(
         seconds=int(step_perf_time_avg))

   def _PrintWarnings(self):
     if not self.warnings:
       return
     print
     print 'WARNINGS:'
     for w in self.warnings:
       print ' !!! %s' % w

+  def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good):
+    other_regressions = []
+    previous_values = []
+    previous_id = None
+    for current_id, current_data in revision_data_sorted:
+      current_values = current_data['value']
+      if current_values:
+        current_values = current_values['values']
+        if previous_values:
+          confidence = self._CalculateConfidence(previous_values,
+              [current_values])
+          mean_of_prev_runs = CalculateTruncatedMean(
+              sum(previous_values, []), 0)
+          mean_of_current_runs = CalculateTruncatedMean(current_values, 0)
+
+          # Check that the potential regression is in the same direction as
+          # the overall regression. If the mean of the previous runs < the
+          # mean of the current runs, this local regression is in the same
+          # direction.
+          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
+          is_same_direction = (prev_less_than_current if
+              bad_greater_than_good else not prev_less_than_current)
+
+          # Only report potential regressions with high confidence.
+          if is_same_direction and confidence > 50:
+            other_regressions.append([current_id, previous_id, confidence])
+        previous_values.append(current_values)
+      previous_id = current_id
+    return other_regressions
+
+  def _CalculateConfidence(self, working_means, broken_means):
+    bounds_working = []
+    bounds_broken = []
+    for m in working_means:
+      current_mean = CalculateTruncatedMean(m, 0)
+      if bounds_working:
+        bounds_working[0] = min(current_mean, bounds_working[0])
+        bounds_working[1] = max(current_mean, bounds_working[1])
+      else:
+        bounds_working = [current_mean, current_mean]
+    for m in broken_means:
+      current_mean = CalculateTruncatedMean(m, 0)
+      if bounds_broken:
+        bounds_broken[0] = min(current_mean, bounds_broken[0])
+        bounds_broken[1] = max(current_mean, bounds_broken[1])
+      else:
+        bounds_broken = [current_mean, current_mean]
+    dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]),
+        math.fabs(bounds_broken[0] - bounds_working[1]))
+    working_mean = sum(working_means, [])
+    broken_mean = sum(broken_means, [])
+    len_working_group = CalculateStandardDeviation(working_mean)
+    len_broken_group = CalculateStandardDeviation(broken_mean)
+
+    confidence = (dist_between_groups / (
+        max(0.0001, (len_broken_group + len_working_group))))
+    confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
+    return confidence
+
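To make the heuristic concrete, here is a self-contained sketch of _CalculateConfidence above. CalculateTruncatedMean and CalculateStandardDeviation live elsewhere in this file; the stand-ins below assume a plain mean (truncation of 0) and a population standard deviation, so details may differ from the real helpers.

import math

def _mean(values):
  return sum(values) / float(len(values))

def _std_dev(values):
  mean = _mean(values)
  return math.sqrt(sum((v - mean) ** 2 for v in values) / float(len(values)))

def calculate_confidence(working_means, broken_means):
  # Bounds are the min and max of each group's per-run means.
  working = [_mean(m) for m in working_means]
  broken = [_mean(m) for m in broken_means]
  bounds_working = [min(working), max(working)]
  bounds_broken = [min(broken), max(broken)]
  # Distance between the two groups, divided by the combined noise of the
  # flattened samples, clamped to [0, 1] and scaled to a percentage.
  dist_between_groups = min(
      math.fabs(bounds_broken[1] - bounds_working[0]),
      math.fabs(bounds_broken[0] - bounds_working[1]))
  noise = (_std_dev(sum(working_means, [])) +
           _std_dev(sum(broken_means, [])))
  confidence = dist_between_groups / max(0.0001, noise)
  return int(min(1.0, max(confidence, 0.0)) * 100.0)

# Well-separated, low-noise groups score the maximum:
print calculate_confidence([[10.0, 10.1], [10.2, 10.3]],
                           [[20.0, 19.9], [20.1, 20.2]])  # 100

The score saturates at 100 once the gap between the group bounds exceeds the summed standard deviations of the two flattened groups.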
   def _GetResultsDict(self, revision_data, revision_data_sorted):
     # Find range where it possibly broke.
     first_working_revision = None
     first_working_revision_index = -1
     last_broken_revision = None
     last_broken_revision_index = -1

     for i in xrange(len(revision_data_sorted)):
       k, v = revision_data_sorted[i]
       if v['passed'] == 1:
         if not first_working_revision:
           first_working_revision = k
           first_working_revision_index = i

       if not v['passed']:
         last_broken_revision = k
         last_broken_revision_index = i

     if last_broken_revision != None and first_working_revision != None:
-      bounds_broken = [revision_data[last_broken_revision]['value']['mean'],
-                       revision_data[last_broken_revision]['value']['mean']]
-      broken_mean = []
+      broken_means = []
       for i in xrange(0, last_broken_revision_index + 1):
         if revision_data_sorted[i][1]['value']:
-          bounds_broken[0] = min(bounds_broken[0],
-              revision_data_sorted[i][1]['value']['mean'])
-          bounds_broken[1] = max(bounds_broken[1],
-              revision_data_sorted[i][1]['value']['mean'])
-          broken_mean.extend(revision_data_sorted[i][1]['value']['values'])
+          broken_means.append(revision_data_sorted[i][1]['value']['values'])

-      bounds_working = [revision_data[first_working_revision]['value']['mean'],
-                        revision_data[first_working_revision]['value']['mean']]
-      working_mean = []
+      working_means = []
       for i in xrange(first_working_revision_index, len(revision_data_sorted)):
         if revision_data_sorted[i][1]['value']:
-          bounds_working[0] = min(bounds_working[0],
-              revision_data_sorted[i][1]['value']['mean'])
-          bounds_working[1] = max(bounds_working[1],
-              revision_data_sorted[i][1]['value']['mean'])
-          working_mean.extend(revision_data_sorted[i][1]['value']['values'])
+          working_means.append(revision_data_sorted[i][1]['value']['values'])
+
+      # Flatten the lists to calculate mean of all values.
+      working_mean = sum(working_means, [])
+      broken_mean = sum(broken_means, [])

       # Calculate the approximate size of the regression
       mean_of_bad_runs = CalculateTruncatedMean(broken_mean, 0.0)
       mean_of_good_runs = CalculateTruncatedMean(working_mean, 0.0)

       regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) /
           max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0

       regression_std_err = math.fabs(CalculatePooledStandardError(
           [working_mean, broken_mean]) /
           max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

       # Give a "confidence" in the bisect. At the moment we use how distinct the
       # values are before and after the last broken revision, and how noisy the
       # overall graph is.
-      dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]),
-          math.fabs(bounds_broken[0] - bounds_working[1]))
-      len_working_group = CalculateStandardDeviation(working_mean)
-      len_broken_group = CalculateStandardDeviation(broken_mean)
-
-      confidence = (dist_between_groups / (
-          max(0.0001, (len_broken_group + len_working_group ))))
-      confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
+      confidence = self._CalculateConfidence(working_means, broken_means)

       culprit_revisions = []

       cwd = os.getcwd()
       self.ChangeToDepotWorkingDirectory(
           revision_data[last_broken_revision]['depot'])

       if revision_data[last_broken_revision]['depot'] == 'cros':
         # Want to get a list of all the commits and what depots they belong
         # to so that we can grab info about each.
(...skipping 30 matching lines...)
       for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
         k, v = revision_data_sorted[i]
         if k == first_working_revision:
           break
         self.ChangeToDepotWorkingDirectory(v['depot'])
         info = self.source_control.QueryRevisionInfo(k)
         culprit_revisions.append((k, info, v['depot']))
       os.chdir(cwd)

       # Check for any other possible regression ranges
-      good_std_dev = revision_data[first_working_revision]['value']['std_err']
-      good_mean = revision_data[first_working_revision]['value']['mean']
-      bad_mean = revision_data[last_broken_revision]['value']['mean']
-      prev_revision_data = revision_data_sorted[0][1]
-      prev_revision_id = revision_data_sorted[0][0]
-      other_regressions = []
-      for current_id, current_data in revision_data_sorted:
-        if current_data['value']:
-          prev_mean = prev_revision_data['value']['mean']
-          cur_mean = current_data['value']['mean']
-
-          if good_std_dev:
-            deviations = math.fabs(prev_mean - cur_mean) / good_std_dev
-          else:
-            deviations = None
-
-          if good_mean:
-            percent_change = (prev_mean - cur_mean) / good_mean
-
-            # If the "good" values are supposed to be higher than the "bad"
-            # values (i.e. scores), flip the sign of the percent change so that
-            # a positive value always represents a regression.
-            if bad_mean < good_mean:
-              percent_change *= -1.0
-          else:
-            percent_change = None
-
-          if deviations >= 1.5 or percent_change > 0.01:
-            if current_id != first_working_revision:
-              other_regressions.append(
-                  [current_id, prev_revision_id, percent_change, deviations])
-          prev_revision_data = current_data
-          prev_revision_id = current_id
+      other_regressions = self._FindOtherRegressions(revision_data_sorted,
+          mean_of_bad_runs > mean_of_good_runs)

       # Check for warnings:
       if len(culprit_revisions) > 1:
         self.warnings.append('Due to build errors, regression range could '
                              'not be narrowed down to a single commit.')
       if self.opts.repeat_test_count == 1:
         self.warnings.append('Tests were only set to run once. This may '
                              'be insufficient to get meaningful results.')
       if confidence < 100:
         if confidence:
(...skipping 423 matching lines...)
     # The perf dashboard scrapes the "results" step in order to comment on
     # bugs. If you change this, please update the perf dashboard as well.
     bisect_utils.OutputAnnotationStepStart('Results')
     print 'Error: %s' % e.message
     if opts.output_buildbot_annotations:
       bisect_utils.OutputAnnotationStepClosed()
   return 1

 if __name__ == '__main__':
   sys.exit(main())
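A final reader's aid, not part of the patch: the regression_size formula in _GetResultsDict above, worked on made-up numbers, shows how the gap between good and bad means is reported as a percentage.

import math

# Hypothetical numbers: good runs average 100.0, bad runs average 120.0.
mean_of_good_runs = 100.0
mean_of_bad_runs = 120.0
regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) /
    max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0
print regression_size  # 20.0, i.e. a 20% regression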