Chromium Code Reviews
Diff: tools/bisect-perf-regression.py

Issue 122563003: Refactor calculation of "other regressions" output. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: . Created 6 years, 11 months ago
 #!/usr/bin/env python
 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Performance Test Bisect Tool

 This script bisects a series of changelists using binary search. It starts at
 a bad revision where a performance metric has regressed, and asks for a last
 known-good revision. It will then binary search across this revision range by
(...skipping 2323 matching lines...)
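An editorial sketch, not part of the patch: the binary search the docstring describes, in minimal Python 2. `revisions` is assumed ordered from known-good to known-bad, and `is_good` is a hypothetical stand-in for syncing, building, and running the performance test at one revision.

def bisect(revisions, is_good):
  # revisions[0] is known-good, revisions[-1] is known-bad.
  good, bad = 0, len(revisions) - 1
  while bad - good > 1:
    mid = (good + bad) / 2            # Python 2 integer division
    if is_good(revisions[mid]):
      good = mid                      # regression lies after mid
    else:
      bad = mid                       # regression lies at or before mid
  return revisions[bad]               # first bad revision found

# Example: the metric regresses at revision 'r6'.
print bisect(['r1', 'r2', 'r5', 'r6', 'r9'], lambda r: r < 'r6')  # 'r6'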
     print
     print 'To reproduce locally:'
     print '$ ' + self.opts.command
     if bisect_utils.IsTelemetryCommand(self.opts.command):
       print
       print 'Also consider passing --profiler=list to see available profilers.'

   def _PrintOtherRegressions(self, other_regressions, revision_data):
     print
     print 'Other regressions may have occurred:'
+    print ' %8s %82s %10s' % ('Depot'.center(8, ' '),
+        'Range'.center(82, ' '), 'Confidence'.center(10, ' '))
     for regression in other_regressions:
-      current_id, previous_id, percent_change, deviations = regression
+      current_id, previous_id, confidence = regression
       current_data = revision_data[current_id]
       previous_data = revision_data[previous_id]

-      if deviations is None:
-        deviations = 'N/A'
-      else:
-        deviations = '%.2f' % deviations
-
-      if percent_change is None:
-        percent_change = 0
-
-      print ' %8s %s [%.2f%%, %s x std.dev]' % (
-          previous_data['depot'], previous_id, 100 * percent_change, deviations)
-      print ' %8s %s' % (current_data['depot'], current_id)
+      print ' %8s %s..%s %s' % (
+          current_data['depot'], current_id, previous_id,
+          ('%d%%' % confidence).center(10, ' '))
     print

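For illustration only, not part of the patch: the new header and row formatting above can be exercised standalone. The depot name, revision ids, and 95% confidence below are made up.

print ' %8s %82s %10s' % ('Depot'.center(8, ' '),
    'Range'.center(82, ' '), 'Confidence'.center(10, ' '))
print ' %8s %s..%s %s' % (
    'chromium', 'a1b2c3d', 'f6e5d4c',
    ('%d%%' % 95).center(10, ' '))

Each regression now prints as a single depot/range/confidence row rather than the old two-line percent-change and std-dev form.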
   def _PrintStepTime(self, revision_data_sorted):
     step_perf_time_avg = 0.0
     step_build_time_avg = 0.0
     step_count = 0.0
     for _, current_data in revision_data_sorted:
       step_perf_time_avg += current_data['perf_time']
       step_build_time_avg += current_data['build_time']
       step_count += 1
     if step_count:
       step_perf_time_avg = step_perf_time_avg / step_count
       step_build_time_avg = step_build_time_avg / step_count
     print
     print 'Average build time : %s' % datetime.timedelta(
         seconds=int(step_build_time_avg))
     print 'Average test time : %s' % datetime.timedelta(
         seconds=int(step_perf_time_avg))

   def _PrintWarnings(self):
     if not self.warnings:
       return
     print
     print 'WARNINGS:'
     for w in self.warnings:
       print ' !!! %s' % w

+  def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good):
+    other_regressions = []
+    previous_values = []
+    previous_id = None
+    for current_id, current_data in revision_data_sorted:
+      current_values = current_data['value']
+      if current_values:
+        current_values = current_values['values']
+        if previous_values:
+          confidence = self._CalculateConfidence(previous_values,
+              [current_values])
+          mean_of_prev_runs = CalculateTruncatedMean(
+              sum(previous_values, []), 0)
+          mean_of_current_runs = CalculateTruncatedMean(current_values, 0)
+
+          # Check that the potential regression is in the same direction as
+          # the overall regression. If the mean of the previous runs < the
+          # mean of the current runs, this local regression is in the same
+          # direction.
+          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
+          is_same_direction = (prev_less_than_current if
+              bad_greater_than_good else not prev_less_than_current)
+
+          # Only report potential regressions with high confidence.
+          if is_same_direction and confidence > 50:
+            other_regressions.append([current_id, previous_id, confidence])
+        previous_values.append(current_values)
+      previous_id = current_id
+    return other_regressions
+
+  def _CalculateConfidence(self, working_means, broken_means):
+    bounds_working = []
+    bounds_broken = []
+    for m in working_means:
+      current_mean = CalculateTruncatedMean(m, 0)
+      if bounds_working:
+        bounds_working[0] = min(current_mean, bounds_working[0])
+        bounds_working[1] = max(current_mean, bounds_working[1])
+      else:
+        bounds_working = [current_mean, current_mean]
+    for m in broken_means:
+      current_mean = CalculateTruncatedMean(m, 0)
+      if bounds_broken:
+        bounds_broken[0] = min(current_mean, bounds_broken[0])
+        bounds_broken[1] = max(current_mean, bounds_broken[1])
+      else:
+        bounds_broken = [current_mean, current_mean]
+    dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]),
+        math.fabs(bounds_broken[0] - bounds_working[1]))
+    working_mean = sum(working_means, [])
+    broken_mean = sum(broken_means, [])
+    len_working_group = CalculateStandardDeviation(working_mean)
+    len_broken_group = CalculateStandardDeviation(broken_mean)
+
+    confidence = (dist_between_groups / (
+        max(0.0001, (len_broken_group + len_working_group))))
+    confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
+    return confidence
+
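To make the heuristic concrete, here is a self-contained sketch of _CalculateConfidence above. CalculateTruncatedMean and CalculateStandardDeviation live elsewhere in this file; the stand-ins below assume a plain mean (truncation of 0) and a population standard deviation, so details may differ from the real helpers.

import math

def _mean(values):
  return sum(values) / float(len(values))

def _std_dev(values):
  mean = _mean(values)
  return math.sqrt(sum((v - mean) ** 2 for v in values) / float(len(values)))

def calculate_confidence(working_means, broken_means):
  # Bounds are the min and max of each group's per-run means.
  working = [_mean(m) for m in working_means]
  broken = [_mean(m) for m in broken_means]
  bounds_working = [min(working), max(working)]
  bounds_broken = [min(broken), max(broken)]
  # Distance between the two groups, divided by the combined noise of the
  # flattened samples, clamped to [0, 1] and scaled to a percentage.
  dist_between_groups = min(
      math.fabs(bounds_broken[1] - bounds_working[0]),
      math.fabs(bounds_broken[0] - bounds_working[1]))
  noise = (_std_dev(sum(working_means, [])) +
           _std_dev(sum(broken_means, [])))
  confidence = dist_between_groups / max(0.0001, noise)
  return int(min(1.0, max(confidence, 0.0)) * 100.0)

# Well-separated, low-noise groups score the maximum:
print calculate_confidence([[10.0, 10.1], [10.2, 10.3]],
                           [[20.0, 19.9], [20.1, 20.2]])  # 100

The score saturates at 100 once the gap between the group bounds exceeds the summed standard deviations of the two flattened groups.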
   def _GetResultsDict(self, revision_data, revision_data_sorted):
     # Find range where it possibly broke.
     first_working_revision = None
     first_working_revision_index = -1
     last_broken_revision = None
     last_broken_revision_index = -1

     for i in xrange(len(revision_data_sorted)):
       k, v = revision_data_sorted[i]
       if v['passed'] == 1:
         if not first_working_revision:
           first_working_revision = k
           first_working_revision_index = i

       if not v['passed']:
         last_broken_revision = k
         last_broken_revision_index = i

     if last_broken_revision != None and first_working_revision != None:
-      bounds_broken = [revision_data[last_broken_revision]['value']['mean'],
-                       revision_data[last_broken_revision]['value']['mean']]
-      broken_mean = []
+      broken_means = []
       for i in xrange(0, last_broken_revision_index + 1):
         if revision_data_sorted[i][1]['value']:
-          bounds_broken[0] = min(bounds_broken[0],
-              revision_data_sorted[i][1]['value']['mean'])
-          bounds_broken[1] = max(bounds_broken[1],
-              revision_data_sorted[i][1]['value']['mean'])
-          broken_mean.extend(revision_data_sorted[i][1]['value']['values'])
+          broken_means.append(revision_data_sorted[i][1]['value']['values'])

-      bounds_working = [revision_data[first_working_revision]['value']['mean'],
-                        revision_data[first_working_revision]['value']['mean']]
-      working_mean = []
+      working_means = []
       for i in xrange(first_working_revision_index, len(revision_data_sorted)):
         if revision_data_sorted[i][1]['value']:
-          bounds_working[0] = min(bounds_working[0],
-              revision_data_sorted[i][1]['value']['mean'])
-          bounds_working[1] = max(bounds_working[1],
-              revision_data_sorted[i][1]['value']['mean'])
-          working_mean.extend(revision_data_sorted[i][1]['value']['values'])
+          working_means.append(revision_data_sorted[i][1]['value']['values'])
+
+      # Flatten the lists to calculate mean of all values.
+      working_mean = sum(working_means, [])
+      broken_mean = sum(broken_means, [])

       # Calculate the approximate size of the regression
       mean_of_bad_runs = CalculateTruncatedMean(broken_mean, 0.0)
       mean_of_good_runs = CalculateTruncatedMean(working_mean, 0.0)

       regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) /
           max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0

       regression_std_err = math.fabs(CalculatePooledStandardError(
           [working_mean, broken_mean]) /
           max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

       # Give a "confidence" in the bisect. At the moment we use how distinct the
       # values are before and after the last broken revision, and how noisy the
       # overall graph is.
-      dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]),
-          math.fabs(bounds_broken[0] - bounds_working[1]))
-      len_working_group = CalculateStandardDeviation(working_mean)
-      len_broken_group = CalculateStandardDeviation(broken_mean)
-
-      confidence = (dist_between_groups / (
-          max(0.0001, (len_broken_group + len_working_group ))))
-      confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
+      confidence = self._CalculateConfidence(working_means, broken_means)

       culprit_revisions = []

       cwd = os.getcwd()
       self.ChangeToDepotWorkingDirectory(
           revision_data[last_broken_revision]['depot'])

       if revision_data[last_broken_revision]['depot'] == 'cros':
         # Want to get a list of all the commits and what depots they belong
         # to so that we can grab info about each.
(...skipping 30 matching lines...)
       for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
         k, v = revision_data_sorted[i]
         if k == first_working_revision:
           break
         self.ChangeToDepotWorkingDirectory(v['depot'])
         info = self.source_control.QueryRevisionInfo(k)
         culprit_revisions.append((k, info, v['depot']))
       os.chdir(cwd)

       # Check for any other possible regression ranges
-      good_std_dev = revision_data[first_working_revision]['value']['std_err']
-      good_mean = revision_data[first_working_revision]['value']['mean']
-      bad_mean = revision_data[last_broken_revision]['value']['mean']
-      prev_revision_data = revision_data_sorted[0][1]
-      prev_revision_id = revision_data_sorted[0][0]
-      other_regressions = []
-      for current_id, current_data in revision_data_sorted:
-        if current_data['value']:
-          prev_mean = prev_revision_data['value']['mean']
-          cur_mean = current_data['value']['mean']
-
-          if good_std_dev:
-            deviations = math.fabs(prev_mean - cur_mean) / good_std_dev
-          else:
-            deviations = None
-
-          if good_mean:
-            percent_change = (prev_mean - cur_mean) / good_mean
-
-            # If the "good" values are supposed to be higher than the "bad"
-            # values (i.e. scores), flip the sign of the percent change so that
-            # a positive value always represents a regression.
-            if bad_mean < good_mean:
-              percent_change *= -1.0
-          else:
-            percent_change = None
-
-          if deviations >= 1.5 or percent_change > 0.01:
-            if current_id != first_working_revision:
-              other_regressions.append(
-                  [current_id, prev_revision_id, percent_change, deviations])
-          prev_revision_data = current_data
-          prev_revision_id = current_id
+      other_regressions = self._FindOtherRegressions(revision_data_sorted,
+          mean_of_bad_runs > mean_of_good_runs)

       # Check for warnings:
       if len(culprit_revisions) > 1:
         self.warnings.append('Due to build errors, regression range could '
                              'not be narrowed down to a single commit.')
       if self.opts.repeat_test_count == 1:
         self.warnings.append('Tests were only set to run once. This may '
                              'be insufficient to get meaningful results.')
       if confidence < 100:
         if confidence:
(...skipping 423 matching lines...)
     # The perf dashboard scrapes the "results" step in order to comment on
     # bugs. If you change this, please update the perf dashboard as well.
     bisect_utils.OutputAnnotationStepStart('Results')
     print 'Error: %s' % e.message
     if opts.output_buildbot_annotations:
       bisect_utils.OutputAnnotationStepClosed()
   return 1

 if __name__ == '__main__':
   sys.exit(main())
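A final reader's aid, not part of the patch: the regression_size formula in _GetResultsDict above, worked on made-up numbers, shows how the gap between good and bad means is reported as a percentage.

import math

# Hypothetical numbers: good runs average 100.0, bad runs average 120.0.
mean_of_good_runs = 100.0
mean_of_bad_runs = 120.0
regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) /
    max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0
print regression_size  # 20.0, i.e. a 20% regression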