Chromium Code Reviews| Index: tools/auto_bisect/bisect_perf_regression.py |
| diff --git a/tools/auto_bisect/bisect_perf_regression.py b/tools/auto_bisect/bisect_perf_regression.py |
| index f90074f4d7533adda3937ef5e8740afa4898efef..becd04dde7281df5a302a74826d028454cb37f21 100755 |
| --- a/tools/auto_bisect/bisect_perf_regression.py |
| +++ b/tools/auto_bisect/bisect_perf_regression.py |
| @@ -75,6 +75,10 @@ MAX_MAC_BUILD_TIME = 14400 |
| MAX_WIN_BUILD_TIME = 14400 |
| MAX_LINUX_BUILD_TIME = 14400 |
| +# The confidence percentage we require to consider the initial range a |
| +# regression based on the test results of the initial good and bad revisions. |
| +REGRESSION_CONFIDENCE = 95 |
| + |
| # Patch template to add a new file, DEPS.sha under src folder. |
| # This file contains SHA1 value of the DEPS changes made while bisecting |
| # dependency repositories. This patch send along with DEPS patch to try server. |
| @@ -89,6 +93,23 @@ new file mode 100644 |
| +%(deps_sha)s |
| """ |
| +REGRESSION_CONFIDENCE_ERROR_TEMPLATE = """ |
| +We could not reproduce the regression with this test/metric/platform combination |
| +with enough confidence. |
| + |
| +Here are the results for the initial revision range: |
| +\'Good\' revision: {} |
|
qyearsley
2014/10/23 00:38:39
Giving names to each of these template fields would make the format call easier to read.
RobertoCN
2014/10/23 19:51:45
Done.
|
| +\tmean: {} |
| +\tstd.err.:{} |
| +\tsample size:{} |
| +\'Bad\' revision: {} |
| +\tmean: {} |
| +\tstd.err.:{} |
| +\tsample size:{} |
| + |
| +NOTE: There\'s still a chance that this is actually a regression, but you may |
|
qyearsley
2014/10/23 00:38:39
Escaping single quotes isn't necessary inside a "-quoted string.
RobertoCN
2014/10/23 19:51:45
Done.
|
| + need to bisect a different platform.""" |
| + |
| # Git branch name used to run bisect try jobs. |
| BISECT_TRYJOB_BRANCH = 'bisect-tryjob' |
| # Git master branch name. |
| @@ -2217,6 +2238,25 @@ class BisectPerformanceMetrics(object): |
| min_revision = 0 |
| max_revision = len(revision_states) - 1 |
| + # Check how likely it is that the good and bad results are different |
| + # beyond chance-induced variation. |
| + if not self.opts.debug_ignore_regression_confidence: |
| + # Adding good and bad values to a parameter list. |
| + confidenceParams = [] |
| + for l in [known_bad_value['values'], known_good_value['values']]: |
| + # Flatten if needed |
| + if isinstance(l, list) and all([isinstance(x, list) for x in l]): |
| + confidenceParams.append(sum(l, [])) |
| + else: |
| + confidenceParams.append(l) |
| + regression_confidence = BisectResults.ConfidenceScore(*confidenceParams) |
| + if regression_confidence < REGRESSION_CONFIDENCE: |
| + error = REGRESSION_CONFIDENCE_ERROR_TEMPLATE.format( |
| + good_revision, known_good_value['mean'], |
| + known_good_value['std_err'], len(known_good_value['values']), |
| + bad_revision, known_bad_value['mean'], |
| + known_bad_value['std_err'], len(known_bad_value['values'])) |
| + return BisectResults(error=error) |
|
qyearsley
2014/10/23 00:38:38
Might be a good idea to extract everything under this check into a separate method.
RobertoCN
2014/10/23 19:51:45
Done.
|
| # Can just mark the good and bad revisions explicitly here since we |
| # already know the results. |
| @@ -2425,6 +2465,7 @@ class BisectOptions(object): |
| self.debug_ignore_build = None |
| self.debug_ignore_sync = None |
| self.debug_ignore_perf_test = None |
| + self.debug_ignore_regression_confidence = None |
| self.debug_fake_first_test_mean = 0 |
| self.gs_bucket = None |
| self.target_arch = 'ia32' |
| @@ -2593,6 +2634,10 @@ class BisectOptions(object): |
| group.add_option('--debug_ignore_perf_test', |
| action='store_true', |
| help='DEBUG: Don\'t perform performance tests.') |
| + group.add_option('--debug_ignore_regression_confidence', |
| + action='store_true', |
| + help='DEBUG: Don\'t score the confidence of the initial ' |
| + 'good and bad revisions\' test results.') |
| group.add_option('--debug_fake_first_test_mean', |
| type='int', |
| default='0', |