| Index: tools/auto_bisect/bisect_perf_regression.py
|
| diff --git a/tools/auto_bisect/bisect_perf_regression.py b/tools/auto_bisect/bisect_perf_regression.py
|
| index f90074f4d7533adda3937ef5e8740afa4898efef..0fd8534749581101e0ec94ce684887411bbda268 100755
|
| --- a/tools/auto_bisect/bisect_perf_regression.py
|
| +++ b/tools/auto_bisect/bisect_perf_regression.py
|
| @@ -75,6 +75,10 @@ MAX_MAC_BUILD_TIME = 14400
|
| MAX_WIN_BUILD_TIME = 14400
|
| MAX_LINUX_BUILD_TIME = 14400
|
|
|
| +# The confidence percentage we require to consider the initial range a
|
| +# regression based on the test results of the inital good and bad revisions.
|
| +REGRESSION_CONFIDENCE = 95
|
| +
|
| # Patch template to add a new file, DEPS.sha under src folder.
|
| # This file contains SHA1 value of the DEPS changes made while bisecting
|
| # dependency repositories. This patch send along with DEPS patch to try server.
|
| @@ -89,6 +93,23 @@ new file mode 100644
|
| +%(deps_sha)s
|
| """
|
|
|
| +REGRESSION_CONFIDENCE_ERROR_TEMPLATE = """
|
| +We could not reproduce the regression with this test/metric/platform combination
|
| +with enough confidence.
|
| +
|
| +Here are the results for the initial revision range:
|
| +'Good' revision: {good_rev}
|
| +\tmean: {good_mean}
|
| +\tstd.err.:{good_std_err}
|
| +\tsample size:{good_sample_size}
|
| +'Bad' revision: {bad_rev}
|
| +\tmean: {bad_mean}
|
| +\tstd.err.:{bad_std_err}
|
| +\tsample size:{bad_sample_size}
|
| +
|
| +NOTE: There's still a chance that this is actually a regression, but you may
|
| + need to bisect a different platform."""
|
| +
|
| # Git branch name used to run bisect try jobs.
|
| BISECT_TRYJOB_BRANCH = 'bisect-tryjob'
|
| # Git master branch name.
|
| @@ -589,6 +610,46 @@ def _GenerateProfileIfNecessary(command_args):
|
| return True
|
|
|
|
|
| +def _CheckRegressionConfidenceError(
|
| + good_revision,
|
| + bad_revision,
|
| + known_good_value,
|
| + known_bad_value):
|
| + """Checks whether we can be confident beyond a certain degree that the given
|
| + metrics represent a regression.
|
| +
|
| + Args:
|
| + good_revision: string representing the commit considered 'good'
|
| + bad_revision: Same as above for 'bad'.
|
| + known_good_value: A dict with at least: 'values', 'mean' and 'std_err'
|
| + known_bad_value: Same as above.
|
| +
|
| + Returns:
|
| + False if there is no error (i.e. we can be confident there's a regressioni),
|
| + a string containing the details of the lack of confidence otherwise.
|
| + """
|
| + error = False
|
| + # Adding good and bad values to a parameter list.
|
| + confidenceParams = []
|
| + for l in [known_bad_value['values'], known_good_value['values']]:
|
| + # Flatten if needed
|
| + if isinstance(l, list) and all([isinstance(x, list) for x in l]):
|
| + confidenceParams.append(sum(l, []))
|
| + else:
|
| + confidenceParams.append(l)
|
| + regression_confidence = BisectResults.ConfidenceScore(*confidenceParams)
|
| + if regression_confidence < REGRESSION_CONFIDENCE:
|
| + error = REGRESSION_CONFIDENCE_ERROR_TEMPLATE.format(
|
| + good_rev=good_revision,
|
| + good_mean=known_good_value['mean'],
|
| + good_std_err=known_good_value['std_err'],
|
| + good_sample_size=len(known_good_value['values']),
|
| + bad_rev=bad_revision,
|
| + bad_mean=known_bad_value['mean'],
|
| + bad_std_err=known_bad_value['std_err'],
|
| + bad_sample_size=len(known_bad_value['values']))
|
| + return error
|
| +
|
| class DepotDirectoryRegistry(object):
|
|
|
| def __init__(self, src_cwd):
|
| @@ -2217,6 +2278,15 @@ class BisectPerformanceMetrics(object):
|
|
|
| min_revision = 0
|
| max_revision = len(revision_states) - 1
|
| + # Check how likely it is that the good and bad results are different
|
| + # beyond chance-induced variation.
|
| + if not self.opts.debug_ignore_regression_confidence:
|
| + error = _CheckRegressionConfidenceError(good_revision,
|
| + bad_revision,
|
| + known_good_value,
|
| + known_bad_value)
|
| + if error:
|
| + return BisectResults(error=error)
|
|
|
| # Can just mark the good and bad revisions explicitly here since we
|
| # already know the results.
|
| @@ -2425,6 +2495,7 @@ class BisectOptions(object):
|
| self.debug_ignore_build = None
|
| self.debug_ignore_sync = None
|
| self.debug_ignore_perf_test = None
|
| + self.debug_ignore_regression_confidence = None
|
| self.debug_fake_first_test_mean = 0
|
| self.gs_bucket = None
|
| self.target_arch = 'ia32'
|
| @@ -2593,6 +2664,10 @@ class BisectOptions(object):
|
| group.add_option('--debug_ignore_perf_test',
|
| action='store_true',
|
| help='DEBUG: Don\'t perform performance tests.')
|
| + group.add_option('--debug_ignore_regression_confidence',
|
| + action='store_true',
|
| + help='DEBUG: Don\'t score the confidence of the initial '
|
| + 'good and bad revisions\' test results.')
|
| group.add_option('--debug_fake_first_test_mean',
|
| type='int',
|
| default='0',
|
|
|