Index: tools/auto_bisect/bisect_perf_regression.py
diff --git a/tools/auto_bisect/bisect_perf_regression.py b/tools/auto_bisect/bisect_perf_regression.py
index f90074f4d7533adda3937ef5e8740afa4898efef..becd04dde7281df5a302a74826d028454cb37f21 100755
--- a/tools/auto_bisect/bisect_perf_regression.py
+++ b/tools/auto_bisect/bisect_perf_regression.py
@@ -75,6 +75,10 @@ MAX_MAC_BUILD_TIME = 14400
 MAX_WIN_BUILD_TIME = 14400
 MAX_LINUX_BUILD_TIME = 14400
+# The confidence percentage we require to consider the initial range a
+# regression based on the test results of the initial good and bad revisions.
+REGRESSION_CONFIDENCE = 95
+
 # Patch template to add a new file, DEPS.sha, under the src folder. This file
 # contains the SHA1 of the DEPS changes made while bisecting dependency
 # repositories. This patch is sent with the DEPS patch to the try server.
@@ -89,6 +93,23 @@ new file mode 100644
+%(deps_sha)s
 """
+REGRESSION_CONFIDENCE_ERROR_TEMPLATE = """
+We could not reproduce the regression with this test/metric/platform combination
+with enough confidence.
+
+Here are the results for the initial revision range:
+'Good' revision: {good_rev}
qyearsley 2014/10/23 00:38:39:
Giving names to each of these template fields would make this more readable.

RobertoCN 2014/10/23 19:51:45:
Done.
+\tmean: {good_mean}
+\tstd.err.: {good_std_err}
+\tsample size: {good_sample_size}
+'Bad' revision: {bad_rev}
+\tmean: {bad_mean}
+\tstd.err.: {bad_std_err}
+\tsample size: {bad_sample_size}
+
+NOTE: There's still a chance that this is actually a regression, but you may
qyearsley 2014/10/23 00:38:39:
Escaping single quotes isn't necessary inside a """-quoted string.

RobertoCN 2014/10/23 19:51:45:
Done.
+ need to bisect a different platform."""
+
 # Git branch name used to run bisect try jobs.
 BISECT_TRYJOB_BRANCH = 'bisect-tryjob'
 # Git master branch name.
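Note: with named template fields, the format call can pass keyword arguments,
which makes each value's destination explicit. A minimal illustration of
filling in the template above (all values here are made up):

    print(REGRESSION_CONFIDENCE_ERROR_TEMPLATE.format(
        good_rev='abcdef0', good_mean=10.0, good_std_err=0.1,
        good_sample_size=20, bad_rev='1234567', bad_mean=10.4,
        bad_std_err=0.3, bad_sample_size=20))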
@@ -2217,6 +2238,29 @@ class BisectPerformanceMetrics(object):
     min_revision = 0
     max_revision = len(revision_states) - 1
+    # Check how likely it is that the good and bad results are different
+    # beyond chance-induced variation.
+    if not self.opts.debug_ignore_regression_confidence:
+      # Add the good and bad values to a parameter list.
+      confidence_params = []
+      for vals in (known_bad_value['values'], known_good_value['values']):
+        # Flatten nested lists of per-run values if needed.
+        if isinstance(vals, list) and all(isinstance(x, list) for x in vals):
+          confidence_params.append(sum(vals, []))
+        else:
+          confidence_params.append(vals)
+      regression_confidence = BisectResults.ConfidenceScore(*confidence_params)
+      if regression_confidence < REGRESSION_CONFIDENCE:
+        error = REGRESSION_CONFIDENCE_ERROR_TEMPLATE.format(
+            good_rev=good_revision,
+            good_mean=known_good_value['mean'],
+            good_std_err=known_good_value['std_err'],
+            good_sample_size=len(known_good_value['values']),
+            bad_rev=bad_revision,
+            bad_mean=known_bad_value['mean'],
+            bad_std_err=known_bad_value['std_err'],
+            bad_sample_size=len(known_bad_value['values']))
+        return BisectResults(error=error)
qyearsley 2014/10/23 00:38:38:
Might be a good idea to extract everything under this if statement into a
separate function.

RobertoCN 2014/10/23 19:51:45:
Done.
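As suggested in the thread above, the confidence check could live in its own
helper on BisectPerformanceMetrics. A sketch of what that extraction might
look like (the method name and exact placement are hypothetical, not
necessarily what landed in a later patch set):

    def _CheckRegressionConfidence(self, good_revision, bad_revision,
                                   known_good_value, known_bad_value):
      """Returns an error message if the initial range is not a likely
      regression, or None if the confidence score meets the threshold."""
      confidence_params = []
      for vals in (known_bad_value['values'], known_good_value['values']):
        # Flatten one level of nesting: sum(list_of_lists, []) concatenates
        # the inner lists, e.g. sum([[1, 2], [3]], []) -> [1, 2, 3].
        if isinstance(vals, list) and all(isinstance(x, list) for x in vals):
          confidence_params.append(sum(vals, []))
        else:
          confidence_params.append(vals)
      confidence = BisectResults.ConfidenceScore(*confidence_params)
      if confidence >= REGRESSION_CONFIDENCE:
        return None
      return REGRESSION_CONFIDENCE_ERROR_TEMPLATE.format(
          good_rev=good_revision,
          good_mean=known_good_value['mean'],
          good_std_err=known_good_value['std_err'],
          good_sample_size=len(known_good_value['values']),
          bad_rev=bad_revision,
          bad_mean=known_bad_value['mean'],
          bad_std_err=known_bad_value['std_err'],
          bad_sample_size=len(known_bad_value['values']))

The call site in the bisect loop would then shrink to:

    if not self.opts.debug_ignore_regression_confidence:
      error = self._CheckRegressionConfidence(
          good_revision, bad_revision, known_good_value, known_bad_value)
      if error:
        return BisectResults(error=error)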
     # Can just mark the good and bad revisions explicitly here since we
     # already know the results.
@@ -2425,6 +2465,7 @@ class BisectOptions(object):
     self.debug_ignore_build = None
     self.debug_ignore_sync = None
     self.debug_ignore_perf_test = None
+    self.debug_ignore_regression_confidence = None
     self.debug_fake_first_test_mean = 0
     self.gs_bucket = None
     self.target_arch = 'ia32'
@@ -2593,6 +2634,10 @@ class BisectOptions(object):
     group.add_option('--debug_ignore_perf_test',
                      action='store_true',
                      help='DEBUG: Don\'t perform performance tests.')
+    group.add_option('--debug_ignore_regression_confidence',
+                     action='store_true',
+                     help='DEBUG: Don\'t score the confidence of the initial '
+                          'good and bad revisions\' test results.')
     group.add_option('--debug_fake_first_test_mean',
                      type='int',
                      default='0',
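For context on the confidence gate in the bisect loop above: this diff calls
BisectResults.ConfidenceScore but does not show its implementation. A minimal
sketch of how such a score could be computed with Welch's t-test (this
stand-in uses SciPy and is not the actual Chromium implementation):

    from scipy import stats

    def confidence_score(good_values, bad_values):
      """Returns 0-100; higher means the samples more likely truly differ."""
      # equal_var=False selects Welch's t-test, which does not assume the
      # two samples have equal variance.
      _, p_value = stats.ttest_ind(good_values, bad_values, equal_var=False)
      return 100 * (1.0 - p_value)

    # Clearly separated samples score near 100; overlapping samples score
    # lower and would fall below a REGRESSION_CONFIDENCE threshold of 95.
    print(confidence_score([10.1, 9.9, 10.0, 10.2], [15.2, 14.8, 15.1, 15.3]))
    print(confidence_score([10.1, 9.9, 10.4, 10.2], [10.3, 9.8, 10.5, 10.0]))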