Index: tools/bisect-perf-regression.py
diff --git a/tools/bisect-perf-regression.py b/tools/bisect-perf-regression.py
index 41b999a07078523d22e0b24596049b0be61055b3..363aa1b5ffb36b814e85d49d35a2e1b40176448a 100755
--- a/tools/bisect-perf-regression.py
+++ b/tools/bisect-perf-regression.py
@@ -171,6 +171,9 @@ MAX_MAC_BUILD_TIME = 14400
MAX_WIN_BUILD_TIME = 14400
MAX_LINUX_BUILD_TIME = 14400
+# The confidence percentage at or above which confidence is considered "high".
+HIGH_CONFIDENCE = 95
+
# Patch template to add a new file, DEPS.sha under src folder.
# This file contains SHA1 value of the DEPS changes made while bisecting
# dependency repositories. This patch send along with DEPS patch to tryserver.
@@ -191,9 +194,9 @@ BISECT_MODE_MEAN = 'mean'
BISECT_MODE_STD_DEV = 'std_dev'
BISECT_MODE_RETURN_CODE = 'return_code'
-# The perf dashboard specifically looks for the string
-# "Estimated Confidence: 95%" to decide whether or not to cc the author(s).
-# If you change this, please update the perf dashboard as well.
+# The perf dashboard looks for a string like "Estimated Confidence: 95%"
+# to decide whether or not to cc the author(s). If you change this, please
+# update the perf dashboard as well.
RESULTS_BANNER = """
===== BISECT JOB RESULTS =====
Status: %(status)s
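Note: the dashboard-side parsing is not part of this patch. Purely as a hypothetical sketch of why the exact "Estimated Confidence: NN%" wording matters, a consumer could detect the line with a regular expression along these lines (sample_output and the regex are illustrative assumptions, not the dashboard's actual code):

import re

# Hypothetical sketch only; not taken from the perf dashboard source.
sample_output = 'Estimated Confidence: 95%'
match = re.search(r'Estimated Confidence:\s*(\d+(?:\.\d+)?)%', sample_output)
if match and float(match.group(1)) >= 95:
    print('would cc the author(s)')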
@@ -280,12 +283,18 @@ def ConfidenceScore(good_results_lists, bad_results_lists):
Returns:
A number in the range [0, 100].
"""
- if not good_results_lists or not bad_results_lists:
+ # If there's only one item in either list, this means only one revision was
+ # classified good or bad; this isn't good enough evidence to make a decision.
+ # If an empty list was passed, that also implies zero confidence.
+ if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
return 0.0
# Flatten the lists of results lists.
sample1 = sum(good_results_lists, [])
sample2 = sum(bad_results_lists, [])
+
+ # If either list contained only empty lists (which is unexpected and normally
+ # shouldn't happen), we also want to return 0.
if not sample1 or not sample2:
return 0.0
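As a rough illustration of the new guard (the values below are made up, not from the patch): a single classified-bad revision is now rejected before any statistics are computed, and sum(list_of_lists, []) is what flattens the per-revision result lists.

# Hypothetical inputs, for illustration only.
good = [[10.1, 10.2], [10.0, 10.3]]  # results from two revisions classified good
bad = [[12.4, 12.6]]                 # results from only one revision classified bad

# len(bad) <= 1, so the new check makes ConfidenceScore(good, bad) return 0.0
# before any statistics are computed.

# Flattening with sum(list_of_lists, []) simply concatenates the sublists:
sample1 = sum(good, [])
print(sample1)  # [10.1, 10.2, 10.0, 10.3]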
@@ -2889,7 +2898,7 @@ class BisectPerformanceMetrics(object):
if not results_dict['confidence']:
return None
confidence_status = 'Successful with %(level)s confidence%(warning)s.'
- if results_dict['confidence'] >= 95:
+ if results_dict['confidence'] >= HIGH_CONFIDENCE:
level = 'high'
else:
level = 'low'
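A small sketch of how the status line comes out on either side of the threshold; the confidence values are hypothetical, and HIGH_CONFIDENCE is repeated here only so the snippet stands alone:

HIGH_CONFIDENCE = 95  # repeated so the sketch is self-contained

confidence_status = 'Successful with %(level)s confidence%(warning)s.'
for confidence in (99.9, 80.0):  # hypothetical confidence scores
    level = 'high' if confidence >= HIGH_CONFIDENCE else 'low'
    print(confidence_status % {'level': level, 'warning': ''})
# -> Successful with high confidence.
# -> Successful with low confidence.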
@@ -3173,18 +3182,13 @@ class BisectPerformanceMetrics(object):
if self.opts.repeat_test_count == 1:
self.warnings.append('Tests were only set to run once. This may '
'be insufficient to get meaningful results.')
- if results_dict['confidence'] < 100:
- if results_dict['confidence']:
- self.warnings.append(
- 'Confidence is less than 100%. There could be other candidates '
- 'for this regression. Try bisecting again with increased '
- 'repeat_count or on a sub-metric that shows the regression more '
- 'clearly.')
- else:
- self.warnings.append(
- 'Confidence is 0%. Try bisecting again on another platform, with '
- 'increased repeat_count or on a sub-metric that shows the '
- 'regression more clearly.')
+ if 0 < results_dict['confidence'] < HIGH_CONFIDENCE:
+ self.warnings.append('Confidence is not high. Try bisecting again '
+ 'with increased repeat_count, larger range, or '
+ 'on another metric.')
+ if not results_dict['confidence']:
+ self.warnings.append('Confidence score is 0%. Try bisecting again on '
+ 'another platform or another metric.')
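To make the new branching concrete, here is a standalone sketch of which warning, if any, each confidence range now produces; warning_for is an illustrative helper, not part of the patch, and HIGH_CONFIDENCE is repeated so the snippet runs on its own:

HIGH_CONFIDENCE = 95  # repeated so the sketch is self-contained

def warning_for(confidence):
    # Mirrors the new logic: one warning for low-but-nonzero confidence,
    # another for zero confidence, none at or above HIGH_CONFIDENCE.
    if 0 < confidence < HIGH_CONFIDENCE:
        return 'Confidence is not high. ...'
    if not confidence:
        return 'Confidence score is 0%. ...'
    return None

for c in (0.0, 42.0, 99.5):  # hypothetical confidence scores
    print(c, warning_for(c))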
def FormatAndPrintResults(self, bisect_results):
"""Prints the results from a bisection run in a readable format.