Chromium Code Reviews

Unified Diff: tools/bisect-perf-regression.py

Issue 209853009: Refactor perf bisect script _CalculateConfidence method. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Refactor CalculateConfidence and add a unit test. Created 6 years, 9 months ago
Index: tools/bisect-perf-regression.py
diff --git a/tools/bisect-perf-regression.py b/tools/bisect-perf-regression.py
index e15cabe1cde820eafaacf25b01994a166378511c..bb5d4a63530bbbe374ad2149ff26852a331182f6 100755
--- a/tools/bisect-perf-regression.py
+++ b/tools/bisect-perf-regression.py
@@ -164,13 +164,20 @@ def _AddAdditionalDepotInfo(depot_info):
def CalculateTruncatedMean(data_set, truncate_percent):
"""Calculates the truncated mean of a set of values.
+ Note that this isn't just the mean of the set of values with the highest
+ and lowest values discarded; the non-discarded values are also weighted
+ differently depending on how many values are discarded.
+
Args:
- data_set: Set of values to use in calculation.
- truncate_percent: The % from the upper/lower portions of the data set to
- discard, expressed as a value in [0, 1].
+ data_set: Non-empty list of values.
+ truncate_percent: The % from the upper and lower portions of the data set
+ to discard, expressed as a value in [0, 1].
Returns:
The truncated mean as a float.
+
+ Raises:
+ TypeError: The data set was empty after discarding values.
"""
if len(data_set) > 2:
data_set = sorted(data_set)
@@ -199,14 +206,61 @@ def CalculateTruncatedMean(data_set, truncate_percent):
return truncated_mean
-def CalculateStandardDeviation(v):
- if len(v) == 1:
+def CalculateMean(values):
+ """Calculates the arithmetic mean of a list of values."""
+ return CalculateTruncatedMean(values, 0.0)
+
+
+def CalculateConfidence(good_results_lists, bad_results_lists):
+ """Calculates a confidence percentage.
+
+ This is calculated based on how distinct the "good" and "bad" values are,
+ and how noisy the results are. More precisely, the confidence is the quotient
+ of the distance between the closest mean values across the good and bad
+ groups and the sum of the standard deviations of the good and bad groups.
+
+ TODO(qyearsley): Replace this confidence function with a function that
+ uses a Student's t-test. The confidence would be (1 - p-value), where
+ p-value is the probability of obtaining the given set of good and bad
+ values just by chance.
+
+ Args:
+ good_results_lists: A list of lists of "good" result numbers.
+ bad_results_lists: A list of lists of "bad" result numbers.
+
+ Returns:
+ A number in the range [0, 100].
+ """
+ # Get the distance between the two groups.
+ means_good = map(CalculateMean, good_results_lists)
+ means_bad = map(CalculateMean, bad_results_lists)
+ bounds_good = (min(means_good), max(means_good))
+ bounds_bad = (min(means_bad), max(means_bad))
+ dist_between_groups = min(
+ math.fabs(bounds_bad[1] - bounds_good[0]),
+ math.fabs(bounds_bad[0] - bounds_good[1]))
+
+ # Get the sum of the standard deviations of the two groups.
+ good_results_flattened = sum(good_results_lists, [])
+ bad_results_flattened = sum(bad_results_lists, [])
+ stddev_good = CalculateStandardDeviation(good_results_flattened)
+ stddev_bad = CalculateStandardDeviation(bad_results_flattened)
+ stddev_sum = stddev_good + stddev_bad
+
+ confidence = dist_between_groups / (max(0.0001, stddev_sum))
+ confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
+ return confidence
+
+
+def CalculateStandardDeviation(values):
+ """Calculates the sample standard deviation of the given list of values."""
+ if len(values) == 1:
return 0.0
- mean = CalculateTruncatedMean(v, 0.0)
- variances = [float(x) - mean for x in v]
- variances = [x * x for x in variances]
- variance = reduce(lambda x, y: float(x) + float(y), variances) / (len(v) - 1)
+ mean = CalculateMean(values)
+ differences_from_mean = [float(x) - mean for x in values]
+ squared_differences = [float(x * x) for x in differences_from_mean]
+ variance = sum(squared_differences) / (len(values) - 1)
std_dev = math.sqrt(variance)
return std_dev
@@ -228,13 +282,14 @@ def CalculatePooledStandardError(work_sets):
return 0.0
-def CalculateStandardError(v):
- if len(v) <= 1:
+def CalculateStandardError(values):
+ """Calculates the standard error of a list of values."""
+ if len(values) <= 1:
return 0.0
- std_dev = CalculateStandardDeviation(v)
- return std_dev / math.sqrt(len(v))
+ std_dev = CalculateStandardDeviation(values)
+ return std_dev / math.sqrt(len(values))
def IsStringFloat(string_to_check):
@@ -2766,11 +2821,9 @@ class BisectPerformanceMetrics(object):
if current_values:
current_values = current_values['values']
if previous_values:
- confidence = self._CalculateConfidence(previous_values,
- [current_values])
- mean_of_prev_runs = CalculateTruncatedMean(
- sum(previous_values, []), 0)
- mean_of_current_runs = CalculateTruncatedMean(current_values, 0)
+ confidence = CalculateConfidence(previous_values, [current_values])
+ mean_of_prev_runs = CalculateMean(sum(previous_values, []))
+ mean_of_current_runs = CalculateMean(current_values)
# Check that the potential regression is in the same direction as
# the overall regression. If the mean of the previous runs < the
@@ -2787,34 +2840,6 @@ class BisectPerformanceMetrics(object):
previous_id = current_id
return other_regressions
- def _CalculateConfidence(self, working_means, broken_means):
- bounds_working = []
- bounds_broken = []
- for m in working_means:
- current_mean = CalculateTruncatedMean(m, 0)
- if bounds_working:
- bounds_working[0] = min(current_mean, bounds_working[0])
- bounds_working[1] = max(current_mean, bounds_working[0])
- else:
- bounds_working = [current_mean, current_mean]
- for m in broken_means:
- current_mean = CalculateTruncatedMean(m, 0)
- if bounds_broken:
- bounds_broken[0] = min(current_mean, bounds_broken[0])
- bounds_broken[1] = max(current_mean, bounds_broken[0])
- else:
- bounds_broken = [current_mean, current_mean]
- dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]),
- math.fabs(bounds_broken[0] - bounds_working[1]))
- working_mean = sum(working_means, [])
- broken_mean = sum(broken_means, [])
- len_working_group = CalculateStandardDeviation(working_mean)
- len_broken_group = CalculateStandardDeviation(broken_mean)
-
- confidence = (dist_between_groups / (
- max(0.0001, (len_broken_group + len_working_group ))))
- confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
- return confidence
def _GetResultsDict(self, revision_data, revision_data_sorted):
# Find range where it possibly broke.
@@ -2850,8 +2875,8 @@ class BisectPerformanceMetrics(object):
broken_mean = sum(broken_means, [])
# Calculate the approximate size of the regression
- mean_of_bad_runs = CalculateTruncatedMean(broken_mean, 0.0)
- mean_of_good_runs = CalculateTruncatedMean(working_mean, 0.0)
+ mean_of_bad_runs = CalculateMean(broken_mean)
+ mean_of_good_runs = CalculateMean(working_mean)
regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) /
max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0
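
As a worked illustration of the new module-level function (not part of the patch), the sketch below re-implements the CalculateConfidence logic shown above using hypothetical helper names (_mean, _sample_stddev, confidence): the distance between the closest per-list means of the good and bad groups is divided by the sum of the two groups' sample standard deviations, clamped to [0, 1], and scaled to a percentage.

# Illustrative sketch only -- not part of the patch. Re-implements the
# patched CalculateConfidence logic with hypothetical helper names.
import math


def _mean(values):
  return sum(values) / float(len(values))


def _sample_stddev(values):
  if len(values) <= 1:
    return 0.0
  mean = _mean(values)
  variance = sum((x - mean) ** 2 for x in values) / (len(values) - 1)
  return math.sqrt(variance)


def confidence(good_results_lists, bad_results_lists):
  means_good = [_mean(v) for v in good_results_lists]
  means_bad = [_mean(v) for v in bad_results_lists]
  # Distance between the closest per-list means of the two groups.
  dist_between_groups = min(
      math.fabs(max(means_bad) - min(means_good)),
      math.fabs(min(means_bad) - max(means_good)))
  # Noise estimate: sum of the sample standard deviations of the two
  # flattened groups.
  stddev_sum = (_sample_stddev(sum(good_results_lists, [])) +
                _sample_stddev(sum(bad_results_lists, [])))
  ratio = dist_between_groups / max(0.0001, stddev_sum)
  return int(min(1.0, max(ratio, 0.0)) * 100.0)


# Good means 20.0, bad means 21.5 (distance 1.5); each group's sample
# stddev is 1.0 (sum 2.0), so the result is int(0.75 * 100) = 75.
print(confidence([[19.0, 20.0, 21.0]], [[20.5, 21.5, 22.5]]))  # prints 75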