| Index: tools/bisect-perf-regression.py
|
| diff --git a/tools/bisect-perf-regression.py b/tools/bisect-perf-regression.py
|
| index e15cabe1cde820eafaacf25b01994a166378511c..af248332fbc20635bcd24a1aea589dc7e3e41ed5 100755
|
| --- a/tools/bisect-perf-regression.py
|
| +++ b/tools/bisect-perf-regression.py
|
| @@ -164,13 +164,20 @@ def _AddAdditionalDepotInfo(depot_info):
|
| def CalculateTruncatedMean(data_set, truncate_percent):
|
| """Calculates the truncated mean of a set of values.
|
|
|
| + Note that this isn't just the mean of the set of values with the highest
|
| + and lowest values discarded; the non-discarded values are also weighted
|
| + differently depending on how many values are discarded.
|
| +
|
| Args:
|
| - data_set: Set of values to use in calculation.
|
| - truncate_percent: The % from the upper/lower portions of the data set to
|
| - discard, expressed as a value in [0, 1].
|
| + data_set: Non-empty list of values.
|
| + truncate_percent: The % from the upper and lower portions of the data set
|
| + to discard, expressed as a value in [0, 1].
|
|
|
| Returns:
|
| The truncated mean as a float.
|
| +
|
| + Raises:
|
| + TypeError: The data set was empty after discarding values.
|
| """
|
| if len(data_set) > 2:
|
| data_set = sorted(data_set)
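The weighting note added to the docstring can be illustrated with a small standalone sketch (the script's own implementation body is not part of this hunk, so this is only an illustration of the described behaviour): when the amount to trim from each end is fractional, the outermost surviving values keep only a fractional weight.

    import math

    def truncated_mean_sketch(data_set, truncate_percent):
        # Illustrative only: drop `truncate_percent` of the values from each
        # end; a fractional remainder is handled by down-weighting the
        # outermost surviving values rather than dropping them outright.
        data = sorted(float(x) for x in data_set)
        n = len(data)
        trim = n * truncate_percent      # amount to drop per end, may be fractional
        whole = int(math.floor(trim))    # whole values dropped outright
        frac = trim - whole              # remainder handled by weighting
        kept = data[whole:n - whole]
        weights = [1.0] * len(kept)
        if frac > 0 and len(kept) >= 2:
            weights[0] = weights[-1] = 1.0 - frac
        total_weight = sum(weights)      # total weight of the kept values
        return sum(v * w for v, w in zip(kept, weights)) / total_weight

    # Trimming 10% from each end of six values drops 0.6 of a value per side,
    # so the minimum and maximum each keep weight 0.4.
    print(truncated_mean_sketch([1, 2, 3, 4, 5, 100], 0.10))  # ~11.33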
|
| @@ -199,14 +206,61 @@ def CalculateTruncatedMean(data_set, truncate_percent):
|
| return truncated_mean
|
|
|
|
|
| -def CalculateStandardDeviation(v):
|
| - if len(v) == 1:
|
| +def CalculateMean(values):
|
| + """Calculates the arithmetic mean of a list of values."""
|
| + return CalculateTruncatedMean(values, 0.0)
|
| +
|
| +
|
| +def CalculateConfidence(good_results_lists, bad_results_lists):
|
| + """Calculates a confidence percentage.
|
| +
|
| + This is calculated based on how distinct the "good" and "bad" values are,
|
| + and how noisy the results are. More precisely, the confidence is the quotient
|
| + of the distance between the closest mean values of the good and bad groups
|
| + and the sum of the standard deviations of the good and bad groups.
|
| +
|
| + TODO(qyearsley): Replace this confidence function with a function that
|
| + uses a Student's t-test. The confidence would be (1 - p-value), where
|
| + p-value is the probability of obtaining the given set of good and bad
|
| + values just by chance.
|
| +
|
| + Args:
|
| + good_results_lists: A list of lists of "good" result numbers.
|
| + bad_results_lists: A list of lists of "bad" result numbers.
|
| +
|
| + Returns:
|
| + A number in the range [0, 100].
|
| + """
|
| + # Get the distance between the two groups.
|
| + means_good = map(CalculateMean, good_results_lists)
|
| + means_bad = map(CalculateMean, bad_results_lists)
|
| + bounds_good = (min(means_good), max(means_good))
|
| + bounds_bad = (min(means_bad), max(means_bad))
|
| + dist_between_groups = min(
|
| + math.fabs(bounds_bad[1] - bounds_good[0]),
|
| + math.fabs(bounds_bad[0] - bounds_good[1]))
|
| +
|
| + # Get the sum of the standard deviations of the two groups.
|
| + good_results_flattened = sum(good_results_lists, [])
|
| + bad_results_flattened = sum(bad_results_lists, [])
|
| + stddev_good = CalculateStandardDeviation(good_results_flattened)
|
| + stddev_bad = CalculateStandardDeviation(bad_results_flattened)
|
| + stddev_sum = stddev_good + stddev_bad
|
| +
|
| + confidence = dist_between_groups / (max(0.0001, stddev_sum))
|
| + confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
|
| + return confidence
|
| +
|
| +
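The TODO above suggests replacing the quotient-based confidence with a Student's t-test. A rough sketch of that idea, assuming SciPy were available (it is not a dependency of this script), with a hypothetical helper name:

    from scipy import stats  # hypothetical dependency, not used by the script

    def ttest_confidence_sketch(good_values, bad_values):
        # Welch's t-test on the flattened good/bad samples; confidence is
        # (1 - p-value) expressed as a percentage, as described in the TODO.
        _, p_value = stats.ttest_ind(good_values, bad_values, equal_var=False)
        return 100.0 * (1.0 - p_value)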
|
| +def CalculateStandardDeviation(values):
|
| + """Calculates the sample standard deviation of the given list of values."""
|
| + if len(values) == 1:
|
| return 0.0
|
|
|
| - mean = CalculateTruncatedMean(v, 0.0)
|
| - variances = [float(x) - mean for x in v]
|
| - variances = [x * x for x in variances]
|
| - variance = reduce(lambda x, y: float(x) + float(y), variances) / (len(v) - 1)
|
| + mean = CalculateMean(values)
|
| + differences_from_mean = [float(x) - mean for x in values]
|
| + squared_differences = [float(x * x) for x in differences_from_mean]
|
| + variance = sum(squared_differences) / (len(values) - 1)
|
| std_dev = math.sqrt(variance)
|
|
|
| return std_dev
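To make the confidence quotient concrete, here is a worked example using a standalone mirror of the arithmetic in CalculateConfidence and CalculateStandardDeviation above (the mirror is only for illustration):

    import math

    def confidence_sketch(good_lists, bad_lists):
        def mean(xs):
            return sum(xs) / float(len(xs))

        def sample_stddev(xs):
            if len(xs) <= 1:
                return 0.0
            m = mean(xs)
            return math.sqrt(sum((x - m) ** 2 for x in xs) / (len(xs) - 1))

        means_good = [mean(l) for l in good_lists]
        means_bad = [mean(l) for l in bad_lists]
        dist = min(abs(max(means_bad) - min(means_good)),
                   abs(min(means_bad) - max(means_good)))
        spread = (sample_stddev(sum(good_lists, [])) +
                  sample_stddev(sum(bad_lists, [])))
        return int(min(1.0, max(dist / max(0.0001, spread), 0.0)) * 100.0)

    # Clearly separated, low-noise groups saturate at 100...
    print(confidence_sketch([[10.0, 10.2], [9.9, 10.1]],
                            [[12.0, 12.2], [11.9, 12.1]]))  # 100
    # ...while overlapping, noisier groups score lower.
    print(confidence_sketch([[10.0, 11.0]], [[11.0, 12.0]]))  # 70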
|
| @@ -228,13 +282,14 @@ def CalculatePooledStandardError(work_sets):
|
| return 0.0
|
|
|
|
|
| -def CalculateStandardError(v):
|
| - if len(v) <= 1:
|
| +def CalculateStandardError(values):
|
| + """Calculates the standard error of a list of values."""
|
| + if len(values) <= 1:
|
| return 0.0
|
|
|
| - std_dev = CalculateStandardDeviation(v)
|
| + std_dev = CalculateStandardDeviation(values)
|
|
|
| - return std_dev / math.sqrt(len(v))
|
| + return std_dev / math.sqrt(len(values))
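As a quick numeric check of the standard-deviation and standard-error helpers above (the same arithmetic, shown standalone):

    import math

    values = [1.0, 2.0, 3.0, 4.0, 5.0]
    mean = sum(values) / len(values)                 # 3.0
    variance = sum((x - mean) ** 2
                   for x in values) / (len(values) - 1)  # 10.0 / 4 = 2.5
    std_dev = math.sqrt(variance)                    # sqrt(2.5) ~= 1.581
    std_err = std_dev / math.sqrt(len(values))       # 1.581 / sqrt(5) ~= 0.707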
|
|
|
|
|
| def IsStringFloat(string_to_check):
|
| @@ -2766,11 +2821,9 @@ class BisectPerformanceMetrics(object):
|
| if current_values:
|
| current_values = current_values['values']
|
| if previous_values:
|
| - confidence = self._CalculateConfidence(previous_values,
|
| - [current_values])
|
| - mean_of_prev_runs = CalculateTruncatedMean(
|
| - sum(previous_values, []), 0)
|
| - mean_of_current_runs = CalculateTruncatedMean(current_values, 0)
|
| + confidence = CalculateConfidence(previous_values, [current_values])
|
| + mean_of_prev_runs = CalculateMean(sum(previous_values, []))
|
| + mean_of_current_runs = CalculateMean(current_values)
|
|
|
| # Check that the potential regression is in the same direction as
|
| # the overall regression. If the mean of the previous runs < the
|
| @@ -2787,34 +2840,6 @@ class BisectPerformanceMetrics(object):
|
| previous_id = current_id
|
| return other_regressions
|
|
|
| - def _CalculateConfidence(self, working_means, broken_means):
|
| - bounds_working = []
|
| - bounds_broken = []
|
| - for m in working_means:
|
| - current_mean = CalculateTruncatedMean(m, 0)
|
| - if bounds_working:
|
| - bounds_working[0] = min(current_mean, bounds_working[0])
|
| - bounds_working[1] = max(current_mean, bounds_working[0])
|
| - else:
|
| - bounds_working = [current_mean, current_mean]
|
| - for m in broken_means:
|
| - current_mean = CalculateTruncatedMean(m, 0)
|
| - if bounds_broken:
|
| - bounds_broken[0] = min(current_mean, bounds_broken[0])
|
| - bounds_broken[1] = max(current_mean, bounds_broken[0])
|
| - else:
|
| - bounds_broken = [current_mean, current_mean]
|
| - dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]),
|
| - math.fabs(bounds_broken[0] - bounds_working[1]))
|
| - working_mean = sum(working_means, [])
|
| - broken_mean = sum(broken_means, [])
|
| - len_working_group = CalculateStandardDeviation(working_mean)
|
| - len_broken_group = CalculateStandardDeviation(broken_mean)
|
| -
|
| - confidence = (dist_between_groups / (
|
| - max(0.0001, (len_broken_group + len_working_group ))))
|
| - confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
|
| - return confidence
|
|
|
| def _GetResultsDict(self, revision_data, revision_data_sorted):
|
| # Find range where it possibly broke.
|
| @@ -2850,8 +2875,8 @@ class BisectPerformanceMetrics(object):
|
| broken_mean = sum(broken_means, [])
|
|
|
| # Calculate the approximate size of the regression
|
| - mean_of_bad_runs = CalculateTruncatedMean(broken_mean, 0.0)
|
| - mean_of_good_runs = CalculateTruncatedMean(working_mean, 0.0)
|
| + mean_of_bad_runs = CalculateMean(broken_mean)
|
| + mean_of_good_runs = CalculateMean(working_mean)
|
|
|
| regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) /
|
| max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0
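For instance, with a good-run mean of 200.0 and a bad-run mean of 260.0 (hypothetical numbers), the expression above reports a regression size of roughly 30%:

    regression_size = abs(max(200.0, 260.0) /
                          max(0.0001, min(200.0, 260.0))) * 100.0 - 100.0
    # (260.0 / 200.0) * 100.0 - 100.0 == 30.0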
|
| @@ -2863,7 +2888,7 @@ class BisectPerformanceMetrics(object):
|
| # Give a "confidence" in the bisect. At the moment we use how distinct the
|
| # values are before and after the last broken revision, and how noisy the
|
| # overall graph is.
|
| - confidence = self._CalculateConfidence(working_means, broken_means)
|
| + confidence = CalculateConfidence(working_means, broken_means)
|
|
|
| culprit_revisions = []
|
|
|
|
|