Index: tools/bisect-perf-regression.py
diff --git a/tools/bisect-perf-regression.py b/tools/bisect-perf-regression.py
index 014e646d074e65e5ee5707ae200f588c71bbcf1f..e2832ba7182a992e7e2c951a9d4ba6ccb7914aba 100755
--- a/tools/bisect-perf-regression.py
+++ b/tools/bisect-perf-regression.py
@@ -32,7 +32,6 @@ An example usage (using git hashes):
 -g 1f6e67861535121c5c819c16a666f2436c207e7b\
 -b b732f23b4f81c382db0b23b9035f3dadc7d925bb\
 -m shutdown/simple-user-quit
-
 """

 import copy
@@ -53,6 +52,7 @@ import zipfile
 sys.path.append(os.path.join(os.path.dirname(__file__), 'telemetry'))

 from auto_bisect import bisect_utils
+from auto_bisect import math_utils
 from auto_bisect import post_perf_builder_job as bisect_builder
 from auto_bisect import source_control as source_control_module
 from telemetry.util import cloud_storage
@@ -259,57 +259,7 @@ def _AddAdditionalDepotInfo(depot_info):
 DEPOT_NAMES = DEPOT_DEPS_NAME.keys()


-def CalculateTruncatedMean(data_set, truncate_percent):
-  """Calculates the truncated mean of a set of values.
-
-  Note that this isn't just the mean of the set of values with the highest
-  and lowest values discarded; the non-discarded values are also weighted
-  differently depending how many values are discarded.
-
-  Args:
-    data_set: Non-empty list of values.
-    truncate_percent: The % from the upper and lower portions of the data set
-        to discard, expressed as a value in [0, 1].
-
-  Returns:
-    The truncated mean as a float.
-
-  Raises:
-    TypeError: The data set was empty after discarding values.
-  """
-  if len(data_set) > 2:
-    data_set = sorted(data_set)
-
-    discard_num_float = len(data_set) * truncate_percent
-    discard_num_int = int(math.floor(discard_num_float))
-    kept_weight = len(data_set) - discard_num_float * 2
-
-    data_set = data_set[discard_num_int:len(data_set)-discard_num_int]
-
-    weight_left = 1.0 - (discard_num_float - discard_num_int)
-
-    if weight_left < 1:
-      # If the % to discard leaves a fractional portion, need to weight those
-      # values.
-      unweighted_vals = data_set[1:len(data_set)-1]
-      weighted_vals = [data_set[0], data_set[len(data_set)-1]]
-      weighted_vals = [w * weight_left for w in weighted_vals]
-      data_set = weighted_vals + unweighted_vals
-  else:
-    kept_weight = len(data_set)
-
-  truncated_mean = reduce(lambda x, y: float(x) + float(y),
-                          data_set) / kept_weight
-
-  return truncated_mean
-
-
-def CalculateMean(values):
-  """Calculates the arithmetic mean of a list of values."""
-  return CalculateTruncatedMean(values, 0.0)
-
-
-def CalculateConfidence(good_results_lists, bad_results_lists):
+def ConfidenceScore(good_results_lists, bad_results_lists):
   """Calculates a confidence percentage.

   This is calculated based on how distinct the "good" and "bad" values are,
@@ -330,8 +280,8 @@ def CalculateConfidence(good_results_lists, bad_results_lists):
     A number between in the range [0, 100].
   """
   # Get the distance between the two groups.
-  means_good = map(CalculateMean, good_results_lists)
-  means_bad = map(CalculateMean, bad_results_lists)
+  means_good = map(math_utils.Mean, good_results_lists)
+  means_bad = map(math_utils.Mean, bad_results_lists)
   bounds_good = (min(means_good), max(means_good))
   bounds_bad = (min(means_bad), max(means_bad))
   dist_between_groups = min(
@@ -341,8 +291,8 @@ def CalculateConfidence(good_results_lists, bad_results_lists):
   # Get the sum of the standard deviations of the two groups.
   good_results_flattened = sum(good_results_lists, [])
   bad_results_flattened = sum(bad_results_lists, [])
-  stddev_good = CalculateStandardDeviation(good_results_flattened)
-  stddev_bad = CalculateStandardDeviation(bad_results_flattened)
+  stddev_good = math_utils.StandardDeviation(good_results_flattened)
+  stddev_bad = math_utils.StandardDeviation(bad_results_flattened)
   stddev_sum = stddev_good + stddev_bad

   confidence = dist_between_groups / (max(0.0001, stddev_sum))
@@ -350,71 +300,6 @@ def CalculateConfidence(good_results_lists, bad_results_lists):
   return confidence


-def CalculateStandardDeviation(values):
-  """Calculates the sample standard deviation of the given list of values."""
-  if len(values) == 1:
-    return 0.0
-
-  mean = CalculateMean(values)
-  differences_from_mean = [float(x) - mean for x in values]
-  squared_differences = [float(x * x) for x in differences_from_mean]
-  variance = sum(squared_differences) / (len(values) - 1)
-  std_dev = math.sqrt(variance)
-
-  return std_dev
-
-
-def CalculateRelativeChange(before, after):
-  """Returns the relative change of before and after, relative to before.
-
-  There are several different ways to define relative difference between
-  two numbers; sometimes it is defined as relative to the smaller number,
-  or to the mean of the two numbers. This version returns the difference
-  relative to the first of the two numbers.
-
-  Args:
-    before: A number representing an earlier value.
-    after: Another number, representing a later value.
-
-  Returns:
-    A non-negative floating point number; 0.1 represents a 10% change.
-  """
-  if before == after:
-    return 0.0
-  if before == 0:
-    return float('nan')
-  difference = after - before
-  return math.fabs(difference / before)
-
-
-def CalculatePooledStandardError(work_sets):
-  numerator = 0.0
-  denominator1 = 0.0
-  denominator2 = 0.0
-
-  for current_set in work_sets:
-    std_dev = CalculateStandardDeviation(current_set)
-    numerator += (len(current_set) - 1) * std_dev ** 2
-    denominator1 += len(current_set) - 1
-    denominator2 += 1.0 / len(current_set)
-
-  if denominator1:
-    return math.sqrt(numerator / denominator1) * math.sqrt(denominator2)
-  return 0.0
-
-
-def CalculateStandardError(values):
-  """Calculates the standard error of a list of values."""
-  if len(values) <= 1:
-    return 0.0
-
-  std_dev = CalculateStandardDeviation(values)
-
-  return std_dev / math.sqrt(len(values))
-
-
-
-
 def GetSHA1HexDigest(contents):
   """Returns secured hash containing hexadecimal for the given contents."""
   return hashlib.sha1(contents).hexdigest()
@@ -1981,10 +1866,10 @@ class BisectPerformanceMetrics(object):
         print
       else:
         # Need to get the average value if there were multiple values.
-        truncated_mean = CalculateTruncatedMean(metric_values,
-            self.opts.truncate_percent)
-        standard_err = CalculateStandardError(metric_values)
-        standard_dev = CalculateStandardDeviation(metric_values)
+        truncated_mean = math_utils.TruncatedMean(
+            metric_values, self.opts.truncate_percent)
+        standard_err = math_utils.StandardError(metric_values)
+        standard_dev = math_utils.StandardDeviation(metric_values)

       if self._IsBisectModeStandardDeviation():
         metric_values = [standard_dev]
@@ -3174,9 +3059,9 @@ class BisectPerformanceMetrics(object):
       if current_values:
         current_values = current_values['values']
         if previous_values:
-          confidence = CalculateConfidence(previous_values, [current_values])
-          mean_of_prev_runs = CalculateMean(sum(previous_values, []))
-          mean_of_current_runs = CalculateMean(current_values)
+          confidence = ConfidenceScore(previous_values, [current_values])
+          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
+          mean_of_current_runs = math_utils.Mean(current_values)

           # Check that the potential regression is in the same direction as
           # the overall regression. If the mean of the previous runs < the
@@ -3228,22 +3113,22 @@ class BisectPerformanceMetrics(object):
     broken_mean = sum(broken_means, [])

     # Calculate the approximate size of the regression
-    mean_of_bad_runs = CalculateMean(broken_mean)
-    mean_of_good_runs = CalculateMean(working_mean)
+    mean_of_bad_runs = math_utils.Mean(broken_mean)
+    mean_of_good_runs = math_utils.Mean(working_mean)

-    regression_size = 100 * CalculateRelativeChange(mean_of_good_runs,
+    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                     mean_of_bad_runs)
     if math.isnan(regression_size):
       regression_size = 'zero-to-nonzero'

-    regression_std_err = math.fabs(CalculatePooledStandardError(
+    regression_std_err = math.fabs(math_utils.PooledStandardError(
         [working_mean, broken_mean]) /
         max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

     # Give a "confidence" in the bisect. At the moment we use how distinct the
     # values are before and after the last broken revision, and how noisy the
     # overall graph is.
-    confidence = CalculateConfidence(working_means, broken_means)
+    confidence = ConfidenceScore(working_means, broken_means)

     culprit_revisions = []

@@ -3771,5 +3656,6 @@ def main():
       bisect_utils.OutputAnnotationStepClosed()
   return 1

+
 if __name__ == '__main__':
   sys.exit(main())
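
Note: auto_bisect/math_utils.py itself is not part of this diff, so the new call
sites above (math_utils.Mean, math_utils.TruncatedMean,
math_utils.StandardDeviation, math_utils.StandardError,
math_utils.PooledStandardError, math_utils.RelativeChange) only show the caller
side. Below is a minimal sketch of what those helpers presumably look like,
assuming they are the functions deleted above, relocated and renamed without the
"Calculate" prefix; treat it as illustrative rather than the actual module
contents.

# Sketch only (assumption): presumed contents of auto_bisect/math_utils.py,
# reconstructed from the functions removed in this patch.
import math


def TruncatedMean(data_set, truncate_percent):
  """Weighted truncated mean, as in the removed CalculateTruncatedMean."""
  if len(data_set) > 2:
    data_set = sorted(data_set)
    discard_num_float = len(data_set) * truncate_percent
    discard_num_int = int(math.floor(discard_num_float))
    kept_weight = len(data_set) - discard_num_float * 2
    data_set = data_set[discard_num_int:len(data_set) - discard_num_int]
    weight_left = 1.0 - (discard_num_float - discard_num_int)
    if weight_left < 1:
      # Weight the two end values by the fractional portion that was kept.
      unweighted_vals = data_set[1:len(data_set) - 1]
      weighted_vals = [data_set[0] * weight_left, data_set[-1] * weight_left]
      data_set = weighted_vals + unweighted_vals
  else:
    kept_weight = len(data_set)
  return sum(float(x) for x in data_set) / kept_weight


def Mean(values):
  """Plain arithmetic mean (truncated mean with nothing discarded)."""
  return TruncatedMean(values, 0.0)


def StandardDeviation(values):
  """Sample standard deviation."""
  if len(values) == 1:
    return 0.0
  mean = Mean(values)
  variance = sum((float(x) - mean) ** 2 for x in values) / (len(values) - 1)
  return math.sqrt(variance)


def StandardError(values):
  """Standard error of the mean."""
  if len(values) <= 1:
    return 0.0
  return StandardDeviation(values) / math.sqrt(len(values))


def PooledStandardError(work_sets):
  """Pooled standard error across several sample sets."""
  numerator = 0.0
  denominator1 = 0.0
  denominator2 = 0.0
  for current_set in work_sets:
    std_dev = StandardDeviation(current_set)
    numerator += (len(current_set) - 1) * std_dev ** 2
    denominator1 += len(current_set) - 1
    denominator2 += 1.0 / len(current_set)
  if denominator1:
    return math.sqrt(numerator / denominator1) * math.sqrt(denominator2)
  return 0.0


def RelativeChange(before, after):
  """Absolute change relative to |before|; 0.1 means a 10% change."""
  if before == after:
    return 0.0
  if before == 0:
    return float('nan')
  return math.fabs((after - before) / before)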
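
A small hand-worked example of the weighting described in the deleted
CalculateTruncatedMean docstring (toy values, not taken from a real bisect run):
with five values and truncate_percent=0.25, 1.25 values are notionally dropped
from each end, so the two surviving end values only count with weight 0.75.

# Toy example of the truncated-mean weighting (illustrative values only).
data = sorted([1, 2, 3, 4, 5])
truncate_percent = 0.25
discard_float = len(data) * truncate_percent        # 1.25
discard_int = int(discard_float)                    # 1 value fully dropped per end
kept_weight = len(data) - 2 * discard_float         # 2.5
kept = data[discard_int:len(data) - discard_int]    # [2, 3, 4]
weight_left = 1.0 - (discard_float - discard_int)   # 0.75
# The surviving end values carry only the fractional weight:
weighted_ends = [kept[0] * weight_left, kept[-1] * weight_left]  # [1.5, 3.0]
total = sum(weighted_ends) + sum(kept[1:-1])        # 1.5 + 3.0 + 3 = 7.5
assert total / kept_weight == 3.0                   # 7.5 / 2.5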