Index: tools/auto_bisect/bisect_perf_regression.py
diff --git a/tools/auto_bisect/bisect_perf_regression.py b/tools/auto_bisect/bisect_perf_regression.py
index f90074f4d7533adda3937ef5e8740afa4898efef..0fd8534749581101e0ec94ce684887411bbda268 100755
--- a/tools/auto_bisect/bisect_perf_regression.py
+++ b/tools/auto_bisect/bisect_perf_regression.py
@@ -75,6 +75,10 @@ MAX_MAC_BUILD_TIME = 14400
 MAX_WIN_BUILD_TIME = 14400
 MAX_LINUX_BUILD_TIME = 14400
+# The confidence percentage we require to consider the initial range a
+# regression based on the test results of the initial good and bad revisions.
+REGRESSION_CONFIDENCE = 95
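+# For example, a confidence score of 96 lets the bisect proceed, while a
+# score of 94 aborts it and reports the initial results instead.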
+
 # Patch template to add a new file, DEPS.sha, under the src folder.
 # This file contains the SHA1 value of the DEPS changes made while bisecting
 # dependency repositories. It is sent to the try server with the DEPS patch.
@@ -89,6 +93,23 @@ new file mode 100644
+%(deps_sha)s
 """
+REGRESSION_CONFIDENCE_ERROR_TEMPLATE = """
+We could not reproduce the regression with enough confidence using this
+test/metric/platform combination.
+
+Here are the results for the initial revision range:
+'Good' revision: {good_rev}
+\tmean: {good_mean}
+\tstd.err.: {good_std_err}
+\tsample size: {good_sample_size}
+'Bad' revision: {bad_rev}
+\tmean: {bad_mean}
+\tstd.err.: {bad_std_err}
+\tsample size: {bad_sample_size}
+
+NOTE: There's still a chance that this is actually a regression, but you may
+      need to bisect a different platform."""
+
 # Git branch name used to run bisect try jobs.
 BISECT_TRYJOB_BRANCH = 'bisect-tryjob'
 # Git master branch name.
@@ -589,6 +610,46 @@ def _GenerateProfileIfNecessary(command_args):
   return True
+def _CheckRegressionConfidenceError(
+    good_revision,
+    bad_revision,
+    known_good_value,
+    known_bad_value):
+ """Checks whether we can be confident beyond a certain degree that the given |
+ metrics represent a regression. |
+ |
+ Args: |
+ good_revision: string representing the commit considered 'good' |
+ bad_revision: Same as above for 'bad'. |
+ known_good_value: A dict with at least: 'values', 'mean' and 'std_err' |
+ known_bad_value: Same as above. |
+ |
+ Returns: |
+ False if there is no error (i.e. we can be confident there's a regressioni), |
+ a string containing the details of the lack of confidence otherwise. |
+ """ |
+  error = False
+  # Add the good and bad values to the parameter list for ConfidenceScore.
+  confidence_params = []
+  for values in [known_bad_value['values'], known_good_value['values']]:
+    # Flatten if needed.
+    if isinstance(values, list) and all(isinstance(x, list) for x in values):
+      confidence_params.append(sum(values, []))
+    else:
+      confidence_params.append(values)
+  regression_confidence = BisectResults.ConfidenceScore(*confidence_params)
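+  # A score below REGRESSION_CONFIDENCE means the good and bad samples are not
+  # clearly distinguishable, so report the details instead of bisecting.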
+  if regression_confidence < REGRESSION_CONFIDENCE:
+    error = REGRESSION_CONFIDENCE_ERROR_TEMPLATE.format(
+        good_rev=good_revision,
+        good_mean=known_good_value['mean'],
+        good_std_err=known_good_value['std_err'],
+        good_sample_size=len(known_good_value['values']),
+        bad_rev=bad_revision,
+        bad_mean=known_bad_value['mean'],
+        bad_std_err=known_bad_value['std_err'],
+        bad_sample_size=len(known_bad_value['values']))
+  return error
+
 class DepotDirectoryRegistry(object):
   def __init__(self, src_cwd):
@@ -2217,6 +2278,15 @@ class BisectPerformanceMetrics(object):
     min_revision = 0
     max_revision = len(revision_states) - 1
+    # Check how likely it is that the good and bad results are different
+    # beyond chance-induced variation.
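+    # If they are not, bail out early rather than bisect noise.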
+    if not self.opts.debug_ignore_regression_confidence:
+      error = _CheckRegressionConfidenceError(good_revision,
+                                              bad_revision,
+                                              known_good_value,
+                                              known_bad_value)
+      if error:
+        return BisectResults(error=error)
     # Can just mark the good and bad revisions explicitly here since we
     # already know the results.
@@ -2425,6 +2495,7 @@ class BisectOptions(object):
     self.debug_ignore_build = None
     self.debug_ignore_sync = None
     self.debug_ignore_perf_test = None
+    self.debug_ignore_regression_confidence = None
     self.debug_fake_first_test_mean = 0
     self.gs_bucket = None
     self.target_arch = 'ia32'
@@ -2593,6 +2664,10 @@ class BisectOptions(object):
     group.add_option('--debug_ignore_perf_test',
                      action='store_true',
                      help='DEBUG: Don\'t perform performance tests.')
+    group.add_option('--debug_ignore_regression_confidence',
+                     action='store_true',
+                     help='DEBUG: Don\'t score the confidence of the initial '
+                          'good and bad revisions\' test results.')
     group.add_option('--debug_fake_first_test_mean',
                      type='int',
                      default='0',