Chromium Code Reviews| Index: scripts/slave/recipe_modules/auto_bisect/bisector.py |
| diff --git a/scripts/slave/recipe_modules/auto_bisect/bisector.py b/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| index e7677c13d0e15a503dbf7444676fb88bcd116f82..5e2ebe9acc10e093a6a0fc97eae0818bdc116e86 100644 |
| --- a/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| +++ b/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| @@ -4,6 +4,7 @@ |
| import json |
| import re |
| +import time |
| from . import bisect_results |
| from . import depot_config |
| @@ -42,6 +43,14 @@ VALID_RESULT_CODES = ( |
| # to move *too* far from the original revision, we'll cap the search at 25%. |
| DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 |
| +# How long to keep re-testing the initial good-bad range until a significant |
| +# difference is established. |
| +REGRESSION_CHECK_TIMEOUT = 6 * 60 * 60 |
|
dtu
2016/02/05 00:18:01
Is that six hours? Do we really have that much time?
|
| + |
| +# Significance level to use for determining difference between revisions via |
| +# hypothesis testing. |
| +SIGNIFICANCE_LEVEL = 0.05 |
| + |
| class Bisector(object): |
| """This class abstracts an ongoing bisect (or n-sect) job.""" |
| @@ -114,6 +123,38 @@ class Bisector(object): |
| if init_revisions: |
| self._expand_chromium_revision_range() |
| + def significantly_different( |
| + self, list_a, list_b, |
| + significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover |
| + """Uses an external script to run hypothesis testing with scipy. |
| + |
| + The reason why we need an external script is that scipy is not available to |
| + the default python installed in all platforms. We instead rely on an |
| + anaconda environment to provide those packages. |
| + |
| + Args: |
| + list_a, list_b: Two lists representing samples to be compared. |
| + significance_level: Self-describing. As a decimal fraction. |
| + |
| + Returns: |
| + A boolean indicating whether the null hypothesis (that the lists are |
| + samples from the same population) can be rejected at the specified |
| + significance level. |
| + """ |
| + step_result = self.api.m.python( |
| + 'Checking sample difference', |
| + self.api.resource('significantly_different.py'), |
| + [json.dumps(list_a), json.dumps(list_b), str(significance_level)], |
| + stdout=self.api.m.json.output()) |
| + results = step_result.stdout |
| + if results is None: |
| + assert self.dummy_builds |
| + return True |
| + significantly_different = results['significantly_different'] |
| + step_result.presentation.logs[str(significantly_different)] = [ |
| + 'See json.output for details'] |
| + return significantly_different |
| + |
| def config_step(self): |
| """Yields a simple echo step that outputs the bisect config.""" |
| api = self.api |
| @@ -412,7 +453,7 @@ class Bisector(object): |
| 'an improvement rather than a regression, given the ' |
| 'expected direction of improvement.') |
| - def check_initial_confidence(self): |
| + def check_initial_confidence(self): # pragma: no cover |
| """Checks that the initial range presents a clear enough regression. |
| We calculate the confidence of the results of the given 'good' |
| @@ -428,41 +469,44 @@ class Bisector(object): |
| if self.required_initial_confidence is None: |
| return True # pragma: no cover |
| + # TODO(robertocn): Remove all uses of "confidence". |
| if self.dummy_initial_confidence is not None: |
| self.initial_confidence = float( |
| self.dummy_initial_confidence) |
| + if (float(self.initial_confidence) < |
| + float(self.required_initial_confidence)): |
| + self._set_insufficient_confidence_warning() |
| + return False |
| + return True |
| + |
| + if self.dummy_builds: |
| + dummy_result = self.good_rev.values != self.bad_rev.values |
| + if not dummy_result: |
| + self._set_insufficient_confidence_warning() |
| + return dummy_result |
|
dtu
2016/02/05 00:18:01
Seems like there's a bunch of technical debt related to this.
|
| + |
| + expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT |
| + while time.time() < expiration_time: |
| + if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5: |
| + if self.significantly_different(self.good_rev.values, |
| + self.bad_rev.values): |
| + return True |
| + min(self.good_rev, self.bad_rev, key=lambda x: len(x.values)).retest() |
| + self._set_insufficient_confidence_warning() |
| + return False |
| - else: # pragma: no cover |
| - if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5: |
| - # If there are too few values, the confidence score is not a good way to |
| - # determine whether the regression is reproducible. |
| - # TODO(robertocn): Investigate a straightforward approach to deal with |
| - # these cases. Such as the mean of one group lying within the range of |
| - # the other. |
| - return True |
| - self.initial_confidence = ( |
| - self.api.m.math_utils.confidence_score( |
| - self.good_rev.values, |
| - self.bad_rev.values)) |
| - if (self.initial_confidence < |
| - self.required_initial_confidence): # pragma: no cover |
| - self._set_insufficient_confidence_warning(self.initial_confidence) |
| - return False |
| - return True |
| def get_exception(self): |
| raise NotImplementedError() # pragma: no cover |
| # TODO: should return an exception with the details of the failure. |
| def _set_insufficient_confidence_warning( |
| - self, actual_confidence): # pragma: no cover |
| + self): # pragma: no cover |
| """Adds a warning about the lack of initial regression confidence.""" |
| self.failed_initial_confidence = True |
| self.surface_result('LO_INIT_CONF') |
| self.warnings.append( |
| - ('Bisect failed to reproduce the regression with enough confidence. ' |
| - 'Needed {:.2f}%, got {:.2f}%.').format( |
| - self.required_initial_confidence, actual_confidence)) |
| + 'Bisect failed to reproduce the regression with enough confidence.') |
| def _results_debug_message(self): |
| """Returns a string with values used to debug a bisect result.""" |