scripts/slave/recipe_modules/auto_bisect/bisector.py - Issue 1610203003: Iteratively increase sample size for good/bad classification.

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/bisector.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master

Patch Set: Making initial regression check have a timeout. Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 import json	5 import json

6 import re	6 import re

	7 import time

7	8

8 from . import bisect_results	9 from . import bisect_results

9 from . import depot_config	10 from . import depot_config

10 from . import revision_state	11 from . import revision_state

11	12

12 _DEPS_SHA_PATCH = """	13 _DEPS_SHA_PATCH = """

13 diff --git DEPS.sha DEPS.sha	14 diff --git DEPS.sha DEPS.sha

14 new file mode 100644	15 new file mode 100644

15 --- /dev/null	16 --- /dev/null

16 +++ DEPS.sha	17 +++ DEPS.sha

(...skipping 18 matching lines...) Expand all Loading...
35 'LO_INIT_CONF', # Bisect aborted early for lack of confidence.	36 'LO_INIT_CONF', # Bisect aborted early for lack of confidence.

36 'MISSING_METRIC', # The metric was not found in the test text/json output.	37 'MISSING_METRIC', # The metric was not found in the test text/json output.

37 'LO_FINAL_CONF', # The bisect completed without a culprit.	38 'LO_FINAL_CONF', # The bisect completed without a culprit.

38 )	39 )

39	40

40 # When we look for the next revision to build, we search nearby revisions	41 # When we look for the next revision to build, we search nearby revisions

41 # looking for a revision that's already been archived. Since we don't want	42 # looking for a revision that's already been archived. Since we don't want

42 # to move too far from the original revision, we'll cap the search at 25%.	43 # to move too far from the original revision, we'll cap the search at 25%.

43 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25	44 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25

44	45

	46 # How long to re-test the initial good-bad range for until significant

	47 # difference is established.

	48 REGRESSION_CHECK_TIMEOUT = 6 * 60 * 60
	dtu 2016/02/05 00:18:01 Is that six hours? Do we really have that much tim Is that six hours? Do we really have that much time?
	49

	50 # Significance level to use for determining difference between revisions via

	51 # hypothesis testing.

	52 SIGNIFICANCE_LEVEL = 0.05

	53

45	54

46 class Bisector(object):	55 class Bisector(object):

47 """This class abstracts an ongoing bisect (or n-sect) job."""	56 """This class abstracts an ongoing bisect (or n-sect) job."""

48	57

49 def __init__(self, api, bisect_config, revision_class, init_revisions=True):	58 def __init__(self, api, bisect_config, revision_class, init_revisions=True):

50 """Initializes the state of a new bisect job from a dictionary.	59 """Initializes the state of a new bisect job from a dictionary.

51	60

52 Note that the initial good_rev and bad_rev MUST resolve to a commit position	61 Note that the initial good_rev and bad_rev MUST resolve to a commit position

53 in the chromium repo.	62 in the chromium repo.

54 """	63 """

(...skipping 15 matching lines...) Expand all Loading...
70 self.test_type = bisect_config.get('test_type', 'perf')	79 self.test_type = bisect_config.get('test_type', 'perf')

71 self.improvement_direction = int(bisect_config.get(	80 self.improvement_direction = int(bisect_config.get(

72 'improvement_direction', 0)) or None	81 'improvement_direction', 0)) or None

73 # Required initial confidence might be explicitly set to None,	82 # Required initial confidence might be explicitly set to None,

74 # which means that no initial confidence check should be done.	83 # which means that no initial confidence check should be done.

75 if ('required_initial_confidence' in bisect_config and	84 if ('required_initial_confidence' in bisect_config and

76 bisect_config['required_initial_confidence'] is None):	85 bisect_config['required_initial_confidence'] is None):

77 self.required_initial_confidence = None # pragma: no cover	86 self.required_initial_confidence = None # pragma: no cover

78 else:	87 else:

79 self.required_initial_confidence = float(bisect_config.get(	88 self.required_initial_confidence = float(bisect_config.get(

80 'required_initial_confidence', 80))	89 'required_initial_confidence', 80))
	dtu 2016/02/05 00:18:02 Should we rethink what this value is? Also make it Should we rethink what this value is? Also make it a constant?
81	90

82 self.warnings = []	91 self.warnings = []

83	92

84 # Status flags	93 # Status flags

85 self.initial_confidence = None	94 self.initial_confidence = None

86 self.failed_initial_confidence = False	95 self.failed_initial_confidence = False

87 self.failed = False	96 self.failed = False

88 self.failed_direction = False	97 self.failed_direction = False

89 self.lkgr = None # Last known good revision	98 self.lkgr = None # Last known good revision

90 self.fkbr = None # First known bad revision	99 self.fkbr = None # First known bad revision

(...skipping 16 matching lines...) Expand all Loading...
107 self.good_rev.good = True	116 self.good_rev.good = True

108 self.good_rev.read_deps(self.get_perf_tester_name())	117 self.good_rev.read_deps(self.get_perf_tester_name())

109 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [	118 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [

110 '%s: %s' % (key, value) for key, value in	119 '%s: %s' % (key, value) for key, value in

111 self.good_rev.deps.iteritems()]	120 self.good_rev.deps.iteritems()]

112 self.good_rev.deps = {}	121 self.good_rev.deps = {}

113 self.lkgr = self.good_rev	122 self.lkgr = self.good_rev

114 if init_revisions:	123 if init_revisions:

115 self._expand_chromium_revision_range()	124 self._expand_chromium_revision_range()

116	125

	126 def significantly_different(

	127 self, list_a, list_b,

	128 significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover

	129 """Uses an external script to run hypothesis testing with scipy.

	130

	131 The reason why we need an external script is that scipy is not available to

	132 the default python installed in all platforms. We instead rely on an

	133 anaconda environment to provide those packages.

	134

	135 Args:

	136 list_a, list_b: Two lists representing samples to be compared.

	137 significance_level: Self-describing. As a decimal fraction.

	138

	139 Returns:

	140 A boolean indicating whether the null hypothesis ~(that the lists are

	141 samples from the same population) can be rejected at the specified

	142 significance level.

	143 """

	144 step_result = self.api.m.python(

	145 'Checking sample difference',

	146 self.api.resource('significantly_different.py'),

	147 [json.dumps(list_a), json.dumps(list_b), str(significance_level)],

	148 stdout=self.api.m.json.output())

	149 results = step_result.stdout

	150 if results is None:

	151 assert self.dummy_builds

	152 return True

	153 significantly_different = results['significantly_different']

	154 step_result.presentation.logs[str(significantly_different)] = [

	155 'See json.output for details']

	156 return significantly_different

	157

117 def config_step(self):	158 def config_step(self):

118 """Yields a simple echo step that outputs the bisect config."""	159 """Yields a simple echo step that outputs the bisect config."""

119 api = self.api	160 api = self.api

120 # bisect_config may come as a FrozenDict (which is not serializable).	161 # bisect_config may come as a FrozenDict (which is not serializable).

121 bisect_config = dict(self.bisect_config)	162 bisect_config = dict(self.bisect_config)

122	163

123 def fix_windows_backslashes(s):	164 def fix_windows_backslashes(s):

124 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)')	165 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)')

125 return backslash_regex.sub(r'\\', s)	166 return backslash_regex.sub(r'\\', s)

126	167

(...skipping 278 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
405 self.failed_direction = True	446 self.failed_direction = True

406 self.warnings.append('The initial regression range for return code '	447 self.warnings.append('The initial regression range for return code '

407 'appears to show NO sign of a regression.')	448 'appears to show NO sign of a regression.')

408	449

409 def _set_failed_direction_results(self): # pragma: no cover	450 def _set_failed_direction_results(self): # pragma: no cover

410 self.failed_direction = True	451 self.failed_direction = True

411 self.warnings.append('The initial regression range appears to represent '	452 self.warnings.append('The initial regression range appears to represent '

412 'an improvement rather than a regression, given the '	453 'an improvement rather than a regression, given the '

413 'expected direction of improvement.')	454 'expected direction of improvement.')

414	455

415 def check_initial_confidence(self):	456 def check_initial_confidence(self): # pragma: no cover

416 """Checks that the initial range presents a clear enough regression.	457 """Checks that the initial range presents a clear enough regression.

417	458

418 We calculate the confidence of the results of the given 'good'	459 We calculate the confidence of the results of the given 'good'

419 and 'bad' revisions and compare it against the required confidence	460 and 'bad' revisions and compare it against the required confidence

420 set for the bisector.	461 set for the bisector.

421	462

422 Note that when a dummy regression confidence value has been set, that	463 Note that when a dummy regression confidence value has been set, that

423 is used instead.	464 is used instead.

424 """	465 """

425 if self.test_type != 'perf':	466 if self.test_type != 'perf':

426 return True	467 return True

427	468

428 if self.required_initial_confidence is None:	469 if self.required_initial_confidence is None:

429 return True # pragma: no cover	470 return True # pragma: no cover

430	471

	472 # TODO(robertocn): Remove all uses of "confidence".

431 if self.dummy_initial_confidence is not None:	473 if self.dummy_initial_confidence is not None:

432 self.initial_confidence = float(	474 self.initial_confidence = float(

433 self.dummy_initial_confidence)	475 self.dummy_initial_confidence)

	476 if (float(self.initial_confidence) <

	477 float(self.required_initial_confidence)):

	478 self._set_insufficient_confidence_warning()

	479 return False

	480 return True

434	481

435 else: # pragma: no cover	482 if self.dummy_builds:

436 if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5:	483 dummy_result = self.good_rev.values != self.bad_rev.values

437 # If there are too few values, the confidence score is not a good way to	484 if not dummy_result:

438 # determine whether the regression is reproducible.	485 self._set_insufficient_confidence_warning()

439 # TODO(robertocn): Investigate a straightforward approach to deal with	486 return dummy_result
	dtu 2016/02/05 00:18:01 Seems like there's a bunch of technical debt relat Seems like there's a bunch of technical debt related to this dummy test data. It's written in a cross-cutting way.
440 # these cases. Such as the mean of one group lying within the range of	487

441 # the other.	488 expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT

442 return True	489 while time.time() < expiration_time:

443 self.initial_confidence = (	490 if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5:

444 self.api.m.math_utils.confidence_score(	491 if self.significantly_different(self.good_rev.values,

445 self.good_rev.values,	492 self.bad_rev.values):

446 self.bad_rev.values))	493 return True

447 if (self.initial_confidence <	494 min(self.good_rev, self.bad_rev, key=lambda x: len(x.values)).retest()

448 self.required_initial_confidence): # pragma: no cover	495 self._set_insufficient_confidence_warning()

449 self._set_insufficient_confidence_warning(self.initial_confidence)	496 return False

450 return False	497

451 return True

452	498

453 def get_exception(self):	499 def get_exception(self):

454 raise NotImplementedError() # pragma: no cover	500 raise NotImplementedError() # pragma: no cover

455 # TODO: should return an exception with the details of the failure.	501 # TODO: should return an exception with the details of the failure.

456	502

457 def _set_insufficient_confidence_warning(	503 def _set_insufficient_confidence_warning(

458 self, actual_confidence): # pragma: no cover	504 self): # pragma: no cover

459 """Adds a warning about the lack of initial regression confidence."""	505 """Adds a warning about the lack of initial regression confidence."""

460 self.failed_initial_confidence = True	506 self.failed_initial_confidence = True

461 self.surface_result('LO_INIT_CONF')	507 self.surface_result('LO_INIT_CONF')

462 self.warnings.append(	508 self.warnings.append(

463 ('Bisect failed to reproduce the regression with enough confidence. '	509 'Bisect failed to reproduce the regression with enough confidence.')

464 'Needed {:.2f}%, got {:.2f}%.').format(

465 self.required_initial_confidence, actual_confidence))

466	510

467 def _results_debug_message(self):	511 def _results_debug_message(self):

468 """Returns a string with values used to debug a bisect result."""	512 """Returns a string with values used to debug a bisect result."""

469 result = 'bisector.lkgr: %r\n' % self.lkgr	513 result = 'bisector.lkgr: %r\n' % self.lkgr

470 result += 'bisector.fkbr: %r\n\n' % self.fkbr	514 result += 'bisector.fkbr: %r\n\n' % self.fkbr

471 result += self._revision_value_table()	515 result += self._revision_value_table()

472 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):	516 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):

473 result += '\n' + self._t_test_results()	517 result += '\n' + self._t_test_results()

474 return result	518 return result

475	519

(...skipping 334 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
810	854

811 def surface_result(self, result_string):	855 def surface_result(self, result_string):

812 assert result_string in VALID_RESULT_CODES	856 assert result_string in VALID_RESULT_CODES

813 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n).	857 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n).

814 result_code = prefix + result_string	858 result_code = prefix + result_string

815 assert len(result_code) <= 20	859 assert len(result_code) <= 20

816 if result_code not in self.result_codes:	860 if result_code not in self.result_codes:

817 self.result_codes.add(result_code)	861 self.result_codes.add(result_code)

818 properties = self.api.m.step.active_result.presentation.properties	862 properties = self.api.m.step.active_result.presentation.properties

819 properties['extra_result_code'] = sorted(self.result_codes)	863 properties['extra_result_code'] = sorted(self.result_codes)

OLD	NEW

« no previous file with comments | « no previous file | scripts/slave/recipe_modules/auto_bisect/bisector_test.py » ('j') | scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py » ('J')