scripts/slave/recipe_modules/auto_bisect/bisector.py - Issue 1702013004: Revert of Iteratively increase sample size for good/bad classification.

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/bisector.py

Issue 1702013004: Revert of Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 import json	5 import json

6 import re	6 import re

7 import time

8	7

9 from . import bisect_results	8 from . import bisect_results

10 from . import depot_config	9 from . import depot_config

11 from . import revision_state	10 from . import revision_state

12	11

13 _DEPS_SHA_PATCH = """	12 _DEPS_SHA_PATCH = """

14 diff --git DEPS.sha DEPS.sha	13 diff --git DEPS.sha DEPS.sha

15 new file mode 100644	14 new file mode 100644

16 --- /dev/null	15 --- /dev/null

17 +++ DEPS.sha	16 +++ DEPS.sha

(...skipping 18 matching lines...) Expand all Loading...
36 'LO_INIT_CONF', # Bisect aborted early for lack of confidence.	35 'LO_INIT_CONF', # Bisect aborted early for lack of confidence.

37 'MISSING_METRIC', # The metric was not found in the test text/json output.	36 'MISSING_METRIC', # The metric was not found in the test text/json output.

38 'LO_FINAL_CONF', # The bisect completed without a culprit.	37 'LO_FINAL_CONF', # The bisect completed without a culprit.

39 )	38 )

40	39

41 # When we look for the next revision to build, we search nearby revisions	40 # When we look for the next revision to build, we search nearby revisions

42 # looking for a revision that's already been archived. Since we don't want	41 # looking for a revision that's already been archived. Since we don't want

43 # to move too far from the original revision, we'll cap the search at 25%.	42 # to move too far from the original revision, we'll cap the search at 25%.

44 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25	43 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25

45	44

46 # How long to re-test the initial good-bad range for until significant

47 # difference is established.

48 REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60

49 # If we reach this number of samples on the reference range and have not

50 # achieved statistical significance, bail.

51 MAX_REQUIRED_SAMPLES = 50

52

53 # Significance level to use for determining difference between revisions via

54 # hypothesis testing.

55 SIGNIFICANCE_LEVEL = 0.01

56

57	45

58 class Bisector(object):	46 class Bisector(object):

59 """This class abstracts an ongoing bisect (or n-sect) job."""	47 """This class abstracts an ongoing bisect (or n-sect) job."""

60	48

61 def __init__(self, api, bisect_config, revision_class, init_revisions=True):	49 def __init__(self, api, bisect_config, revision_class, init_revisions=True):

62 """Initializes the state of a new bisect job from a dictionary.	50 """Initializes the state of a new bisect job from a dictionary.

63	51

64 Note that the initial good_rev and bad_rev MUST resolve to a commit position	52 Note that the initial good_rev and bad_rev MUST resolve to a commit position

65 in the chromium repo.	53 in the chromium repo.

66 """	54 """

67 super(Bisector, self).__init__()	55 super(Bisector, self).__init__()

68 self._api = api	56 self._api = api

69 self.ensure_sync_master_branch()	57 self.ensure_sync_master_branch()

70 self.bisect_config = bisect_config	58 self.bisect_config = bisect_config

71 self.config_step()	59 self.config_step()

72 self.revision_class = revision_class	60 self.revision_class = revision_class

73 self.result_codes = set()	61 self.result_codes = set()

74 self.last_tested_revision = None

75	62

76 # Test-only properties.	63 # Test-only properties.

77 # TODO: Replace these with proper mod_test_data.	64 # TODO: Replace these with proper mod_test_data.

78 self.dummy_initial_confidence = bisect_config.get(	65 self.dummy_initial_confidence = bisect_config.get(

79 'dummy_initial_confidence')	66 'dummy_initial_confidence')

80 self.dummy_builds = bisect_config.get('dummy_builds', False)	67 self.dummy_builds = bisect_config.get('dummy_builds', False)

81	68

82 # Load configuration items.	69 # Load configuration items.

83 self.test_type = bisect_config.get('test_type', 'perf')	70 self.test_type = bisect_config.get('test_type', 'perf')

84 self.improvement_direction = int(bisect_config.get(	71 self.improvement_direction = int(bisect_config.get(

(...skipping 35 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
120 self.good_rev.good = True	107 self.good_rev.good = True

121 self.good_rev.read_deps(self.get_perf_tester_name())	108 self.good_rev.read_deps(self.get_perf_tester_name())

122 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [	109 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [

123 '%s: %s' % (key, value) for key, value in	110 '%s: %s' % (key, value) for key, value in

124 self.good_rev.deps.iteritems()]	111 self.good_rev.deps.iteritems()]

125 self.good_rev.deps = {}	112 self.good_rev.deps = {}

126 self.lkgr = self.good_rev	113 self.lkgr = self.good_rev

127 if init_revisions:	114 if init_revisions:

128 self._expand_chromium_revision_range()	115 self._expand_chromium_revision_range()

129	116

130 def significantly_different(

131 self, list_a, list_b,

132 significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover

133 """Uses an external script to run hypothesis testing with scipy.

134

135 The reason why we need an external script is that scipy is not available to

136 the default python installed in all platforms. We instead rely on an

137 anaconda environment to provide those packages.

138

139 Args:

140 list_a, list_b: Two lists representing samples to be compared.

141 significance_level: Self-describing. As a decimal fraction.

142

143 Returns:

144 A boolean indicating whether the null hypothesis ~(that the lists are

145 samples from the same population) can be rejected at the specified

146 significance level.

147 """

148 step_result = self.api.m.python(

149 'Checking sample difference',

150 self.api.resource('significantly_different.py'),

151 [json.dumps(list_a), json.dumps(list_b), str(significance_level)],

152 stdout=self.api.m.json.output())

153 results = step_result.stdout

154 if results is None:

155 assert self.dummy_builds

156 return True

157 significantly_different = results['significantly_different']

158 step_result.presentation.logs[str(significantly_different)] = [

159 'See json.output for details']

160 return significantly_different

161

162 def config_step(self):	117 def config_step(self):

163 """Yields a simple echo step that outputs the bisect config."""	118 """Yields a simple echo step that outputs the bisect config."""

164 api = self.api	119 api = self.api

165 # bisect_config may come as a FrozenDict (which is not serializable).	120 # bisect_config may come as a FrozenDict (which is not serializable).

166 bisect_config = dict(self.bisect_config)	121 bisect_config = dict(self.bisect_config)

167	122

168 def fix_windows_backslashes(s):	123 def fix_windows_backslashes(s):

169 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)')	124 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)')

170 return backslash_regex.sub(r'\\', s)	125 return backslash_regex.sub(r'\\', s)

171	126

(...skipping 278 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
450 self.failed_direction = True	405 self.failed_direction = True

451 self.warnings.append('The initial regression range for return code '	406 self.warnings.append('The initial regression range for return code '

452 'appears to show NO sign of a regression.')	407 'appears to show NO sign of a regression.')

453	408

454 def _set_failed_direction_results(self): # pragma: no cover	409 def _set_failed_direction_results(self): # pragma: no cover

455 self.failed_direction = True	410 self.failed_direction = True

456 self.warnings.append('The initial regression range appears to represent '	411 self.warnings.append('The initial regression range appears to represent '

457 'an improvement rather than a regression, given the '	412 'an improvement rather than a regression, given the '

458 'expected direction of improvement.')	413 'expected direction of improvement.')

459	414

460 def check_initial_confidence(self): # pragma: no cover	415 def check_initial_confidence(self):

461 """Checks that the initial range presents a clear enough regression.	416 """Checks that the initial range presents a clear enough regression.

462	417

463 We calculate the confidence of the results of the given 'good'	418 We calculate the confidence of the results of the given 'good'

464 and 'bad' revisions and compare it against the required confidence	419 and 'bad' revisions and compare it against the required confidence

465 set for the bisector.	420 set for the bisector.

466	421

467 Note that when a dummy regression confidence value has been set, that	422 Note that when a dummy regression confidence value has been set, that

468 is used instead.	423 is used instead.

469 """	424 """

470 if self.test_type != 'perf':	425 if self.test_type != 'perf':

471 return True	426 return True

472	427

473 if self.required_initial_confidence is None:	428 if self.required_initial_confidence is None:

474 return True # pragma: no cover	429 return True # pragma: no cover

475	430

476 # TODO(robertocn): Remove all uses of "confidence".

477 if self.dummy_initial_confidence is not None:	431 if self.dummy_initial_confidence is not None:

478 self.initial_confidence = float(	432 self.initial_confidence = float(

479 self.dummy_initial_confidence)	433 self.dummy_initial_confidence)

480 if (float(self.initial_confidence) <

481 float(self.required_initial_confidence)):

482 self._set_insufficient_confidence_warning()

483 return False

484 return True

485	434

486 if self.dummy_builds:	435 else: # pragma: no cover

487 dummy_result = self.good_rev.values != self.bad_rev.values	436 if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5:

488 if not dummy_result:	437 # If there are too few values, the confidence score is not a good way to

489 self._set_insufficient_confidence_warning()	438 # determine whether the regression is reproducible.

490 return dummy_result	439 # TODO(robertocn): Investigate a straightforward approach to deal with

491	440 # these cases. Such as the mean of one group lying within the range of

492 with self.api.m.step.nest('Re-testing reference range'):	441 # the other.

493 expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT	442 return True

494 while time.time() < expiration_time:	443 self.initial_confidence = (

495 if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5:	444 self.api.m.math_utils.confidence_score(

496 if self.significantly_different(self.good_rev.values,	445 self.good_rev.values,

497 self.bad_rev.values):	446 self.bad_rev.values))

498 return True	447 if (self.initial_confidence <

499 if len(self.good_rev.values) == len(self.bad_rev.values):	448 self.required_initial_confidence): # pragma: no cover

500 revision_to_retest = self.last_tested_revision	449 self._set_insufficient_confidence_warning(self.initial_confidence)

501 else:

502 revision_to_retest = min(self.good_rev, self.bad_rev,

503 key=lambda x: len(x.values))

504 if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES:

505 revision_to_retest.retest()

506 else:

507 break

508 self._set_insufficient_confidence_warning()

509 return False	450 return False

510	451 return True

511	452

512 def get_exception(self):	453 def get_exception(self):

513 raise NotImplementedError() # pragma: no cover	454 raise NotImplementedError() # pragma: no cover

514 # TODO: should return an exception with the details of the failure.	455 # TODO: should return an exception with the details of the failure.

515	456

516 def _set_insufficient_confidence_warning(	457 def _set_insufficient_confidence_warning(

517 self): # pragma: no cover	458 self, actual_confidence): # pragma: no cover

518 """Adds a warning about the lack of initial regression confidence."""	459 """Adds a warning about the lack of initial regression confidence."""

519 self.failed_initial_confidence = True	460 self.failed_initial_confidence = True

520 self.surface_result('LO_INIT_CONF')	461 self.surface_result('LO_INIT_CONF')

521 self.warnings.append(	462 self.warnings.append(

522 'Bisect failed to reproduce the regression with enough confidence.')	463 ('Bisect failed to reproduce the regression with enough confidence. '

	464 'Needed {:.2f}%, got {:.2f}%.').format(

	465 self.required_initial_confidence, actual_confidence))

523	466

524 def _results_debug_message(self):	467 def _results_debug_message(self):

525 """Returns a string with values used to debug a bisect result."""	468 """Returns a string with values used to debug a bisect result."""

526 result = 'bisector.lkgr: %r\n' % self.lkgr	469 result = 'bisector.lkgr: %r\n' % self.lkgr

527 result += 'bisector.fkbr: %r\n\n' % self.fkbr	470 result += 'bisector.fkbr: %r\n\n' % self.fkbr

528 result += self._revision_value_table()	471 result += self._revision_value_table()

529 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):	472 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):

530 result += '\n' + self._t_test_results()	473 result += '\n' + self._t_test_results()

531 return result	474 return result

532	475

(...skipping 340 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
873	816

874 def surface_result(self, result_string):	817 def surface_result(self, result_string):

875 assert result_string in VALID_RESULT_CODES	818 assert result_string in VALID_RESULT_CODES

876 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. a la i18n).	819 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. a la i18n).

877 result_code = prefix + result_string	820 result_code = prefix + result_string

878 assert len(result_code) <= 20	821 assert len(result_code) <= 20

879 if result_code not in self.result_codes:	822 if result_code not in self.result_codes:

880 self.result_codes.add(result_code)	823 self.result_codes.add(result_code)

881 properties = self.api.m.step.active_result.presentation.properties	824 properties = self.api.m.step.active_result.presentation.properties

882 properties['extra_result_code'] = sorted(self.result_codes)	825 properties['extra_result_code'] = sorted(self.result_codes)

OLD	NEW

« no previous file with comments | « scripts/slave/recipe_modules/auto_bisect/api.py ('k') | scripts/slave/recipe_modules/auto_bisect/bisector_test.py » ('j') | no next file with comments »