scripts/slave/recipe_modules/auto_bisect/bisector.py - Issue 1610203003: Iteratively increase sample size for good/bad classification.

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/bisector.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master

Patch Set: Rebasing Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 import json	5 import json

6 import re	6 import re

	7 import time

7	8

8 from . import bisect_results	9 from . import bisect_results

9 from . import depot_config	10 from . import depot_config

10 from . import revision_state	11 from . import revision_state

11	12

12 _DEPS_SHA_PATCH = """	13 _DEPS_SHA_PATCH = """

13 diff --git DEPS.sha DEPS.sha	14 diff --git DEPS.sha DEPS.sha

14 new file mode 100644	15 new file mode 100644

15 --- /dev/null	16 --- /dev/null

16 +++ DEPS.sha	17 +++ DEPS.sha

(...skipping 18 matching lines...) Expand all Loading...
35 'LO_INIT_CONF', # Bisect aborted early for lack of confidence.	36 'LO_INIT_CONF', # Bisect aborted early for lack of confidence.

36 'MISSING_METRIC', # The metric was not found in the test text/json output.	37 'MISSING_METRIC', # The metric was not found in the test text/json output.

37 'LO_FINAL_CONF', # The bisect completed without a culprit.	38 'LO_FINAL_CONF', # The bisect completed without a culprit.

38 )	39 )

39	40

40 # When we look for the next revision to build, we search nearby revisions	41 # When we look for the next revision to build, we search nearby revisions

41 # looking for a revision that's already been archived. Since we don't want	42 # looking for a revision that's already been archived. Since we don't want

42 # to move too far from the original revision, we'll cap the search at 25%.	43 # to move too far from the original revision, we'll cap the search at 25%.

43 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25	44 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25

44	45

	46 # How long to re-test the initial good-bad range for until significant

	47 # difference is established.

	48 REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60

	49 # If we reach this number of samples on the reference range and have not

	50 # achieved statistical significance, bail.

	51 MAX_REQUIRED_SAMPLES = 50

	52

	53 # Significance level to use for determining difference between revisions via

	54 # hypothesis testing.

	55 SIGNIFICANCE_LEVEL = 0.01

	56

45	57

46 class Bisector(object):	58 class Bisector(object):

47 """This class abstracts an ongoing bisect (or n-sect) job."""	59 """This class abstracts an ongoing bisect (or n-sect) job."""

48	60

49 def __init__(self, api, bisect_config, revision_class, init_revisions=True):	61 def __init__(self, api, bisect_config, revision_class, init_revisions=True):

50 """Initializes the state of a new bisect job from a dictionary.	62 """Initializes the state of a new bisect job from a dictionary.

51	63

52 Note that the initial good_rev and bad_rev MUST resolve to a commit position	64 Note that the initial good_rev and bad_rev MUST resolve to a commit position

53 in the chromium repo.	65 in the chromium repo.

54 """	66 """

55 super(Bisector, self).__init__()	67 super(Bisector, self).__init__()

56 self._api = api	68 self._api = api

57 self.ensure_sync_master_branch()	69 self.ensure_sync_master_branch()

58 self.bisect_config = bisect_config	70 self.bisect_config = bisect_config

59 self.config_step()	71 self.config_step()

60 self.revision_class = revision_class	72 self.revision_class = revision_class

61 self.result_codes = set()	73 self.result_codes = set()

	74 self.last_tested_revision = None

62	75

63 # Test-only properties.	76 # Test-only properties.

64 # TODO: Replace these with proper mod_test_data.	77 # TODO: Replace these with proper mod_test_data.

65 self.dummy_initial_confidence = bisect_config.get(	78 self.dummy_initial_confidence = bisect_config.get(

66 'dummy_initial_confidence')	79 'dummy_initial_confidence')

67 self.dummy_builds = bisect_config.get('dummy_builds', False)	80 self.dummy_builds = bisect_config.get('dummy_builds', False)

68	81

69 # Load configuration items.	82 # Load configuration items.

70 self.test_type = bisect_config.get('test_type', 'perf')	83 self.test_type = bisect_config.get('test_type', 'perf')

71 self.improvement_direction = int(bisect_config.get(	84 self.improvement_direction = int(bisect_config.get(

(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
107 self.good_rev.good = True	120 self.good_rev.good = True

108 self.good_rev.read_deps(self.get_perf_tester_name())	121 self.good_rev.read_deps(self.get_perf_tester_name())

109 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [	122 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [

110 '%s: %s' % (key, value) for key, value in	123 '%s: %s' % (key, value) for key, value in

111 self.good_rev.deps.iteritems()]	124 self.good_rev.deps.iteritems()]

112 self.good_rev.deps = {}	125 self.good_rev.deps = {}

113 self.lkgr = self.good_rev	126 self.lkgr = self.good_rev

114 if init_revisions:	127 if init_revisions:

115 self._expand_chromium_revision_range()	128 self._expand_chromium_revision_range()

116	129

	130 def significantly_different(

	131 self, list_a, list_b,

	132 significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover

	133 """Uses an external script to run hypothesis testing with scipy.

	134

	135 The reason why we need an external script is that scipy is not available to

	136 the default python installed in all platforms. We instead rely on an

	137 anaconda environment to provide those packages.

	138

	139 Args:

	140 list_a, list_b: Two lists representing samples to be compared.

	141 significance_level: Self-describing. As a decimal fraction.

	142

	143 Returns:

	144 A boolean indicating whether the null hypothesis ~(that the lists are

	145 samples from the same population) can be rejected at the specified

	146 significance level.

	147 """

	148 step_result = self.api.m.python(

	149 'Checking sample difference',

	150 self.api.resource('significantly_different.py'),

	151 [json.dumps(list_a), json.dumps(list_b), str(significance_level)],

	152 stdout=self.api.m.json.output())

	153 results = step_result.stdout

	154 if results is None:

	155 assert self.dummy_builds

	156 return True

	157 significantly_different = results['significantly_different']

	158 step_result.presentation.logs[str(significantly_different)] = [

	159 'See json.output for details']

	160 return significantly_different

	161

117 def config_step(self):	162 def config_step(self):

118 """Yields a simple echo step that outputs the bisect config."""	163 """Yields a simple echo step that outputs the bisect config."""

119 api = self.api	164 api = self.api

120 # bisect_config may come as a FrozenDict (which is not serializable).	165 # bisect_config may come as a FrozenDict (which is not serializable).

121 bisect_config = dict(self.bisect_config)	166 bisect_config = dict(self.bisect_config)

122	167

123 def fix_windows_backslashes(s):	168 def fix_windows_backslashes(s):

124 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)')	169 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)')

125 return backslash_regex.sub(r'\\', s)	170 return backslash_regex.sub(r'\\', s)

126	171

(...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
405 self.failed_direction = True	450 self.failed_direction = True

406 self.warnings.append('The initial regression range for return code '	451 self.warnings.append('The initial regression range for return code '

407 'appears to show NO sign of a regression.')	452 'appears to show NO sign of a regression.')

408	453

409 def _set_failed_direction_results(self): # pragma: no cover	454 def _set_failed_direction_results(self): # pragma: no cover

410 self.failed_direction = True	455 self.failed_direction = True

411 self.warnings.append('The initial regression range appears to represent '	456 self.warnings.append('The initial regression range appears to represent '

412 'an improvement rather than a regression, given the '	457 'an improvement rather than a regression, given the '

413 'expected direction of improvement.')	458 'expected direction of improvement.')

414	459

415 def check_initial_confidence(self):	460 def check_initial_confidence(self): # pragma: no cover

416 """Checks that the initial range presents a clear enough regression.	461 """Checks that the initial range presents a clear enough regression.

417	462

418 We calculate the confidence of the results of the given 'good'	463 We calculate the confidence of the results of the given 'good'

419 and 'bad' revisions and compare it against the required confidence	464 and 'bad' revisions and compare it against the required confidence

420 set for the bisector.	465 set for the bisector.

421	466

422 Note that when a dummy regression confidence value has been set, that	467 Note that when a dummy regression confidence value has been set, that

423 is used instead.	468 is used instead.

424 """	469 """

425 if self.test_type != 'perf':	470 if self.test_type != 'perf':

426 return True	471 return True

427	472

428 if self.required_initial_confidence is None:	473 if self.required_initial_confidence is None:

429 return True # pragma: no cover	474 return True # pragma: no cover

430	475

	476 # TODO(robertocn): Remove all uses of "confidence".

431 if self.dummy_initial_confidence is not None:	477 if self.dummy_initial_confidence is not None:

432 self.initial_confidence = float(	478 self.initial_confidence = float(

433 self.dummy_initial_confidence)	479 self.dummy_initial_confidence)

	480 if (float(self.initial_confidence) <

	481 float(self.required_initial_confidence)):

	482 self._set_insufficient_confidence_warning()

	483 return False

	484 return True

434	485

435 else: # pragma: no cover	486 if self.dummy_builds:

436 if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5:	487 dummy_result = self.good_rev.values != self.bad_rev.values

437 # If there are too few values, the confidence score is not a good way to	488 if not dummy_result:

438 # determine whether the regression is reproducible.	489 self._set_insufficient_confidence_warning()

439 # TODO(robertocn): Investigate a straightforward approach to deal with	490 return dummy_result

440 # these cases. Such as the mean of one group lying within the range of	491

441 # the other.	492 with self.api.m.step.nest('Re-testing reference range'):

442 return True	493 expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT

443 self.initial_confidence = (	494 while time.time() < expiration_time:

444 self.api.m.math_utils.confidence_score(	495 if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5:

445 self.good_rev.values,	496 if self.significantly_different(self.good_rev.values,

446 self.bad_rev.values))	497 self.bad_rev.values):

447 if (self.initial_confidence <	498 return True

448 self.required_initial_confidence): # pragma: no cover	499 if len(self.good_rev.values) == len(self.bad_rev.values):

449 self._set_insufficient_confidence_warning(self.initial_confidence)	500 revision_to_retest = self.last_tested_revision

	501 else:

	502 revision_to_retest = min(self.good_rev, self.bad_rev,

	503 key=lambda x: len(x.values))

	504 if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES:

	505 revision_to_retest.retest()

	506 else:

	507 break

	508 self._set_insufficient_confidence_warning()

450 return False	509 return False

451 return True	510

452	511

453 def get_exception(self):	512 def get_exception(self):

454 raise NotImplementedError() # pragma: no cover	513 raise NotImplementedError() # pragma: no cover

455 # TODO: should return an exception with the details of the failure.	514 # TODO: should return an exception with the details of the failure.

456	515

457 def _set_insufficient_confidence_warning(	516 def _set_insufficient_confidence_warning(

458 self, actual_confidence): # pragma: no cover	517 self): # pragma: no cover

459 """Adds a warning about the lack of initial regression confidence."""	518 """Adds a warning about the lack of initial regression confidence."""

460 self.failed_initial_confidence = True	519 self.failed_initial_confidence = True

461 self.surface_result('LO_INIT_CONF')	520 self.surface_result('LO_INIT_CONF')

462 self.warnings.append(	521 self.warnings.append(

463 ('Bisect failed to reproduce the regression with enough confidence. '	522 'Bisect failed to reproduce the regression with enough confidence.')

464 'Needed {:.2f}%, got {:.2f}%.').format(

465 self.required_initial_confidence, actual_confidence))

466	523

467 def _results_debug_message(self):	524 def _results_debug_message(self):

468 """Returns a string with values used to debug a bisect result."""	525 """Returns a string with values used to debug a bisect result."""

469 result = 'bisector.lkgr: %r\n' % self.lkgr	526 result = 'bisector.lkgr: %r\n' % self.lkgr

470 result += 'bisector.fkbr: %r\n\n' % self.fkbr	527 result += 'bisector.fkbr: %r\n\n' % self.fkbr

471 result += self._revision_value_table()	528 result += self._revision_value_table()

472 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):	529 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):

473 result += '\n' + self._t_test_results()	530 result += '\n' + self._t_test_results()

474 return result	531 return result

475	532

(...skipping 340 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
816	873

817 def surface_result(self, result_string):	874 def surface_result(self, result_string):

818 assert result_string in VALID_RESULT_CODES	875 assert result_string in VALID_RESULT_CODES

819 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n).	876 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n).

820 result_code = prefix + result_string	877 result_code = prefix + result_string

821 assert len(result_code) <= 20	878 assert len(result_code) <= 20

822 if result_code not in self.result_codes:	879 if result_code not in self.result_codes:

823 self.result_codes.add(result_code)	880 self.result_codes.add(result_code)

824 properties = self.api.m.step.active_result.presentation.properties	881 properties = self.api.m.step.active_result.presentation.properties

825 properties['extra_result_code'] = sorted(self.result_codes)	882 properties['extra_result_code'] = sorted(self.result_codes)

OLD	NEW

« no previous file with comments | « scripts/slave/recipe_modules/auto_bisect/api.py ('k') | scripts/slave/recipe_modules/auto_bisect/bisector_test.py » ('j') | no next file with comments »