OLD | NEW |
(Empty) | |
| 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. |
| 4 |
| 5 import math |
| 6 import os |
| 7 |
| 8 import bisect_utils |
| 9 import math_utils |
| 10 import ttest |
| 11 |
| 12 |
def ConfidenceScore(good_results_lists, bad_results_lists):
  """Calculates a confidence score.

  This score is a percentage which represents our degree of confidence in the
  proposition that the good results and bad results are distinct groups, and
  their differences aren't due to chance alone.

  Args:
    good_results_lists: A list of lists of "good" result numbers.
    bad_results_lists: A list of lists of "bad" result numbers.

  Returns:
    A number in the range [0, 100].
  """
  # A single classified revision on either side means only one revision was
  # marked good or bad, which isn't enough evidence to make a decision. An
  # empty list likewise implies zero confidence.
  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
    return 0.0

  # Flatten each group of per-revision result lists into a single sample.
  good_sample = [value for results in good_results_lists for value in results]
  bad_sample = [value for results in bad_results_lists for value in results]

  # If either group contained nothing but empty lists (unexpected, and
  # normally shouldn't happen), that also means zero confidence.
  if not good_sample or not bad_sample:
    return 0.0

  # The p-value is approximately the probability of obtaining the given set
  # of good and bad values just by chance.
  _, _, p_value = ttest.WelchsTTest(good_sample, bad_sample)
  return 100.0 * (1.0 - p_value)
| 47 |
| 48 |
class BisectResults(object):
  """Accumulates per-revision bisect data and derives the final results."""

  def __init__(self, depot_registry, source_control):
    """Initializes a BisectResults instance.

    Args:
      depot_registry: An object used to change the current working directory
          to the checkout directory of a given depot.
      source_control: An object used to query revision information from
          source control.
    """
    self._depot_registry = depot_registry
    # Maps revision ids to dicts of data about each revision; filled in
    # externally as revisions are tested.
    self.revision_data = {}
    # Holds an error message if the bisect failed, otherwise None.
    self.error = None
    self._source_control = source_control

  @staticmethod
  def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_data_sorted: Sorted list of (revision, revision data) pairs.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    previous_values = []
    previous_id = None
    for current_id, current_data in revision_data_sorted:
      current_values = current_data['value']
      if current_values:
        current_values = current_values['values']
        if previous_values:
          confidence = ConfidenceScore(previous_values, [current_values])
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in same
          # direction.
          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
          is_same_direction = (prev_less_than_current if
              bad_greater_than_good else not prev_less_than_current)

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([current_id, previous_id, confidence])
        previous_values.append(current_values)
        previous_id = current_id
    return other_regressions

  def GetResultsDict(self):
    """Prepares and returns information about the final results as a dict.

    Returns:
      A dictionary with the following fields:

      'first_working_revision': First good revision.
      'last_broken_revision': Last bad revision.
      'culprit_revisions': A list of revisions, which contain the bad change
          introducing the failure.
      'other_regressions': A list of tuples representing other regressions,
          which may have occurred.
      'regression_size': For performance bisects, this is a relative change of
          the mean metric value. For other bisects this field always contains
          'zero-to-nonzero'.
      'regression_std_err': For performance bisects, it is a pooled standard
          error for groups of good and bad runs. Not used for other bisects.
      'confidence': For performance bisects, it is a confidence that the good
          and bad runs are distinct groups. Not used for non-performance
          bisects.
      'revision_data_sorted': dict mapping revision ids to data about that
          revision. Each piece of revision data consists of a dict with the
          following keys:

          'passed': Represents whether the performance test was successful at
              that revision. Possible values include: 1 (passed), 0 (failed),
              '?' (skipped), 'F' (build failed).
          'depot': The depot that this revision is from (i.e. WebKit)
          'external': If the revision is a 'src' revision, 'external' contains
              the revisions of each of the external libraries.
          'sort': A sort value for sorting the dict in order of commits.

          For example:
          {
            'CL #1':
              {
                'passed': False,
                'depot': 'chromium',
                'external': None,
                'sort': 0
              }
          }
    """
    # dict.items() (rather than Python-2-only iteritems()) keeps this working
    # on both Python 2 and 3; the list is immediately consumed by sorted().
    revision_data_sorted = sorted(self.revision_data.items(),
                                  key=lambda x: x[1]['sort'])

    # Find range where it possibly broke.
    first_working_revision = None
    first_working_revision_index = -1
    last_broken_revision = None
    last_broken_revision_index = -1

    culprit_revisions = []
    other_regressions = []
    regression_size = 0.0
    regression_std_err = 0.0
    confidence = 0.0

    for i, (k, v) in enumerate(revision_data_sorted):
      if v['passed'] == 1:
        # Compare against None explicitly so that a falsy revision id
        # (e.g. 0 or '') can't be mistaken for "not found yet".
        if first_working_revision is None:
          first_working_revision = k
          first_working_revision_index = i

      # NOTE: only a 'passed' value of 0 counts as broken here; skipped ('?')
      # and build-failed ('F') revisions are truthy and are ignored.
      if not v['passed']:
        last_broken_revision = k
        last_broken_revision_index = i

    if last_broken_revision is not None and first_working_revision is not None:
      # Collect the per-revision result values on the broken side (everything
      # up to and including the last broken revision) and on the working side
      # (everything from the first working revision onward).
      broken_means = []
      for i in range(0, last_broken_revision_index + 1):
        if revision_data_sorted[i][1]['value']:
          broken_means.append(revision_data_sorted[i][1]['value']['values'])

      working_means = []
      for i in range(first_working_revision_index, len(revision_data_sorted)):
        if revision_data_sorted[i][1]['value']:
          working_means.append(revision_data_sorted[i][1]['value']['values'])

      # Flatten the lists to calculate mean of all values.
      working_mean = sum(working_means, [])
      broken_mean = sum(broken_means, [])

      # Calculate the approximate size of the regression.
      mean_of_bad_runs = math_utils.Mean(broken_mean)
      mean_of_good_runs = math_utils.Mean(working_mean)

      regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                        mean_of_bad_runs)
      # RelativeChange returns NaN for a zero-to-nonzero change (e.g. a
      # functional bisect); report that case symbolically.
      if math.isnan(regression_size):
        regression_size = 'zero-to-nonzero'

      regression_std_err = math.fabs(math_utils.PooledStandardError(
          [working_mean, broken_mean]) /
          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

      # Give a "confidence" in the bisect. At the moment we use how distinct
      # the values are before and after the last broken revision, and how
      # noisy the overall graph is.
      confidence = ConfidenceScore(working_means, broken_means)

      culprit_revisions = []

      # Remember the current directory so it can be restored after the
      # depot-directory changes below.
      cwd = os.getcwd()
      self._depot_registry.ChangeToDepotDir(
          self.revision_data[last_broken_revision]['depot'])

      if self.revision_data[last_broken_revision]['depot'] == 'cros':
        # Want to get a list of all the commits and what depots they belong
        # to so that we can grab info about each.
        cmd = ['repo', 'forall', '-c',
            'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
            last_broken_revision, first_working_revision + 1)]
        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)

        changes = []
        assert not return_code, ('An error occurred while running '
                                 '"%s"' % ' '.join(cmd))
        last_depot = None
        cwd = os.getcwd()
        for line in output.split('\n'):
          if line:
            # Output will be in form:
            # /path_to_depot
            # /path_to_other_depot
            # <SHA1>
            # /path_again
            # <SHA1>
            # etc.
            if line[0] == '/':
              last_depot = line
            else:
              contents = line.split(' ')
              if len(contents) > 1:
                changes.append([last_depot, contents[0]])
        for change in changes:
          os.chdir(change[0])
          info = self._source_control.QueryRevisionInfo(change[1])
          culprit_revisions.append((change[1], info, None))
      else:
        # Walk from the last broken revision up to (but excluding) the first
        # working revision, gathering info about each candidate culprit.
        for i in range(last_broken_revision_index, len(revision_data_sorted)):
          k, v = revision_data_sorted[i]
          if k == first_working_revision:
            break
          self._depot_registry.ChangeToDepotDir(v['depot'])
          info = self._source_control.QueryRevisionInfo(k)
          culprit_revisions.append((k, info, v['depot']))
      os.chdir(cwd)

      # Check for any other possible regression ranges.
      other_regressions = self._FindOtherRegressions(
          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)

    return {
        'first_working_revision': first_working_revision,
        'last_broken_revision': last_broken_revision,
        'culprit_revisions': culprit_revisions,
        'other_regressions': other_regressions,
        'regression_size': regression_size,
        'regression_std_err': regression_std_err,
        'confidence': confidence,
        'revision_data_sorted': revision_data_sorted
    }
OLD | NEW |