Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(323)

Side by Side Diff: tools/auto_bisect/bisect_results.py

Issue 554283003: Refactored bisect results dicts into a separate class (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addressed comments Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import math
6 import os
7
8 import bisect_utils
9 import math_utils
10 import ttest
11
12
def ConfidenceScore(good_results_lists, bad_results_lists):
  """Computes a confidence score for a good/bad partition of results.

  The score is the percentage confidence that the "good" and "bad" result
  groups are statistically distinct, i.e. that their difference is not
  explained by chance alone (based on Welch's t-test).

  Args:
    good_results_lists: A list of lists of "good" result numbers.
    bad_results_lists: A list of lists of "bad" result numbers.

  Returns:
    A number in the range [0, 100].
  """
  # A single classified revision on either side (or none at all) is not
  # enough evidence to make any claim, so report zero confidence.
  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
    return 0.0

  # Flatten each list-of-lists into one flat sample per group.
  good_sample = [value for results in good_results_lists for value in results]
  bad_sample = [value for results in bad_results_lists for value in results]

  # All-empty inner lists are unexpected, but also mean zero confidence.
  if not good_sample or not bad_sample:
    return 0.0

  # The p-value approximates the probability of seeing this split of good
  # and bad values purely by chance; confidence is its complement.
  _, _, p_value = ttest.WelchsTTest(good_sample, bad_sample)
  return 100.0 * (1.0 - p_value)
47
48
class BisectResults(object):
  """Accumulates per-revision bisect data and summarizes the final outcome.

  Callers populate `revision_data` (and optionally `error`) while the bisect
  runs, then call GetResultsDict() to compute the summary.
  """

  def __init__(self, depot_registry, source_control):
    # Used to switch the working directory to a given depot's checkout.
    self._depot_registry = depot_registry
    # Maps revision id -> dict with keys such as 'passed', 'value', 'depot',
    # 'external', 'sort' (see GetResultsDict docstring). Filled in by callers.
    self.revision_data = {}
    # Set by callers to an error message if the bisect failed; None otherwise.
    self.error = None
    # Used to query commit metadata for culprit revisions.
    self._source_control = source_control

  @staticmethod
  def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_data_sorted: Sorted list of (revision, revision data) pairs.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    # Accumulates the 'values' lists of every revision seen so far; each new
    # revision is compared against this growing "before" group.
    previous_values = []
    previous_id = None
    for current_id, current_data in revision_data_sorted:
      current_values = current_data['value']
      if current_values:
        # 'value' is a dict; the raw per-run numbers live under 'values'.
        current_values = current_values['values']
        if previous_values:
          confidence = ConfidenceScore(previous_values, [current_values])
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in same
          # direction.
          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
          is_same_direction = (prev_less_than_current if
              bad_greater_than_good else not prev_less_than_current)

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([current_id, previous_id, confidence])
        previous_values.append(current_values)
      # NOTE: previous_id advances even for revisions with no 'value', so the
      # reported pair is (current, immediately-preceding revision).
      previous_id = current_id
    return other_regressions

  def GetResultsDict(self):
    """Prepares and returns information about the final results as a dict.

    Returns:
      A dictionary with the following fields

      'first_working_revision': First good revision.
      'last_broken_revision': Last bad revision.
      'culprit_revisions': A list of revisions, which contain the bad change
          introducing the failure.
      'other_regressions': A list of tuples representing other regressions,
          which may have occurred.
      'regression_size': For performance bisects, this is a relative change of
          the mean metric value. For other bisects this field always contains
          'zero-to-nonzero'.
      'regression_std_err': For performance bisects, it is a pooled standard
          error for groups of good and bad runs. Not used for other bisects.
      'confidence': For performance bisects, it is a confidence that the good
          and bad runs are distinct groups. Not used for non-performance
          bisects.
      'revision_data_sorted': dict mapping revision ids to data about that
          revision. Each piece of revision data consists of a dict with the
          following keys:

          'passed': Represents whether the performance test was successful at
              that revision. Possible values include: 1 (passed), 0 (failed),
              '?' (skipped), 'F' (build failed).
          'depot': The depot that this revision is from (i.e. WebKit)
          'external': If the revision is a 'src' revision, 'external' contains
              the revisions of each of the external libraries.
          'sort': A sort value for sorting the dict in order of commits.

          For example:
          {
            'CL #1':
              {
                'passed': False,
                'depot': 'chromium',
                'external': None,
                'sort': 0
              }
          }
    """
    # Order revisions by commit order ('sort' key), oldest first.
    revision_data_sorted = sorted(self.revision_data.iteritems(),
                                  key = lambda x: x[1]['sort'])

    # Find range where it possibly broke.
    first_working_revision = None
    first_working_revision_index = -1
    last_broken_revision = None
    last_broken_revision_index = -1

    # Defaults returned when no broken/working boundary was found.
    culprit_revisions = []
    other_regressions = []
    regression_size = 0.0
    regression_std_err = 0.0
    confidence = 0.0

    # Scan for the first passing revision and the last failing one.
    for i in xrange(len(revision_data_sorted)):
      k, v = revision_data_sorted[i]
      if v['passed'] == 1:
        # NOTE(review): `not first_working_revision` would also match a falsy
        # revision id (e.g. 0 or '') — `is None` would be safer; confirm ids
        # are always truthy before changing.
        if not first_working_revision:
          first_working_revision = k
          first_working_revision_index = i

      if not v['passed']:
        last_broken_revision = k
        last_broken_revision_index = i

    if last_broken_revision != None and first_working_revision != None:
      # Collect the per-run value lists on each side of the boundary.
      broken_means = []
      for i in xrange(0, last_broken_revision_index + 1):
        if revision_data_sorted[i][1]['value']:
          broken_means.append(revision_data_sorted[i][1]['value']['values'])

      working_means = []
      for i in xrange(first_working_revision_index, len(revision_data_sorted)):
        if revision_data_sorted[i][1]['value']:
          working_means.append(revision_data_sorted[i][1]['value']['values'])

      # Flatten the lists to calculate mean of all values.
      working_mean = sum(working_means, [])
      broken_mean = sum(broken_means, [])

      # Calculate the approximate size of the regression
      mean_of_bad_runs = math_utils.Mean(broken_mean)
      mean_of_good_runs = math_utils.Mean(working_mean)

      regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                        mean_of_bad_runs)
      # NaN relative change means a zero baseline (e.g. a functional bisect);
      # report it symbolically instead of as a number.
      if math.isnan(regression_size):
        regression_size = 'zero-to-nonzero'

      # Pooled standard error, expressed as a percentage of the smaller mean
      # (clamped to avoid division by ~zero).
      regression_std_err = math.fabs(math_utils.PooledStandardError(
          [working_mean, broken_mean]) /
          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

      # Give a "confidence" in the bisect. At the moment we use how distinct the
      # values are before and after the last broken revision, and how noisy the
      # overall graph is.
      confidence = ConfidenceScore(working_means, broken_means)

      culprit_revisions = []

      # Remember the current directory so it can be restored after the depot
      # directory changes below.
      cwd = os.getcwd()
      self._depot_registry.ChangeToDepotDir(
          self.revision_data[last_broken_revision]['depot'])

      if self.revision_data[last_broken_revision]['depot'] == 'cros':
        # Want to get a list of all the commits and what depots they belong
        # to so that we can grab info about each.
        # NOTE(review): --before/--after are formatted with %d, so cros
        # revisions are assumed to be integer timestamps — confirm.
        cmd = ['repo', 'forall', '-c',
            'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
            last_broken_revision, first_working_revision + 1)]
        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)

        changes = []
        assert not return_code, ('An error occurred while running '
                                 '"%s"' % ' '.join(cmd))
        last_depot = None
        # NOTE(review): cwd is re-captured here after ChangeToDepotDir, so the
        # os.chdir(cwd) below restores the depot dir, not the original dir —
        # confirm this is intended.
        cwd = os.getcwd()
        for l in output.split('\n'):
          if l:
            # Output will be in form:
            # /path_to_depot
            # /path_to_other_depot
            # <SHA1>
            # /path_again
            # <SHA1>
            # etc.
            if l[0] == '/':
              last_depot = l
            else:
              contents = l.split(' ')
              if len(contents) > 1:
                changes.append([last_depot, contents[0]])
        for c in changes:
          # Query each commit's metadata from within its own depot directory.
          os.chdir(c[0])
          info = self._source_control.QueryRevisionInfo(c[1])
          culprit_revisions.append((c[1], info, None))
      else:
        # Walk from the last broken revision up to (excluding) the first
        # working one; every revision in between is a culprit candidate.
        for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
          k, v = revision_data_sorted[i]
          if k == first_working_revision:
            break
          self._depot_registry.ChangeToDepotDir(v['depot'])
          info = self._source_control.QueryRevisionInfo(k)
          culprit_revisions.append((k, info, v['depot']))
      os.chdir(cwd)

      # Check for any other possible regression ranges.
      other_regressions = self._FindOtherRegressions(
          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)

    return {
        'first_working_revision': first_working_revision,
        'last_broken_revision': last_broken_revision,
        'culprit_revisions': culprit_revisions,
        'other_regressions': other_regressions,
        'regression_size': regression_size,
        'regression_std_err': regression_std_err,
        'confidence': confidence,
        'revision_data_sorted': revision_data_sorted
    }
OLDNEW
« no previous file with comments | « tools/auto_bisect/bisect_perf_regression_test.py ('k') | tools/auto_bisect/bisect_results_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698