| OLD | NEW |
| 1 # Copyright 2014 The Chromium Authors. All rights reserved. | 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import math | 5 import math |
| 6 import os | 6 import os |
| 7 | 7 |
| 8 import bisect_utils | 8 import bisect_utils |
| 9 import math_utils | 9 import math_utils |
| 10 import source_control | 10 import source_control |
| (...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 102 if 0 < confidence < bisect_utils.HIGH_CONFIDENCE: | 102 if 0 < confidence < bisect_utils.HIGH_CONFIDENCE: |
| 103 warnings.append('Confidence is not high. Try bisecting again ' | 103 warnings.append('Confidence is not high. Try bisecting again ' |
| 104 'with increased repeat_count, larger range, or ' | 104 'with increased repeat_count, larger range, or ' |
| 105 'on another metric.') | 105 'on another metric.') |
| 106 if not confidence: | 106 if not confidence: |
| 107 warnings.append('Confidence score is 0%. Try bisecting again on ' | 107 warnings.append('Confidence score is 0%. Try bisecting again on ' |
| 108 'another platform or another metric.') | 108 'another platform or another metric.') |
| 109 return warnings | 109 return warnings |
| 110 | 110 |
| 111 @staticmethod | 111 @staticmethod |
| 112 def ConfidenceScore(good_results_lists, bad_results_lists, | 112 def ConfidenceScore(sample1, sample2, |
| 113 accept_single_bad_or_good=False): | 113 accept_single_bad_or_good=False): |
| 114 """Calculates a confidence score. | 114 """Calculates a confidence score. |
| 115 | 115 |
| 116 This score is a percentage which represents our degree of confidence in the | 116 This score is a percentage which represents our degree of confidence in the |
| 117 proposition that the good results and bad results are distinct groups, and | 117 proposition that the good results and bad results are distinct groups, and |
| 118 their differences aren't due to chance alone. | 118 their differences aren't due to chance alone. |
| 119 | 119 |
| 120 | 120 |
| 121 Args: | 121 Args: |
| 122 good_results_lists: A list of lists of "good" result numbers. | 122 sample1: A flat list of "good" result numbers. |
| 123 bad_results_lists: A list of lists of "bad" result numbers. | 123 sample2: A flat list of "bad" result numbers. |
| 124 accept_single_bad_or_good: If True, computes confidence even if there is | 124 accept_single_bad_or_good: If True, computes confidence even if there is |
| 125 just one bad or good revision, otherwise single good or bad revision | 125 just one bad or good revision, otherwise single good or bad revision |
| 126 always returns 0.0 confidence. This flag will probably get away when | 126 always returns 0.0 confidence. This flag will probably go away when |
| 127 we will implement expanding the bisect range by one more revision for | 127 we implement expanding the bisect range by one more revision for |
| 128 such case. | 128 such a case. |
| 129 | 129 |
| 130 Returns: | 130 Returns: |
| 131 A number in the range [0, 100]. | 131 A number in the range [0, 100]. |
| 132 """ | 132 """ |
| 133 # If there's only one item in either list, this means only one revision was | 133 # If there's only one item in either list, this means only one revision was |
| 134 # classified good or bad; this isn't good enough evidence to make a | 134 # classified good or bad; this isn't good enough evidence to make a |
| 135 # decision. If an empty list was passed, that also implies zero confidence. | 135 # decision. If an empty list was passed, that also implies zero confidence. |
| 136 if not accept_single_bad_or_good: | 136 if not accept_single_bad_or_good: |
| 137 if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1: | 137 if len(sample1) <= 1 or len(sample2) <= 1: |
| 138 return 0.0 | 138 return 0.0 |
| 139 | 139 |
| 140 # Flatten the lists of results lists. | |
| 141 sample1 = sum(good_results_lists, []) | |
| 142 sample2 = sum(bad_results_lists, []) | |
| 143 | |
| 144 # If there were only empty lists in either of the lists (this is unexpected | 140 # If there were only empty lists in either of the lists (this is unexpected |
| 145 # and normally shouldn't happen), then we also want to return 0. | 141 # and normally shouldn't happen), then we also want to return 0. |
| 146 if not sample1 or not sample2: | 142 if not sample1 or not sample2: |
| 147 return 0.0 | 143 return 0.0 |
| 148 | 144 |
| 149 # The p-value is approximately the probability of obtaining the given set | 145 # The p-value is approximately the probability of obtaining the given set |
| 150 # of good and bad values just by chance. | 146 # of good and bad values just by chance. |
| 151 _, _, p_value = ttest.WelchsTTest(sample1, sample2) | 147 _, _, p_value = ttest.WelchsTTest(sample1, sample2) |
| 152 return 100.0 * (1.0 - p_value) | 148 return 100.0 * (1.0 - p_value) |
| 153 | 149 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 164 A list of [current_rev, previous_rev, confidence] for other places where | 160 A list of [current_rev, previous_rev, confidence] for other places where |
| 165 there may have been a regression. | 161 there may have been a regression. |
| 166 """ | 162 """ |
| 167 other_regressions = [] | 163 other_regressions = [] |
| 168 previous_values = [] | 164 previous_values = [] |
| 169 prev_state = None | 165 prev_state = None |
| 170 for revision_state in revision_states: | 166 for revision_state in revision_states: |
| 171 if revision_state.value: | 167 if revision_state.value: |
| 172 current_values = revision_state.value['values'] | 168 current_values = revision_state.value['values'] |
| 173 if previous_values: | 169 if previous_values: |
| 174 confidence = cls.ConfidenceScore(previous_values, [current_values], | 170 confidence_params = (sum(previous_values, []), |
| 171 sum([current_values], [])) |
| 172 confidence = cls.ConfidenceScore(*confidence_params, |
| 175 accept_single_bad_or_good=True) | 173 accept_single_bad_or_good=True) |
| 176 mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) | 174 mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) |
| 177 mean_of_current_runs = math_utils.Mean(current_values) | 175 mean_of_current_runs = math_utils.Mean(current_values) |
| 178 | 176 |
| 179 # Check that the potential regression is in the same direction as | 177 # Check that the potential regression is in the same direction as |
| 180 # the overall regression. If the mean of the previous runs < the | 178 # the overall regression. If the mean of the previous runs < the |
| 181 # mean of the current runs, this local regression is in same | 179 # mean of the current runs, this local regression is in same |
| 182 # direction. | 180 # direction. |
| 183 prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs | 181 prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs |
| 184 is_same_direction = (prev_greater_than_current if | 182 is_same_direction = (prev_greater_than_current if |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 246 if math.isnan(regression_size): | 244 if math.isnan(regression_size): |
| 247 regression_size = 'zero-to-nonzero' | 245 regression_size = 'zero-to-nonzero' |
| 248 | 246 |
| 249 regression_std_err = math.fabs(math_utils.PooledStandardError( | 247 regression_std_err = math.fabs(math_utils.PooledStandardError( |
| 250 [working_mean, broken_mean]) / | 248 [working_mean, broken_mean]) / |
| 251 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 | 249 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
| 252 | 250 |
| 253 # Give a "confidence" in the bisect. At the moment we use how distinct the | 251 # Give a "confidence" in the bisect. At the moment we use how distinct the |
| 254 # values are before and after the last broken revision, and how noisy the | 252 # values are before and after the last broken revision, and how noisy the |
| 255 # overall graph is. | 253 # overall graph is. |
| 256 confidence = cls.ConfidenceScore(working_means, broken_means) | 254 confidence_params = (sum(working_means, []), sum(broken_means, [])) |
| 255 confidence = cls.ConfidenceScore(*confidence_params) |
| 257 | 256 |
| 258 bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs | 257 bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs |
| 259 | 258 |
| 260 return {'regression_size': regression_size, | 259 return {'regression_size': regression_size, |
| 261 'regression_std_err': regression_std_err, | 260 'regression_std_err': regression_std_err, |
| 262 'confidence': confidence, | 261 'confidence': confidence, |
| 263 'bad_greater_than_good': bad_greater_than_good} | 262 'bad_greater_than_good': bad_greater_than_good} |
| OLD | NEW |