Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(31)

Side by Side Diff: appengine/findit/crash/loglinear/changelist_classifier.py

Issue 2617273002: [Predator] Move ``SingleFeatureScore`` to LLM. (Closed)
Patch Set: Address comment. Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2016 The Chromium Authors. All rights reserved. 1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 from collections import defaultdict 5 from collections import defaultdict
6 import logging 6 import logging
7 import math 7 import math
8 8
9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher 9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher
10 from crash import changelist_classifier 10 from crash import changelist_classifier
11 from crash.crash_report_with_dependencies import CrashReportWithDependencies 11 from crash.crash_report_with_dependencies import CrashReportWithDependencies
12 from crash.loglinear.changelist_features import min_distance 12 from crash.loglinear.changelist_features import min_distance
13 from crash.loglinear.changelist_features import top_frame_index 13 from crash.loglinear.changelist_features import top_frame_index
14 from crash.loglinear.model import ToFeatureFunction 14 from crash.loglinear.feature import FeatureFunction
15 from crash.loglinear.model import UnnormalizedLogLinearModel 15 from crash.loglinear.model import UnnormalizedLogLinearModel
16 from crash.stacktrace import CallStack 16 from crash.stacktrace import CallStack
17 from crash.stacktrace import Stacktrace 17 from crash.stacktrace import Stacktrace
18 from crash.suspect import StackInfo 18 from crash.suspect import StackInfo
19 19
20 20
21 class LogLinearChangelistClassifier(object): 21 class LogLinearChangelistClassifier(object):
22 """A ``LogLinearModel``-based implementation of CL classification.""" 22 """A ``LogLinearModel``-based implementation of CL classification."""
23 23
24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3): 24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3):
(...skipping 10 matching lines...) Expand all
35 for. We take this argument as a dict rather than as a list so that 35 for. We take this argument as a dict rather than as a list so that
36 callers needn't worry about what order to provide the weights in. 36 callers needn't worry about what order to provide the weights in.
37 top_n_frames (int): how many frames of each callstack to look at. 37 top_n_frames (int): how many frames of each callstack to look at.
38 top_n_suspects (int): maximum number of suspects to return. 38 top_n_suspects (int): maximum number of suspects to return.
39 """ 39 """
40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository) 40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository)
41 self._get_repository = get_repository 41 self._get_repository = get_repository
42 self._top_n_frames = top_n_frames 42 self._top_n_frames = top_n_frames
43 self._top_n_suspects = top_n_suspects 43 self._top_n_suspects = top_n_suspects
44 44
45 feature_function = ToFeatureFunction([ 45 feature_function = FeatureFunction([
46 top_frame_index.TopFrameIndexFeature(top_n_frames), 46 top_frame_index.TopFrameIndexFeature(top_n_frames),
47 min_distance.MinDistanceFeature(), 47 min_distance.MinDistanceFeature(),
48 ]) 48 ])
49 49
50 weight_list = [ 50 self._model = UnnormalizedLogLinearModel(feature_function, weights)
51 weights['TopFrameIndex'],
52 weights['MinDistance'],
53 ]
54
55 self._model = UnnormalizedLogLinearModel(feature_function, weight_list)
56
57 # TODO(crbug.com/674262): remove the need for storing these weights.
58 self._weights = weights
59
60 # TODO(crbug.com/673964): something better for detecting "close to log(0)".
61 def _LogZeroish(self, x):
62 """Determine whether a float is close enough to log(0).
63
64 If a ``FeatureValue`` has a (log-domain) score of -inf for a given
65 ``Suspect``, then that suspect has zero probability of being the
66 culprit. We want to filter these suspects out, to clean up the
67 output of classification; so this method encapsulates the logic of
68 that check.
69
70 Args:
71 x (float): the float to check
72
73 Returns:
74 ``True`` if ``x`` is close enough to log(0); else ``False``.
75 """
76 return x < 0 and math.isinf(x)
77
78 def _SingleFeatureScore(self, feature_value):
79 """Returns the score (aka weighted value) of a ``FeatureValue``.
80
81 This function assumes the report's stacktrace has already had any necessary
82 preprocessing (like filtering or truncating) applied.
83
84 Args:
85 feature_value (FeatureValue): the feature value to check.
86
87 Returns:
88 The score of the feature value.
89 """
90 return feature_value.value * self._weights.get(feature_value.name, 0.)
91 51
92 def __call__(self, report): 52 def __call__(self, report):
93 """Finds changelists suspected of being responsible for the crash report. 53 """Finds changelists suspected of being responsible for the crash report.
94 54
95 Args: 55 Args:
96 report (CrashReport): the report to be analyzed. 56 report (CrashReport): the report to be analyzed.
97 57
98 Returns: 58 Returns:
99 List of ``Suspect``s, sorted by probability from highest to lowest. 59 List of ``Suspect``s, sorted by probability from highest to lowest.
100 """ 60 """
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
144 return changelist_classifier.FindSuspects( 104 return changelist_classifier.FindSuspects(
145 dep_to_file_to_changelogs, 105 dep_to_file_to_changelogs,
146 dep_to_file_to_stack_infos, 106 dep_to_file_to_stack_infos,
147 report.dependencies, 107 report.dependencies,
148 self._get_repository, 108 self._get_repository,
149 ignore_cls) 109 ignore_cls)
150 110
def RankSuspects(self, report, suspects):
  """Returns a lineup of the suspects in order of likelihood.

  Suspects with a discardable score or lower ranking than top_n_suspects
  will be filtered.

  Args:
    report (CrashReportWithDependencies): the crash we seek to explain.
    suspects (iterable of Suspect): the CLs to consider blaming for the crash.

  Returns:
    A list of suspects in order according to their likelihood, most
    likely first. This list contains elements of the ``suspects``
    list, where we mutate some of the fields to store information
    about why that suspect is being blamed (e.g., the ``confidence``,
    ``reasons``, and ``changed_files`` fields are updated). In
    addition to sorting the suspects, we also filter out those which
    are exceedingly unlikely or don't make the ``top_n_suspects`` cut.
  """
  # Score the suspects and organize them for outputting/returning.
  features_given_report = self._model.Features(report)
  score_given_report = self._model.Score(report)

  scored_suspects = []
  for suspect in suspects:
    score = score_given_report(suspect)
    if self._model.LogZeroish(score):
      # A log(0)-ish score means zero probability; drop the suspect.
      logging.debug('Discarding suspect because it has zero probability: %s'
          % str(suspect.ToDict()))
      continue

    suspect.confidence = score
    features = features_given_report(suspect)
    suspect.reasons = self._model.FormatReasons(features.itervalues())
    suspect.changed_files = [
        changed_file.ToDict() for changed_file in
        self._model.AggregateChangedFiles(features.itervalues())]
    scored_suspects.append(suspect)

  # BUG FIX: sort descending so the returned list is most-likely-first,
  # as documented; a plain ascending sort followed by the truncation
  # below would keep the *least* likely suspects instead.
  scored_suspects.sort(key=lambda suspect: suspect.confidence, reverse=True)
  return scored_suspects[:self._top_n_suspects]
189
def FormatReasons(self, features):
  """Gathers and formats the ``reason`` strings of the given features.

  Features whose weighted score is effectively log(0) are skipped,
  since they contribute no probability mass to the suspect.

  Args:
    features (list of FeatureValue): the values whose ``reason``
      strings should be collected.

  Returns:
    A list of ``(str, float, str)`` triples; where the first string is
    the feature name, the float is some numeric representation of how
    much influence this feature exerts on the ``Suspect`` being blamed,
    and the final string is the ``FeatureValue.reason``. The list is
    sorted by feature name, just to ensure that it comes out in some
    canonical order.

    At present, the float is the log-domain score of the feature
    value. However, this isn't the best thing for UX reasons. In the
    future it might be replaced by the normal-domain score, or by
    the probability.
  """
  triples = []
  for feature_value in features:
    score = self._SingleFeatureScore(feature_value)
    if not self._LogZeroish(score):
      triples.append((feature_value.name, score, feature_value.reason))
    else:  # pragma: no cover
      logging.debug('Discarding reasons from feature %s'
                    ' because it has zero probability' % feature_value.name)

  # Sort by feature name for a canonical output order.
  triples.sort(key=lambda triple: triple[0])
  return triples
222
def AggregateChangedFiles(self, features):
  """Merge multiple ``FeatureValue.changed_files`` lists into one.

  A file reported by more than one feature is collapsed into a single
  ``ChangedFile`` whose ``reasons`` list accumulates the reasons from
  every occurrence. Features scoring log(0) are ignored entirely.

  Args:
    features (list of FeatureValue): the values whose ``changed_files``
      lists should be aggregated.

  Returns:
    A list of ``ChangedFile`` objects sorted by file name. The sorting
    is not essential, but is provided to ease testing by ensuring the
    output is in some canonical order.

  Raises:
    ``ValueError`` if any file name is given inconsistent ``blame_url``s.
  """
  merged = {}
  for feature_value in features:
    if self._LogZeroish(
        self._SingleFeatureScore(feature_value)):  # pragma: no cover
      logging.debug('Discarding changed files from feature %s'
                    ' because it has zero probability' % feature_value.name)
      continue

    for new_file in feature_value.changed_files or []:
      seen_file = merged.get(new_file.name)
      if seen_file is None:
        # First time we see this file name; keep the object as-is.
        merged[new_file.name] = new_file
      else:
        if seen_file.blame_url != new_file.blame_url:  # pragma: no cover
          raise ValueError('Blame URLs do not match: %s != %s'
              % (seen_file.blame_url, new_file.blame_url))
        seen_file.reasons.extend(new_file.reasons or [])

  return sorted(merged.values(), key=lambda changed_file: changed_file.name)
OLDNEW
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/feature.py » ('j') | appengine/findit/crash/loglinear/feature.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698