appengine/findit/crash/loglinear/changelist_classifier.py - Issue 2617273002: [Predator] Move ``SingleFeatureScore`` to LLM.

Side by Side Diff: appengine/findit/crash/loglinear/changelist_classifier.py

Issue 2617273002: [Predator] Move ``SingleFeatureScore`` to LLM. (Closed)

Patch Set: Update doc strs. Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 # Copyright 2016 The Chromium Authors. All rights reserved.	1 # Copyright 2016 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 from collections import defaultdict	5 from collections import defaultdict

6 import logging	6 import logging

7 import math	7 import math

8	8

9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher	9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher

10 from crash import changelist_classifier	10 from crash import changelist_classifier

11 from crash.crash_report_with_dependencies import CrashReportWithDependencies	11 from crash.crash_report_with_dependencies import CrashReportWithDependencies

12 from crash.loglinear.changelist_features import min_distance	12 from crash.loglinear.changelist_features import min_distance

13 from crash.loglinear.changelist_features import top_frame_index	13 from crash.loglinear.changelist_features import top_frame_index

14 from crash.loglinear.model import ToFeatureFunction	14 from crash.loglinear.feature import FeatureFunction

15 from crash.loglinear.model import UnnormalizedLogLinearModel	15 from crash.loglinear.model import UnnormalizedLogLinearModel

16 from crash.stacktrace import CallStack	16 from crash.stacktrace import CallStack

17 from crash.stacktrace import Stacktrace	17 from crash.stacktrace import Stacktrace

18 from crash.suspect import StackInfo	18 from crash.suspect import StackInfo

19	19

20	20

21 class LogLinearChangelistClassifier(object):	21 class LogLinearChangelistClassifier(object):

22 """A ``LogLinearModel``-based implementation of CL classification."""	22 """A ``LogLinearModel``-based implementation of CL classification."""

23	23

24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3):	24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3):

(...skipping 10 matching lines...) Expand all Loading...
35 for. We take this argument as a dict rather than as a list so that	35 for. We take this argument as a dict rather than as a list so that

36 callers needn't worry about what order to provide the weights in.	36 callers needn't worry about what order to provide the weights in.

37 top_n_frames (int): how many frames of each callstack to look at.	37 top_n_frames (int): how many frames of each callstack to look at.

38 top_n_suspects (int): maximum number of suspects to return.	38 top_n_suspects (int): maximum number of suspects to return.

39 """	39 """

40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository)	40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository)

41 self._get_repository = get_repository	41 self._get_repository = get_repository

42 self._top_n_frames = top_n_frames	42 self._top_n_frames = top_n_frames

43 self._top_n_suspects = top_n_suspects	43 self._top_n_suspects = top_n_suspects

44	44

45 feature_function = ToFeatureFunction([	45 feature_function = FeatureFunction([

46 top_frame_index.TopFrameIndexFeature(top_n_frames),	46 top_frame_index.TopFrameIndexFeature(top_n_frames),

47 min_distance.MinDistanceFeature(),	47 min_distance.MinDistanceFeature(),

48 ])	48 ])

49	49

50 weight_list = [	50 self._model = UnnormalizedLogLinearModel(feature_function, weights)

51 weights['TopFrameIndex'],

52 weights['MinDistance'],

53 ]

54

55 self._model = UnnormalizedLogLinearModel(feature_function, weight_list)

56

57 # TODO(crbug.com/674262): remove the need for storing these weights.

58 self._weights = weights

59	51

60 # TODO(crbug.com/673964): something better for detecting "close to log(0)".	52 # TODO(crbug.com/673964): something better for detecting "close to log(0)".

61 def _LogZeroish(self, x):	53 def _LogZeroish(self, x):
	wrengr (2017/01/11 20:38:30): This should also be moved to UnnormalizedLLM. That way it can be based on the same epsilon used there, rather than only checking that it is exactly equal to -inf.
	Sharu Jiang (2017/01/12 01:41:38): Done.
62 """Determine whether a float is close enough to log(0).	54 """Determine whether a float is close enough to log(0).

63	55

64 If a ``FeatureValue`` has a (log-domain) score of -inf for a given	56 If a ``FeatureValue`` has a (log-domain) score of -inf for a given

65 ``Suspect``, then that suspect has zero probability of being the	57 ``Suspect``, then that suspect has zero probability of being the

66 culprit. We want to filter these suspects out, to clean up the	58 culprit. We want to filter these suspects out, to clean up the

67 output of classification; so this method encapsulates the logic of	59 output of classification; so this method encapsulates the logic of

68 that check.	60 that check.

69	61

70 Args:	62 Args:

71 x (float): the float to check	63 x (float): the float to check

72	64

73 Returns:	65 Returns:

74 ``True`` if ``x`` is close enough to log(0); else ``False``.	66 ``True`` if ``x`` is close enough to log(0); else ``False``.

75 """	67 """

76 return x < 0 and math.isinf(x)	68 return x < 0 and math.isinf(x)

77	69

78 def _SingleFeatureScore(self, feature_value):

79 """Returns the score (aka weighted value) of a ``FeatureValue``.

80

81 This function assumes the report's stacktrace has already had any necessary

82 preprocessing (like filtering or truncating) applied.

83

84 Args:

85 feature_value (FeatureValue): the feature value to check.

86

87 Returns:

88 The score of the feature value.

89 """

90 return feature_value.value * self._weights.get(feature_value.name, 0.)

91

92 def __call__(self, report):	70 def __call__(self, report):

93 """Finds changelists suspected of being responsible for the crash report.	71 """Finds changelists suspected of being responsible for the crash report.

94	72

95 Args:	73 Args:

96 report (CrashReport): the report to be analyzed.	74 report (CrashReport): the report to be analyzed.

97	75

98 Returns:	76 Returns:

99 List of ``Suspect``s, sorted by probability from highest to lowest.	77 List of ``Suspect``s, sorted by probability from highest to lowest.

100 """	78 """

101 report = CrashReportWithDependencies(report, self._dependency_fetcher)	79 report = CrashReportWithDependencies(report, self._dependency_fetcher)

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
135 scored_suspects = []	113 scored_suspects = []

136 for suspect in suspects:	114 for suspect in suspects:

137 score = score_given_report(suspect)	115 score = score_given_report(suspect)

138 if self._LogZeroish(score):	116 if self._LogZeroish(score):

139 logging.debug('Discarding suspect because it has zero probability: %s'	117 logging.debug('Discarding suspect because it has zero probability: %s'

140 % str(suspect.ToDict()))	118 % str(suspect.ToDict()))

141 continue	119 continue

142	120

143 suspect.confidence = score	121 suspect.confidence = score

144 features = features_given_report(suspect)	122 features = features_given_report(suspect)

145 suspect.reasons = self.FormatReasons(features)	123 suspect.reasons = self.FormatReasons(features.itervalues())

146 suspect.changed_files = [	124 suspect.changed_files = [

147 changed_file.ToDict()	125 changed_file.ToDict()

148 for changed_file in self.AggregateChangedFiles(features)]	126 for changed_file in self.AggregateChangedFiles(features.itervalues())]

149 scored_suspects.append(suspect)	127 scored_suspects.append(suspect)

150	128

151 scored_suspects.sort(key=lambda suspect: suspect.confidence)	129 scored_suspects.sort(key=lambda suspect: suspect.confidence)

152 return scored_suspects[:self._top_n_suspects]	130 return scored_suspects[:self._top_n_suspects]

153	131

154 def FormatReasons(self, features):	132 def FormatReasons(self, features):

155 """Collect and format a list of all ``FeatureValue.reason`` strings.	133 """Collect and format a list of all ``FeatureValue.reason`` strings.

156	134

157 Args:	135 Args:

158 features (list of FeatureValue): the values whose ``reason``	136 features (list of FeatureValue): the values whose ``reason``
	wrengr (2017/01/11 20:38:30): Since now we're passing an iterator rather than a list, you should update this to say either "iterator" or "iterable" (the function doesn't really care which it gets).
	Sharu Jiang (2017/01/12 01:41:37): Done.
159 strings should be collected.	137 strings should be collected.

160	138

161 Returns:	139 Returns:

162 A list of ``(str, float, str)`` triples; where the first string is	140 A list of ``(str, float, str)`` triples; where the first string is

163 the feature name, the float is some numeric representation of how	141 the feature name, the float is some numeric representation of how

164 much influence this feature exerts on the ``Suspect`` being blamed,	142 much influence this feature exerts on the ``Suspect`` being blamed,

165 and the final string is the ``FeatureValue.reason``. The list is	143 and the final string is the ``FeatureValue.reason``. The list is

166 sorted by feature name, just to ensure that it comes out in some	144 sorted by feature name, just to ensure that it comes out in some

167 canonical order.	145 canonical order.

168	146

169 At present, the float is the log-domain score of the feature	147 At present, the float is the log-domain score of the feature

170 value. However, this isn't the best thing for UX reasons. In the	148 value. However, this isn't the best thing for UX reasons. In the

171 future it might be replaced by the normal-domain score, or by	149 future it might be replaced by the normal-domain score, or by

172 the probability.	150 the probability.

173 """	151 """

174 formatted_reasons = []	152 formatted_reasons = []

175 for feature in features:	153 for feature in features:

176 feature_score = self._SingleFeatureScore(feature)	154 feature_score = self._model.SingleFeatureScore(feature)

177 if self._LogZeroish(feature_score): # pragma: no cover	155 if self._LogZeroish(feature_score): # pragma: no cover

178 logging.debug('Discarding reasons from feature %s'	156 logging.debug('Discarding reasons from feature %s'

179 ' because it has zero probability' % feature.name)	157 ' because it has zero probability' % feature.name)

180 continue	158 continue

181	159

182 formatted_reasons.append((feature.name, feature_score, feature.reason))	160 formatted_reasons.append((feature.name, feature_score, feature.reason))

183	161

184 return sorted(formatted_reasons,	162 return sorted(formatted_reasons,
	wrengr (2017/01/11 20:38:30): Unrelated to the CL's goals, but I just noticed: this should be changed to ``formatted_reasons.sort(...); return formatted_reasons`` since there's no point in allocating a new list here.
	Sharu Jiang (2017/01/12 01:41:37): Done.
185 key=lambda formatted_reason: formatted_reason[0])	163 key=lambda formatted_reason: formatted_reason[0])

186	164

187 def AggregateChangedFiles(self, features):	165 def AggregateChangedFiles(self, features):

188 """Merge multiple``FeatureValue.changed_files`` lists into one.	166 """Merge multiple``FeatureValue.changed_files`` lists into one.

189	167

190 Args:	168 Args:

191 features (list of FeatureValue): the values whose ``changed_files``	169 features (list of FeatureValue): the values whose ``changed_files``
	wrengr (2017/01/11 20:38:30): ditto
	Sharu Jiang (2017/01/12 01:41:37): Done.
192 lists should be aggregated.	170 lists should be aggregated.

193	171

194 Returns:	172 Returns:

195 A list of ``ChangedFile`` objects sorted by file name. The sorting	173 A list of ``ChangedFile`` objects sorted by file name. The sorting

196 is not essential, but is provided to ease testing by ensuring the	174 is not essential, but is provided to ease testing by ensuring the

197 output is in some canonical order.	175 output is in some canonical order.

198	176

199 Raises:	177 Raises:

200 ``ValueError`` if any file name is given inconsistent ``blame_url``s.	178 ``ValueError`` if any file name is given inconsistent ``blame_url``s.

201 """	179 """

202 all_changed_files = {}	180 all_changed_files = {}

203 for feature in features:	181 for feature in features:

204 if self._LogZeroish(self._SingleFeatureScore(feature)): # pragma: no cover	182 if self._LogZeroish(

	183 self._model.SingleFeatureScore(feature)): # pragma: no cover

205 logging.debug('Discarding changed files from feature %s'	184 logging.debug('Discarding changed files from feature %s'

206 ' because it has zero probability' % feature.name)	185 ' because it has zero probability' % feature.name)

207 continue	186 continue

208	187

209 for changed_file in feature.changed_files or []:	188 for changed_file in feature.changed_files or []:

210 accumulated_changed_file = all_changed_files.get(changed_file.name)	189 accumulated_changed_file = all_changed_files.get(changed_file.name)

211 if accumulated_changed_file is None:	190 if accumulated_changed_file is None:

212 all_changed_files[changed_file.name] = changed_file	191 all_changed_files[changed_file.name] = changed_file

213 continue	192 continue

214	193

215 if (accumulated_changed_file.blame_url !=	194 if (accumulated_changed_file.blame_url !=

216 changed_file.blame_url): # pragma: no cover	195 changed_file.blame_url): # pragma: no cover

217 raise ValueError('Blame URLs do not match: %s != %s'	196 raise ValueError('Blame URLs do not match: %s != %s'

218 % (accumulated_changed_file.blame_url, changed_file.blame_url))	197 % (accumulated_changed_file.blame_url, changed_file.blame_url))

219 accumulated_changed_file.reasons.extend(changed_file.reasons or [])	198 accumulated_changed_file.reasons.extend(changed_file.reasons or [])

220	199

221 return sorted(all_changed_files.values(),	200 return sorted(all_changed_files.values(),
	wrengr (2017/01/11 20:38:30): ditto.
	Sharu Jiang (2017/01/12 01:41:37): Done.
222 key=lambda changed_file: changed_file.name)	201 key=lambda changed_file: changed_file.name)

OLD	NEW

« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/feature.py » ('j') | appengine/findit/crash/loglinear/feature.py » ('J')