Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(94)

Side by Side Diff: appengine/findit/crash/loglinear/changelist_classifier.py

Issue 2617273002: [Predator] Move ``SingleFeatureScore`` to LLM. (Closed)
Patch Set: Update doc strs. Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2016 The Chromium Authors. All rights reserved. 1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 from collections import defaultdict 5 from collections import defaultdict
6 import logging 6 import logging
7 import math 7 import math
8 8
9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher 9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher
10 from crash import changelist_classifier 10 from crash import changelist_classifier
11 from crash.crash_report_with_dependencies import CrashReportWithDependencies 11 from crash.crash_report_with_dependencies import CrashReportWithDependencies
12 from crash.loglinear.changelist_features import min_distance 12 from crash.loglinear.changelist_features import min_distance
13 from crash.loglinear.changelist_features import top_frame_index 13 from crash.loglinear.changelist_features import top_frame_index
14 from crash.loglinear.model import ToFeatureFunction 14 from crash.loglinear.feature import FeatureFunction
15 from crash.loglinear.model import UnnormalizedLogLinearModel 15 from crash.loglinear.model import UnnormalizedLogLinearModel
16 from crash.stacktrace import CallStack 16 from crash.stacktrace import CallStack
17 from crash.stacktrace import Stacktrace 17 from crash.stacktrace import Stacktrace
18 from crash.suspect import StackInfo 18 from crash.suspect import StackInfo
19 19
20 20
21 class LogLinearChangelistClassifier(object): 21 class LogLinearChangelistClassifier(object):
22 """A ``LogLinearModel``-based implementation of CL classification.""" 22 """A ``LogLinearModel``-based implementation of CL classification."""
23 23
24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3): 24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3):
(...skipping 10 matching lines...) Expand all
35 for. We take this argument as a dict rather than as a list so that 35 for. We take this argument as a dict rather than as a list so that
36 callers needn't worry about what order to provide the weights in. 36 callers needn't worry about what order to provide the weights in.
37 top_n_frames (int): how many frames of each callstack to look at. 37 top_n_frames (int): how many frames of each callstack to look at.
38 top_n_suspects (int): maximum number of suspects to return. 38 top_n_suspects (int): maximum number of suspects to return.
39 """ 39 """
40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository) 40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository)
41 self._get_repository = get_repository 41 self._get_repository = get_repository
42 self._top_n_frames = top_n_frames 42 self._top_n_frames = top_n_frames
43 self._top_n_suspects = top_n_suspects 43 self._top_n_suspects = top_n_suspects
44 44
45 feature_function = ToFeatureFunction([ 45 feature_function = FeatureFunction([
46 top_frame_index.TopFrameIndexFeature(top_n_frames), 46 top_frame_index.TopFrameIndexFeature(top_n_frames),
47 min_distance.MinDistanceFeature(), 47 min_distance.MinDistanceFeature(),
48 ]) 48 ])
49 49
50 weight_list = [ 50 self._model = UnnormalizedLogLinearModel(feature_function, weights)
51 weights['TopFrameIndex'],
52 weights['MinDistance'],
53 ]
54
55 self._model = UnnormalizedLogLinearModel(feature_function, weight_list)
56
57 # TODO(crbug.com/674262): remove the need for storing these weights.
58 self._weights = weights
59 51
60 # TODO(crbug.com/673964): something better for detecting "close to log(0)". 52 # TODO(crbug.com/673964): something better for detecting "close to log(0)".
61 def _LogZeroish(self, x): 53 def _LogZeroish(self, x):
wrengr 2017/01/11 20:38:30 This should also be moved to UnnormalizedLLM. That
Sharu Jiang 2017/01/12 01:41:38 Done.
62 """Determine whether a float is close enough to log(0). 54 """Determine whether a float is close enough to log(0).
63 55
64 If a ``FeatureValue`` has a (log-domain) score of -inf for a given 56 If a ``FeatureValue`` has a (log-domain) score of -inf for a given
65 ``Suspect``, then that suspect has zero probability of being the 57 ``Suspect``, then that suspect has zero probability of being the
66 culprit. We want to filter these suspects out, to clean up the 58 culprit. We want to filter these suspects out, to clean up the
67 output of classification; so this method encapsulates the logic of 59 output of classification; so this method encapsulates the logic of
68 that check. 60 that check.
69 61
70 Args: 62 Args:
71 x (float): the float to check 63 x (float): the float to check
72 64
73 Returns: 65 Returns:
74 ``True`` if ``x`` is close enough to log(0); else ``False``. 66 ``True`` if ``x`` is close enough to log(0); else ``False``.
75 """ 67 """
76 return x < 0 and math.isinf(x) 68 return x < 0 and math.isinf(x)
77 69
78 def _SingleFeatureScore(self, feature_value):
79 """Returns the score (aka weighted value) of a ``FeatureValue``.
80
81 This function assumes the report's stacktrace has already had any necessary
82 preprocessing (like filtering or truncating) applied.
83
84 Args:
85 feature_value (FeatureValue): the feature value to check.
86
87 Returns:
88 The score of the feature value.
89 """
90 return feature_value.value * self._weights.get(feature_value.name, 0.)
91
92 def __call__(self, report): 70 def __call__(self, report):
93 """Finds changelists suspected of being responsible for the crash report. 71 """Finds changelists suspected of being responsible for the crash report.
94 72
95 Args: 73 Args:
96 report (CrashReport): the report to be analyzed. 74 report (CrashReport): the report to be analyzed.
97 75
98 Returns: 76 Returns:
99 List of ``Suspect``s, sorted by probability from highest to lowest. 77 List of ``Suspect``s, sorted by probability from highest to lowest.
100 """ 78 """
101 report = CrashReportWithDependencies(report, self._dependency_fetcher) 79 report = CrashReportWithDependencies(report, self._dependency_fetcher)
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
135 scored_suspects = [] 113 scored_suspects = []
136 for suspect in suspects: 114 for suspect in suspects:
137 score = score_given_report(suspect) 115 score = score_given_report(suspect)
138 if self._LogZeroish(score): 116 if self._LogZeroish(score):
139 logging.debug('Discarding suspect because it has zero probability: %s' 117 logging.debug('Discarding suspect because it has zero probability: %s'
140 % str(suspect.ToDict())) 118 % str(suspect.ToDict()))
141 continue 119 continue
142 120
143 suspect.confidence = score 121 suspect.confidence = score
144 features = features_given_report(suspect) 122 features = features_given_report(suspect)
145 suspect.reasons = self.FormatReasons(features) 123 suspect.reasons = self.FormatReasons(features.itervalues())
146 suspect.changed_files = [ 124 suspect.changed_files = [
147 changed_file.ToDict() 125 changed_file.ToDict()
148 for changed_file in self.AggregateChangedFiles(features)] 126 for changed_file in self.AggregateChangedFiles(features.itervalues())]
149 scored_suspects.append(suspect) 127 scored_suspects.append(suspect)
150 128
151 scored_suspects.sort(key=lambda suspect: suspect.confidence) 129 scored_suspects.sort(key=lambda suspect: suspect.confidence)
152 return scored_suspects[:self._top_n_suspects] 130 return scored_suspects[:self._top_n_suspects]
153 131
154 def FormatReasons(self, features): 132 def FormatReasons(self, features):
155 """Collect and format a list of all ``FeatureValue.reason`` strings. 133 """Collect and format a list of all ``FeatureValue.reason`` strings.
156 134
157 Args: 135 Args:
158 features (list of FeatureValue): the values whose ``reason`` 136 features (list of FeatureValue): the values whose ``reason``
wrengr 2017/01/11 20:38:30 since now we're passing an iterator rather than a
Sharu Jiang 2017/01/12 01:41:37 Done.
159 strings should be collected. 137 strings should be collected.
160 138
161 Returns: 139 Returns:
162 A list of ``(str, float, str)`` triples; where the first string is 140 A list of ``(str, float, str)`` triples; where the first string is
163 the feature name, the float is some numeric representation of how 141 the feature name, the float is some numeric representation of how
164 much influence this feature exerts on the ``Suspect`` being blamed, 142 much influence this feature exerts on the ``Suspect`` being blamed,
165 and the final string is the ``FeatureValue.reason``. The list is 143 and the final string is the ``FeatureValue.reason``. The list is
166 sorted by feature name, just to ensure that it comes out in some 144 sorted by feature name, just to ensure that it comes out in some
167 canonical order. 145 canonical order.
168 146
169 At present, the float is the log-domain score of the feature 147 At present, the float is the log-domain score of the feature
170 value. However, this isn't the best thing for UX reasons. In the 148 value. However, this isn't the best thing for UX reasons. In the
171 future it might be replaced by the normal-domain score, or by 149 future it might be replaced by the normal-domain score, or by
172 the probability. 150 the probability.
173 """ 151 """
174 formatted_reasons = [] 152 formatted_reasons = []
175 for feature in features: 153 for feature in features:
176 feature_score = self._SingleFeatureScore(feature) 154 feature_score = self._model.SingleFeatureScore(feature)
177 if self._LogZeroish(feature_score): # pragma: no cover 155 if self._LogZeroish(feature_score): # pragma: no cover
178 logging.debug('Discarding reasons from feature %s' 156 logging.debug('Discarding reasons from feature %s'
179 ' because it has zero probability' % feature.name) 157 ' because it has zero probability' % feature.name)
180 continue 158 continue
181 159
182 formatted_reasons.append((feature.name, feature_score, feature.reason)) 160 formatted_reasons.append((feature.name, feature_score, feature.reason))
183 161
184 return sorted(formatted_reasons, 162 return sorted(formatted_reasons,
wrengr 2017/01/11 20:38:30 unrelated to the CL's goals, but I just noticed: T
Sharu Jiang 2017/01/12 01:41:37 Done.
185 key=lambda formatted_reason: formatted_reason[0]) 163 key=lambda formatted_reason: formatted_reason[0])
186 164
187 def AggregateChangedFiles(self, features): 165 def AggregateChangedFiles(self, features):
188 """Merge multiple ``FeatureValue.changed_files`` lists into one. 166 """Merge multiple ``FeatureValue.changed_files`` lists into one.
189 167
190 Args: 168 Args:
191 features (list of FeatureValue): the values whose ``changed_files`` 169 features (list of FeatureValue): the values whose ``changed_files``
wrengr 2017/01/11 20:38:30 ditto
Sharu Jiang 2017/01/12 01:41:37 Done.
192 lists should be aggregated. 170 lists should be aggregated.
193 171
194 Returns: 172 Returns:
195 A list of ``ChangedFile`` objects sorted by file name. The sorting 173 A list of ``ChangedFile`` objects sorted by file name. The sorting
196 is not essential, but is provided to ease testing by ensuring the 174 is not essential, but is provided to ease testing by ensuring the
197 output is in some canonical order. 175 output is in some canonical order.
198 176
199 Raises: 177 Raises:
200 ``ValueError`` if any file name is given inconsistent ``blame_url``s. 178 ``ValueError`` if any file name is given inconsistent ``blame_url``s.
201 """ 179 """
202 all_changed_files = {} 180 all_changed_files = {}
203 for feature in features: 181 for feature in features:
204 if self._LogZeroish(self._SingleFeatureScore(feature)): # pragma: no cover 182 if self._LogZeroish(
183 self._model.SingleFeatureScore(feature)): # pragma: no cover
205 logging.debug('Discarding changed files from feature %s' 184 logging.debug('Discarding changed files from feature %s'
206 ' because it has zero probability' % feature.name) 185 ' because it has zero probability' % feature.name)
207 continue 186 continue
208 187
209 for changed_file in feature.changed_files or []: 188 for changed_file in feature.changed_files or []:
210 accumulated_changed_file = all_changed_files.get(changed_file.name) 189 accumulated_changed_file = all_changed_files.get(changed_file.name)
211 if accumulated_changed_file is None: 190 if accumulated_changed_file is None:
212 all_changed_files[changed_file.name] = changed_file 191 all_changed_files[changed_file.name] = changed_file
213 continue 192 continue
214 193
215 if (accumulated_changed_file.blame_url != 194 if (accumulated_changed_file.blame_url !=
216 changed_file.blame_url): # pragma: no cover 195 changed_file.blame_url): # pragma: no cover
217 raise ValueError('Blame URLs do not match: %s != %s' 196 raise ValueError('Blame URLs do not match: %s != %s'
218 % (accumulated_changed_file.blame_url, changed_file.blame_url)) 197 % (accumulated_changed_file.blame_url, changed_file.blame_url))
219 accumulated_changed_file.reasons.extend(changed_file.reasons or []) 198 accumulated_changed_file.reasons.extend(changed_file.reasons or [])
220 199
221 return sorted(all_changed_files.values(), 200 return sorted(all_changed_files.values(),
wrengr 2017/01/11 20:38:30 ditto.
Sharu Jiang 2017/01/12 01:41:37 Done.
222 key=lambda changed_file: changed_file.name) 201 key=lambda changed_file: changed_file.name)
OLDNEW
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/feature.py » ('j') | appengine/findit/crash/loglinear/feature.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698