Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 from collections import defaultdict | 5 from collections import defaultdict |
| 6 import logging | 6 import logging |
| 7 import math | 7 import math |
| 8 | 8 |
| 9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher | 9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher |
| 10 from crash import changelist_classifier | 10 from crash import changelist_classifier |
| 11 from crash.crash_report_with_dependencies import CrashReportWithDependencies | 11 from crash.crash_report_with_dependencies import CrashReportWithDependencies |
| 12 from crash.loglinear.changelist_features import min_distance | 12 from crash.loglinear.changelist_features import min_distance |
| 13 from crash.loglinear.changelist_features import top_frame_index | 13 from crash.loglinear.changelist_features import top_frame_index |
| 14 from crash.loglinear.model import ToFeatureFunction | 14 from crash.loglinear.feature import FeatureFunction |
| 15 from crash.loglinear.model import UnnormalizedLogLinearModel | 15 from crash.loglinear.model import UnnormalizedLogLinearModel |
| 16 from crash.stacktrace import CallStack | 16 from crash.stacktrace import CallStack |
| 17 from crash.stacktrace import Stacktrace | 17 from crash.stacktrace import Stacktrace |
| 18 from crash.suspect import StackInfo | 18 from crash.suspect import StackInfo |
| 19 | 19 |
| 20 | 20 |
| 21 class LogLinearChangelistClassifier(object): | 21 class LogLinearChangelistClassifier(object): |
| 22 """A ``LogLinearModel``-based implementation of CL classification.""" | 22 """A ``LogLinearModel``-based implementation of CL classification.""" |
| 23 | 23 |
| 24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3): | 24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3): |
| (...skipping 10 matching lines...) Expand all Loading... | |
| 35 for. We take this argument as a dict rather than as a list so that | 35 for. We take this argument as a dict rather than as a list so that |
| 36 callers needn't worry about what order to provide the weights in. | 36 callers needn't worry about what order to provide the weights in. |
| 37 top_n_frames (int): how many frames of each callstack to look at. | 37 top_n_frames (int): how many frames of each callstack to look at. |
| 38 top_n_suspects (int): maximum number of suspects to return. | 38 top_n_suspects (int): maximum number of suspects to return. |
| 39 """ | 39 """ |
| 40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository) | 40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository) |
| 41 self._get_repository = get_repository | 41 self._get_repository = get_repository |
| 42 self._top_n_frames = top_n_frames | 42 self._top_n_frames = top_n_frames |
| 43 self._top_n_suspects = top_n_suspects | 43 self._top_n_suspects = top_n_suspects |
| 44 | 44 |
| 45 feature_function = ToFeatureFunction([ | 45 feature_function = FeatureFunction([ |
| 46 top_frame_index.TopFrameIndexFeature(top_n_frames), | 46 top_frame_index.TopFrameIndexFeature(top_n_frames), |
| 47 min_distance.MinDistanceFeature(), | 47 min_distance.MinDistanceFeature(), |
| 48 ]) | 48 ]) |
| 49 | 49 |
| 50 weight_list = [ | 50 self._model = UnnormalizedLogLinearModel(feature_function, weights) |
| 51 weights['TopFrameIndex'], | |
| 52 weights['MinDistance'], | |
| 53 ] | |
| 54 | |
| 55 self._model = UnnormalizedLogLinearModel(feature_function, weight_list) | |
| 56 | |
| 57 # TODO(crbug.com/674262): remove the need for storing these weights. | |
| 58 self._weights = weights | |
| 59 | 51 |
| 60 # TODO(crbug.com/673964): something better for detecting "close to log(0)". | 52 # TODO(crbug.com/673964): something better for detecting "close to log(0)". |
| 61 def _LogZeroish(self, x): | 53 def _LogZeroish(self, x): |
|
wrengr
2017/01/11 20:38:30
This should also be moved to UnnormalizedLLM. That
Sharu Jiang
2017/01/12 01:41:38
Done.
| |
| 62 """Determine whether a float is close enough to log(0). | 54 """Determine whether a float is close enough to log(0). |
| 63 | 55 |
| 64 If a ``FeatureValue`` has a (log-domain) score of -inf for a given | 56 If a ``FeatureValue`` has a (log-domain) score of -inf for a given |
| 65 ``Suspect``, then that suspect has zero probability of being the | 57 ``Suspect``, then that suspect has zero probability of being the |
| 66 culprit. We want to filter these suspects out, to clean up the | 58 culprit. We want to filter these suspects out, to clean up the |
| 67 output of classification; so this method encapsulates the logic of | 59 output of classification; so this method encapsulates the logic of |
| 68 that check. | 60 that check. |
| 69 | 61 |
| 70 Args: | 62 Args: |
| 71 x (float): the float to check | 63 x (float): the float to check |
| 72 | 64 |
| 73 Returns: | 65 Returns: |
| 74 ``True`` if ``x`` is close enough to log(0); else ``False``. | 66 ``True`` if ``x`` is close enough to log(0); else ``False``. |
| 75 """ | 67 """ |
| 76 return x < 0 and math.isinf(x) | 68 return x < 0 and math.isinf(x) |
| 77 | 69 |
| 78 def _SingleFeatureScore(self, feature_value): | |
| 79 """Returns the score (aka weighted value) of a ``FeatureValue``. | |
| 80 | |
| 81 This function assumes the report's stacktrace has already had any necessary | |
| 82 preprocessing (like filtering or truncating) applied. | |
| 83 | |
| 84 Args: | |
| 85 feature_value (FeatureValue): the feature value to check. | |
| 86 | |
| 87 Returns: | |
| 88 The score of the feature value. | |
| 89 """ | |
| 90 return feature_value.value * self._weights.get(feature_value.name, 0.) | |
| 91 | |
| 92 def __call__(self, report): | 70 def __call__(self, report): |
| 93 """Finds changelists suspected of being responsible for the crash report. | 71 """Finds changelists suspected of being responsible for the crash report. |
| 94 | 72 |
| 95 Args: | 73 Args: |
| 96 report (CrashReport): the report to be analyzed. | 74 report (CrashReport): the report to be analyzed. |
| 97 | 75 |
| 98 Returns: | 76 Returns: |
| 99 List of ``Suspect``s, sorted by probability from highest to lowest. | 77 List of ``Suspect``s, sorted by probability from highest to lowest. |
| 100 """ | 78 """ |
| 101 report = CrashReportWithDependencies(report, self._dependency_fetcher) | 79 report = CrashReportWithDependencies(report, self._dependency_fetcher) |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 135 scored_suspects = [] | 113 scored_suspects = [] |
| 136 for suspect in suspects: | 114 for suspect in suspects: |
| 137 score = score_given_report(suspect) | 115 score = score_given_report(suspect) |
| 138 if self._LogZeroish(score): | 116 if self._LogZeroish(score): |
| 139 logging.debug('Discarding suspect because it has zero probability: %s' | 117 logging.debug('Discarding suspect because it has zero probability: %s' |
| 140 % str(suspect.ToDict())) | 118 % str(suspect.ToDict())) |
| 141 continue | 119 continue |
| 142 | 120 |
| 143 suspect.confidence = score | 121 suspect.confidence = score |
| 144 features = features_given_report(suspect) | 122 features = features_given_report(suspect) |
| 145 suspect.reasons = self.FormatReasons(features) | 123 suspect.reasons = self.FormatReasons(features.itervalues()) |
| 146 suspect.changed_files = [ | 124 suspect.changed_files = [ |
| 147 changed_file.ToDict() | 125 changed_file.ToDict() |
| 148 for changed_file in self.AggregateChangedFiles(features)] | 126 for changed_file in self.AggregateChangedFiles(features.itervalues())] |
| 149 scored_suspects.append(suspect) | 127 scored_suspects.append(suspect) |
| 150 | 128 |
| 151 scored_suspects.sort(key=lambda suspect: suspect.confidence) | 129 scored_suspects.sort(key=lambda suspect: suspect.confidence) |
| 152 return scored_suspects[:self._top_n_suspects] | 130 return scored_suspects[:self._top_n_suspects] |
| 153 | 131 |
def FormatReasons(self, features):
  """Collect and format the ``reason`` strings of the given feature values.

  Args:
    features (iterable of FeatureValue): the values whose ``reason``
      strings should be collected.

  Returns:
    A list of ``(str, float, str)`` triples; where the first string is
    the feature name, the float is some numeric representation of how
    much influence this feature exerts on the ``Suspect`` being blamed,
    and the final string is the ``FeatureValue.reason``. The list is
    sorted by feature name, just to ensure that it comes out in some
    canonical order.

    At present, the float is the log-domain score of the feature
    value. However, this isn't the best thing for UX reasons. In the
    future it might be replaced by the normal-domain score, or by
    the probability.
  """
  triples = []
  for feature_value in features:
    score = self._model.SingleFeatureScore(feature_value)
    if not self._LogZeroish(score):
      triples.append((feature_value.name, score, feature_value.reason))
    else:  # pragma: no cover
      # A score of -inf means this feature rules the suspect out entirely,
      # so its reason text would only be noise in the output.
      logging.debug('Discarding reasons from feature %s'
                    ' because it has zero probability' % feature_value.name)

  triples.sort(key=lambda triple: triple[0])
  return triples
| 186 | 164 |
def AggregateChangedFiles(self, features):
  """Merge the ``changed_files`` lists of the given feature values into one.

  Args:
    features (iterable of FeatureValue): the values whose ``changed_files``
      lists should be aggregated.

  Returns:
    A list of ``ChangedFile`` objects sorted by file name. The sorting
    is not essential, but is provided to ease testing by ensuring the
    output is in some canonical order.

  Raises:
    ``ValueError`` if any file name is given inconsistent ``blame_url``s.
  """
  # Maps file name to the ChangedFile accumulating that file's reasons.
  merged_files = {}
  for feature_value in features:
    score = self._model.SingleFeatureScore(feature_value)
    if self._LogZeroish(score):  # pragma: no cover
      # Zero-probability features contribute nothing useful; skip them.
      logging.debug('Discarding changed files from feature %s'
                    ' because it has zero probability' % feature_value.name)
      continue

    for new_file in feature_value.changed_files or []:
      existing_file = merged_files.get(new_file.name)
      if existing_file is None:
        # First time we see this file; adopt it as the accumulator.
        merged_files[new_file.name] = new_file
      elif existing_file.blame_url != new_file.blame_url:  # pragma: no cover
        raise ValueError('Blame URLs do not match: %s != %s'
                         % (existing_file.blame_url, new_file.blame_url))
      else:
        # Same file seen again: fold its reasons into the accumulator.
        existing_file.reasons.extend(new_file.reasons or [])

  return sorted(merged_files.values(),
                key=lambda merged_file: merged_file.name)
| OLD | NEW |