Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | |
| 2 # Use of this source code is governed by a BSD-style license that can be | |
| 3 # found in the LICENSE file. | |
| 4 | |
| 5 import logging | |
| 6 import math | |
| 7 | |
| 8 from common import chrome_dependency_fetcher | |
| 9 from crash import changelist_classifier | |
| 10 from crash.loglinear.changelist_features import min_distance | |
| 11 from crash.loglinear.changelist_features import top_frame_index | |
| 12 from crash.loglinear.model import ToFeatureFunction | |
| 13 from crash.loglinear.model import UnnormalizedLogLinearModel | |
| 14 from crash.stacktrace import CallStack | |
| 15 from crash.stacktrace import Stacktrace | |
| 16 | |
| 17 | |
class LogLinearChangelistClassifier(object):
  """A ``LogLinearModel``-based implementation of CL classification."""

  def __init__(self, repository, weights, top_n_frames=7, top_n_results=3):
    """Args:
      repository (Repository): the Git repository for getting CLs to classify.
      weights (dict of float): the weights for the features. The keys of
        the dictionary are the names of the feature that weight is
        for. We take this argument as a dict rather than as a list so that
        callers needn't worry about what order to provide the weights in.
      top_n_frames (int): how many frames of each callstack to look at.
      top_n_results (int): maximum number of results to return.
    """
    self._repository = repository
    self._top_n_frames = top_n_frames
    self._top_n_results = top_n_results

    feature_function = ToFeatureFunction([
        top_frame_index.TopFrameIndexFeature(top_n_frames),
        min_distance.MinDistanceFeature(),
    ])

    # N.B., this list must stay in the same order as the feature list passed
    # to ``ToFeatureFunction`` above, since the model pairs weights with
    # features positionally.
    weight_list = [
        weights['TopFrameIndex'],
        weights['MinDistance'],
    ]

    self._model = UnnormalizedLogLinearModel(feature_function, weight_list)

    # TODO(crbug.com/674262): remove the need for storing these weights.
    self._weights = weights

  # TODO(crbug.com/673964): something better for detecting "close to log(0)".
  def _LogZeroish(self, x):
    """Determine whether a float is close enough to log(0).

    If a ``FeatureValue`` has a (log-domain) score of -inf for a given
    ``MatchResult``, then that result has zero probability of being
    blamed. We want to filter these results out, to clean up the output
    of classification; so this method encapsulates the logic of that check.

    Args:
      x (float): the float to check

    Returns:
      ``True`` if ``x`` is close enough to log(0); else ``False``.
    """
    return x < 0 and math.isinf(x)

  def _SingleFeatureScore(self, feature_value):
    """Returns the score (aka weighted value) of a ``FeatureValue``.

    This method is a hack for filtering the JSON output ``__call__``
    returns. If we really really need this, then we should probably move
    it to the classes defining loglinear models.

    Args:
      feature_value (FeatureValue): the feature value to check.

    Returns:
      The score of the feature value. Features with no corresponding
      weight contribute a score of zero.
    """
    return feature_value.value * self._weights.get(feature_value.name, 0.)

  def __call__(self, report):
    """Finds changelists suspected of being responsible for the crash report.

    Args:
      report (CrashReport): the report to be analyzed.

    Returns:
      List of Results, sorted by probability from highest to lowest.
    """
    if not report.regression_range:
      logging.warning('ChangelistClassifier.__call__: Missing regression range '
                      'for report: %s', str(report))
      return []
    last_good_version, first_bad_version = report.regression_range
    logging.info('ChangelistClassifier.__call__: Regression range %s:%s',
                 last_good_version, first_bad_version)

    # Restrict analysis to just the top n frames in each callstack.
    stacktrace = Stacktrace([
        stack.SliceFrames(None, self._top_n_frames)
        for stack in report.stacktrace])

    # Construct the dependency fetcher once and reuse it for both queries
    # below, rather than rebuilding it each time.
    dep_fetcher = chrome_dependency_fetcher.ChromeDependencyFetcher(
        self._repository)

    # We are only interested in the deps in crash stack (the callstack that
    # caused the crash).
    # TODO(wrengr): we may want to receive the crash deps as an argument,
    # so that when this method is called via Findit.FindCulprit, we avoid
    # doing redundant work creating it.
    stack_deps = changelist_classifier.GetDepsInCrashStack(
        report.stacktrace.crash_stack,
        dep_fetcher.GetDependency(report.crashed_version, report.platform))

    # Get dep and file to changelogs, stack_info and blame dicts.
    dep_rolls = dep_fetcher.GetDependencyRollsDict(
        last_good_version, first_bad_version, report.platform)

    # Regression of a dep added/deleted (old_revision/new_revision is None) can
    # not be known for sure and this case rarely happens, so just filter them
    # out.
    regression_deps_rolls = {}
    for dep_path, dep_roll in dep_rolls.items():
      if not dep_roll.old_revision or not dep_roll.new_revision:
        logging.info('Skip %s dependency %s',
                     'added' if dep_roll.new_revision else 'deleted', dep_path)
        continue
      regression_deps_rolls[dep_path] = dep_roll

    dep_to_file_to_changelogs, ignore_cls = (
        changelist_classifier.GetChangeLogsForFilesGroupedByDeps(
            regression_deps_rolls, stack_deps, self._repository))
    dep_to_file_to_stack_infos = (
        changelist_classifier.GetStackInfosForFilesGroupedByDeps(
            stacktrace, stack_deps))

    # Get the possible results.
    results = changelist_classifier.FindMatchResults(
        dep_to_file_to_changelogs,
        dep_to_file_to_stack_infos,
        stack_deps,
        self._repository,
        ignore_cls)
    if results is None:
      return []

    # Score the results and organize them for outputting/returning.
    features_given_report = self._model.Features(report)
    score_given_report = self._model.Score(report)
    scored_results = []
    for result in results:
      score = score_given_report(result)
      if self._LogZeroish(score):
        logging.debug('Discarding result because it has zero probability: %s',
                      str(result.ToDict()))
        continue

      result.confidence = score
      features = features_given_report(result)
      result.reasons = self.FormatReasons(features)
      result.changed_files = [changed_file.ToDict()
          for changed_file in self.AggregateChangedFiles(features)]
      scored_results.append(result)

    # Sort by confidence from highest to lowest (``reverse=True``), as
    # promised in the docstring; an ascending sort here would make the
    # truncation below keep the *least* confident results.
    scored_results.sort(key=lambda result: result.confidence, reverse=True)
    return scored_results[:self._top_n_results]

  def FormatReasons(self, features):
    """Collect and format a list of all ``FeatureValue.reason`` strings.

    Args:
      features (list of FeatureValue): the values whose ``reason``
        strings should be collected.

    Returns:
      A list of ``(str, float, str)`` triples; where the first string
      is the feature name, the float is some numeric representation
      of how much influence this feature exerts on the ``MatchResult``
      being blamed, and the final string is the ``FeatureValue.reason``.
      The list is sorted by feature name, just to ensure that it comes
      out in some canonical order.

      At present, the float is the log-domain score of the feature
      value. However, this isn't the best thing for UX reasons. In the
      future it might be replaced by the normal-domain score, or by
      the probability.
    """
    formatted_reasons = []
    for feature in features:
      feature_score = self._SingleFeatureScore(feature)
      if self._LogZeroish(feature_score):  # pragma: no cover
        logging.debug('Discarding reasons from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      formatted_reasons.append((feature.name, feature_score, feature.reason))

    return sorted(formatted_reasons,
                  key=lambda formatted_reason: formatted_reason[0])

  def AggregateChangedFiles(self, features):
    """Merge multiple ``FeatureValue.changed_files`` lists into one.

    Args:
      features (list of FeatureValue): the values whose ``changed_files``
        lists should be aggregated.

    Returns:
      A list of ``ChangedFile`` objects sorted by file name. The sorting
      is not essential, but is provided to ease testing by ensuring the
      output is in some canonical order.

    Raises:
      ``ValueError`` if any file name is given inconsistent ``blame_url``s.
    """
    all_changed_files = {}
    for feature in features:
      if self._LogZeroish(self._SingleFeatureScore(feature)):  # pragma: no cover
        logging.debug('Discarding changed files from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      for changed_file in feature.changed_files or []:
        accumulated_changed_file = all_changed_files.get(changed_file.name)
        if accumulated_changed_file is None:
          all_changed_files[changed_file.name] = changed_file
          continue

        # Raise explicitly rather than via ``assert``: the documented
        # ``ValueError`` contract must hold even under ``python -O`` (which
        # strips asserts), and an assert would raise ``AssertionError``
        # instead of ``ValueError`` anyway.
        if accumulated_changed_file.blame_url != changed_file.blame_url:
          raise ValueError('Blame URLs do not match: %s != %s'
              % (accumulated_changed_file.blame_url, changed_file.blame_url))
        accumulated_changed_file.reasons.extend(changed_file.reasons or [])

    return sorted(all_changed_files.values(),
                  key=lambda changed_file: changed_file.name)
| OLD | NEW |