Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2230)

Unified Diff: appengine/findit/crash/loglinear/changelist_classifier.py

Issue 2560723005: Implementing a new LogLinearModel-based CL classifier (Closed)
Patch Set: more breaking apart of the CL Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/test/changelist_classifier_test.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: appengine/findit/crash/loglinear/changelist_classifier.py
diff --git a/appengine/findit/crash/loglinear/changelist_classifier.py b/appengine/findit/crash/loglinear/changelist_classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff13cec2caf4115ab6745285299643752c83b0ef
--- /dev/null
+++ b/appengine/findit/crash/loglinear/changelist_classifier.py
@@ -0,0 +1,236 @@
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import logging
+import math
+
+from common import chrome_dependency_fetcher
+from crash import changelist_classifier
+from crash.loglinear.changelist_features import min_distance
+from crash.loglinear.changelist_features import top_frame_index
+from crash.loglinear.model import ToFeatureFunction
+from crash.loglinear.model import UnnormalizedLogLinearModel
+from crash.stacktrace import CallStack
+from crash.stacktrace import Stacktrace
+
+
class LogLinearChangelistClassifier(object):
  """A ``LogLinearModel``-based implementation of CL classification."""

  def __init__(self, repository, weights, top_n_frames=7, top_n_results=3):
    """Args:
      repository (Repository): the Git repository for getting CLs to classify.
      weights (dict of float): the weights for the features. The keys of
        the dictionary are the names of the feature that weight is
        for. We take this argument as a dict rather than as a list so that
        callers needn't worry about what order to provide the weights in.
      top_n_frames (int): how many frames of each callstack to look at.
      top_n_results (int): maximum number of results to return.
    """
    self._repository = repository
    self._top_n_frames = top_n_frames
    self._top_n_results = top_n_results

    # N.B., the order of the features here must match the order of the
    # weights in ``weight_list`` below.
    feature_function = ToFeatureFunction([
        top_frame_index.TopFrameIndexFeature(top_n_frames),
        min_distance.MinDistanceFeature(),
    ])

    weight_list = [
        weights['TopFrameIndex'],
        weights['MinDistance'],
    ]

    self._model = UnnormalizedLogLinearModel(feature_function, weight_list)

    # TODO(crbug.com/674262): remove the need for storing these weights.
    self._weights = weights

  # TODO(crbug.com/673964): something better for detecting "close to log(0)".
  def _LogZeroish(self, x):
    """Determine whether a float is close enough to log(0).

    If a ``FeatureValue`` has a (log-domain) score of -inf for a given
    ``MatchResult``, then that result has zero probability of being
    blamed. We want to filter these results out, to clean up the output
    of classification; so this method encapsulates the logic of that check.

    Args:
      x (float): the float to check

    Returns:
      ``True`` if ``x`` is close enough to log(0); else ``False``.
    """
    # Only -inf counts as log(0); +inf does not, hence the sign check.
    return x < 0 and math.isinf(x)

  def _SingleFeatureScore(self, feature_value):
    """Returns the score (aka weighted value) of a ``FeatureValue``.

    This method is a hack for filtering the JSON output ``__call__``
    returns. If we really really need this, then we should probably move
    it to the classes defining loglinear models.

    Args:
      feature_value (FeatureValue): the feature value to check.

    Returns:
      The score of the feature value. Features with no corresponding
      weight default to a weight of 0, and hence a score of 0.
    """
    return feature_value.value * self._weights.get(feature_value.name, 0.)

  def __call__(self, report):
    """Finds changelists suspected of being responsible for the crash report.

    Args:
      report (CrashReport): the report to be analyzed.

    Returns:
      List of Results, sorted by probability from highest to lowest.
    """
    if not report.regression_range:
      logging.warning('ChangelistClassifier.__call__: Missing regression range '
                      'for report: %s', str(report))
      return []
    last_good_version, first_bad_version = report.regression_range
    logging.info('ChangelistClassifier.__call__: Regression range %s:%s',
                 last_good_version, first_bad_version)

    # Restrict analysis to just the top n frames in each callstack.
    stacktrace = Stacktrace([
        stack.SliceFrames(None, self._top_n_frames)
        for stack in report.stacktrace])

    # Construct the fetcher once; it is needed both for the crash-stack
    # deps and for the dependency rolls in the regression range.
    dependency_fetcher = chrome_dependency_fetcher.ChromeDependencyFetcher(
        self._repository)

    # We are only interested in the deps in crash stack (the callstack that
    # caused the crash).
    # TODO(wrengr): we may want to receive the crash deps as an argument,
    # so that when this method is called via Findit.FindCulprit, we avoid
    # doing redundant work creating it.
    stack_deps = changelist_classifier.GetDepsInCrashStack(
        report.stacktrace.crash_stack,
        dependency_fetcher.GetDependency(report.crashed_version,
                                         report.platform))

    # Get dep and file to changelogs, stack_info and blame dicts.
    dep_rolls = dependency_fetcher.GetDependencyRollsDict(
        last_good_version, first_bad_version, report.platform)

    # Regression of a dep added/deleted (old_revision/new_revision is None) can
    # not be known for sure and this case rarely happens, so just filter them
    # out.
    regression_deps_rolls = {}
    for dep_path, dep_roll in dep_rolls.items():
      if not dep_roll.old_revision or not dep_roll.new_revision:
        logging.info('Skip %s dependency %s',
                     'added' if dep_roll.new_revision else 'deleted', dep_path)
        continue
      regression_deps_rolls[dep_path] = dep_roll

    dep_to_file_to_changelogs, ignore_cls = (
        changelist_classifier.GetChangeLogsForFilesGroupedByDeps(
            regression_deps_rolls, stack_deps, self._repository))
    dep_to_file_to_stack_infos = (
        changelist_classifier.GetStackInfosForFilesGroupedByDeps(
            stacktrace, stack_deps))

    # Get the possible results.
    results = changelist_classifier.FindMatchResults(
        dep_to_file_to_changelogs,
        dep_to_file_to_stack_infos,
        stack_deps,
        self._repository,
        ignore_cls)
    if results is None:
      return []

    # Score the results and organize them for outputting/returning.
    features_given_report = self._model.Features(report)
    score_given_report = self._model.Score(report)
    scored_results = []
    for result in results:
      score = score_given_report(result)
      if self._LogZeroish(score):
        logging.debug('Discarding result because it has zero probability: %s',
                      str(result.ToDict()))
        continue

      result.confidence = score
      features = features_given_report(result)
      result.reasons = self.FormatReasons(features)
      result.changed_files = [changed_file.ToDict()
          for changed_file in self.AggregateChangedFiles(features)]
      scored_results.append(result)

    # Sort from highest to lowest confidence, as promised in the docstring;
    # without ``reverse=True`` the slice below would keep the *least*
    # likely suspects.
    scored_results.sort(key=lambda result: result.confidence, reverse=True)
    return scored_results[:self._top_n_results]

  def FormatReasons(self, features):
    """Collect and format a list of all ``FeatureValue.reason`` strings.

    Args:
      features (list of FeatureValue): the values whose ``reason``
        strings should be collected.

    Returns:
      A list of ``(str, float, str)`` triples; where the first string
      is the feature name, the float is some numeric representation
      of how much influence this feature exerts on the ``MatchResult``
      being blamed, and the final string is the ``FeatureValue.reason``.
      The list is sorted by feature name, just to ensure that it comes
      out in some canonical order.

      At present, the float is the log-domain score of the feature
      value. However, this isn't the best thing for UX reasons. In the
      future it might be replaced by the normal-domain score, or by
      the probability.
    """
    formatted_reasons = []
    for feature in features:
      feature_score = self._SingleFeatureScore(feature)
      if self._LogZeroish(feature_score):  # pragma: no cover
        logging.debug('Discarding reasons from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      formatted_reasons.append((feature.name, feature_score, feature.reason))

    return sorted(formatted_reasons,
                  key=lambda formatted_reason: formatted_reason[0])

  def AggregateChangedFiles(self, features):
    """Merge multiple ``FeatureValue.changed_files`` lists into one.

    Args:
      features (list of FeatureValue): the values whose ``changed_files``
        lists should be aggregated.

    Returns:
      A list of ``ChangedFile`` objects sorted by file name. The sorting
      is not essential, but is provided to ease testing by ensuring the
      output is in some canonical order.

    Raises:
      ``ValueError`` if any file name is given inconsistent ``blame_url``s.
    """
    all_changed_files = {}
    for feature in features:
      if self._LogZeroish(self._SingleFeatureScore(feature)):  # pragma: no cover
        logging.debug('Discarding changed files from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      for changed_file in feature.changed_files or []:
        accumulated_changed_file = all_changed_files.get(changed_file.name)
        if accumulated_changed_file is None:
          all_changed_files[changed_file.name] = changed_file
          continue

        # Raise explicitly rather than via ``assert``: the docstring
        # promises ValueError, and asserts are stripped under ``-O``.
        if accumulated_changed_file.blame_url != changed_file.blame_url:
          raise ValueError('Blame URLs do not match: %s != %s'
              % (accumulated_changed_file.blame_url, changed_file.blame_url))
        accumulated_changed_file.reasons.extend(changed_file.reasons or [])

    return sorted(all_changed_files.values(),
                  key=lambda changed_file: changed_file.name)
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/test/changelist_classifier_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698