Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Side by Side Diff: appengine/findit/crash/loglinear/changelist_classifier.py

Issue 2560723005: Implementing a new LogLinearModel-based CL classifier (Closed)
Patch Set: rebase Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/model.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import logging
6 import math
7
8 from common import chrome_dependency_fetcher
9 from crash import changelist_classifier
10 from crash.loglinear.changelist_features import min_distance
11 from crash.loglinear.changelist_features import top_frame_index
12 from crash.loglinear.model import ToFeatureFunction
13 from crash.loglinear.model import UnnormalizedLogLinearModel
14 from crash.stacktrace import CallStack
15 from crash.stacktrace import Stacktrace
16
17
class LogLinearChangelistClassifier(object):
  """A ``LogLinearModel``-based implementation of CL classification."""

  def __init__(self, repository, weights, top_n_frames=7, top_n_suspects=3):
    """Args:
      repository (Repository): the Git repository for getting CLs to classify.
      weights (dict of float): the weights for the features. The keys of
        the dictionary are the names of the feature that weight is
        for. We take this argument as a dict rather than as a list so that
        callers needn't worry about what order to provide the weights in.
      top_n_frames (int): how many frames of each callstack to look at.
      top_n_suspects (int): maximum number of suspects to return.
    """
    self._repository = repository
    self._top_n_frames = top_n_frames
    self._top_n_suspects = top_n_suspects

    feature_function = ToFeatureFunction([
        top_frame_index.TopFrameIndexFeature(top_n_frames),
        min_distance.MinDistanceFeature(),
    ])

    # N.B.: the order of these weights must match the order of the features
    # passed to ``ToFeatureFunction`` above.
    weight_list = [
        weights['TopFrameIndex'],
        weights['MinDistance'],
    ]

    self._model = UnnormalizedLogLinearModel(feature_function, weight_list)

    # TODO(crbug.com/674262): remove the need for storing these weights.
    self._weights = weights

  # TODO(crbug.com/673964): something better for detecting "close to log(0)".
  def _LogZeroish(self, x):
    """Determine whether a float is close enough to log(0).

    If a ``FeatureValue`` has a (log-domain) score of -inf for a given
    ``Suspect``, then that suspect has zero probability of being the
    culprit. We want to filter these suspects out, to clean up the
    output of classification; so this method encapsulates the logic of
    that check.

    Args:
      x (float): the float to check

    Returns:
      ``True`` if ``x`` is close enough to log(0); else ``False``.
    """
    # Only negative infinity counts as log(0); +inf means infinite score.
    return x < 0 and math.isinf(x)

  def _SingleFeatureScore(self, feature_value):
    """Returns the score (aka weighted value) of a ``FeatureValue``.

    This method is a hack for filtering the JSON output ``__call__``
    returns. If we really really need this, then we should probably move
    it to the classes defining loglinear models.

    Args:
      feature_value (FeatureValue): the feature value to check.

    Returns:
      The score of the feature value: its value times the weight for its
      name (0. if there is no weight for that feature name).
    """
    return feature_value.value * self._weights.get(feature_value.name, 0.)

  def __call__(self, report):
    """Finds changelists suspected of being responsible for the crash report.

    Args:
      report (CrashReport): the report to be analyzed.

    Returns:
      List of ``Suspect``s, sorted by confidence from highest to lowest and
      truncated to at most ``top_n_suspects`` entries.
    """
    if not report.regression_range:
      logging.warning('ChangelistClassifier.__call__: Missing regression range '
                      'for report: %s', str(report))
      return []
    last_good_version, first_bad_version = report.regression_range
    logging.info('ChangelistClassifier.__call__: Regression range %s:%s',
                 last_good_version, first_bad_version)

    # Restrict analysis to just the top n frames in each callstack.
    stacktrace = Stacktrace([
        stack.SliceFrames(None, self._top_n_frames)
        for stack in report.stacktrace])

    # Both dependency queries below go through the same fetcher; build it
    # once rather than constructing two identical fetchers.
    dependency_fetcher = chrome_dependency_fetcher.ChromeDependencyFetcher(
        self._repository)

    # We are only interested in the deps in crash stack (the callstack that
    # caused the crash).
    # TODO(wrengr): we may want to receive the crash deps as an argument,
    # so that when this method is called via Findit.FindCulprit, we avoid
    # doing redundant work creating it.
    stack_deps = changelist_classifier.GetDepsInCrashStack(
        report.stacktrace.crash_stack,
        dependency_fetcher.GetDependency(report.crashed_version,
                                         report.platform))

    # Get dep and file to changelogs, stack_info and blame dicts.
    dep_rolls = dependency_fetcher.GetDependencyRollsDict(
        last_good_version, first_bad_version, report.platform)

    # Regression of a dep added/deleted (old_revision/new_revision is None) can
    # not be known for sure and this case rarely happens, so just filter them
    # out.
    regression_deps_rolls = {}
    for dep_path, dep_roll in dep_rolls.items():
      if not dep_roll.old_revision or not dep_roll.new_revision:
        logging.info('Skip %s dependency %s',
                     'added' if dep_roll.new_revision else 'deleted', dep_path)
        continue
      regression_deps_rolls[dep_path] = dep_roll

    dep_to_file_to_changelogs, ignore_cls = (
        changelist_classifier.GetChangeLogsForFilesGroupedByDeps(
            regression_deps_rolls, stack_deps, self._repository))
    dep_to_file_to_stack_infos = (
        changelist_classifier.GetStackInfosForFilesGroupedByDeps(
            stacktrace, stack_deps))

    # Get the possible suspects.
    suspects = changelist_classifier.FindSuspects(
        dep_to_file_to_changelogs,
        dep_to_file_to_stack_infos,
        stack_deps,
        self._repository,
        ignore_cls)
    if suspects is None:
      return []

    # Score the suspects and organize them for outputting/returning.
    features_given_report = self._model.Features(report)
    score_given_report = self._model.Score(report)
    scored_suspects = []
    for suspect in suspects:
      score = score_given_report(suspect)
      if self._LogZeroish(score):
        logging.debug('Discarding suspect because it has zero probability: %s',
                      str(suspect.ToDict()))
        continue

      suspect.confidence = score
      features = features_given_report(suspect)
      suspect.reasons = self.FormatReasons(features)
      suspect.changed_files = [changed_file.ToDict()
          for changed_file in self.AggregateChangedFiles(features)]
      scored_suspects.append(suspect)

    # Sort from highest to lowest confidence, as promised by the docstring.
    # Without ``reverse=True`` the truncation below would keep the *least*
    # confident suspects instead of the most confident ones.
    scored_suspects.sort(key=lambda suspect: suspect.confidence, reverse=True)
    return scored_suspects[:self._top_n_suspects]

  def FormatReasons(self, features):
    """Collect and format a list of all ``FeatureValue.reason`` strings.

    Args:
      features (list of FeatureValue): the values whose ``reason``
        strings should be collected.

    Returns:
      A list of ``(str, float, str)`` triples; where the first string is
      the feature name, the float is some numeric representation of how
      much influence this feature exerts on the ``Suspect`` being blamed,
      and the final string is the ``FeatureValue.reason``. The list is
      sorted by feature name, just to ensure that it comes out in some
      canonical order.

      At present, the float is the log-domain score of the feature
      value. However, this isn't the best thing for UX reasons. In the
      future it might be replaced by the normal-domain score, or by
      the probability.
    """
    formatted_reasons = []
    for feature in features:
      feature_score = self._SingleFeatureScore(feature)
      if self._LogZeroish(feature_score):  # pragma: no cover
        logging.debug('Discarding reasons from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      formatted_reasons.append((feature.name, feature_score, feature.reason))

    return sorted(formatted_reasons,
                  key=lambda formatted_reason: formatted_reason[0])

  def AggregateChangedFiles(self, features):
    """Merge multiple ``FeatureValue.changed_files`` lists into one.

    Args:
      features (list of FeatureValue): the values whose ``changed_files``
        lists should be aggregated.

    Returns:
      A list of ``ChangedFile`` objects sorted by file name. The sorting
      is not essential, but is provided to ease testing by ensuring the
      output is in some canonical order.

    Raises:
      ``ValueError`` if any file name is given inconsistent ``blame_url``s.
    """
    all_changed_files = {}
    for feature in features:
      if self._LogZeroish(self._SingleFeatureScore(feature)): # pragma: no cover
        logging.debug('Discarding changed files from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      for changed_file in feature.changed_files or []:
        accumulated_changed_file = all_changed_files.get(changed_file.name)
        if accumulated_changed_file is None:
          all_changed_files[changed_file.name] = changed_file
          continue

        # Raise explicitly rather than via ``assert``: the docstring promises
        # a ``ValueError``, and asserts are stripped under ``python -O``.
        if accumulated_changed_file.blame_url != changed_file.blame_url:
          raise ValueError('Blame URLs do not match: %s != %s'
              % (accumulated_changed_file.blame_url, changed_file.blame_url))
        accumulated_changed_file.reasons.extend(changed_file.reasons or [])

    return sorted(all_changed_files.values(),
                  key=lambda changed_file: changed_file.name)
OLDNEW
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/model.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698