Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3857)

Unified Diff: appengine/findit/crash/loglinear/changelist_features/min_distance.py

Issue 2613153006: [Predator] Add TouchCrashedFileMetaFeature. (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/findit/crash/loglinear/changelist_features/min_distance.py
diff --git a/appengine/findit/crash/loglinear/changelist_features/min_distance.py b/appengine/findit/crash/loglinear/changelist_features/min_distance.py
index 480cf8961c120ad78813103382e8e11d0b78999f..cbb6b5490fad46cc94f72f1dd9d16f8d71740385 100644
--- a/appengine/findit/crash/loglinear/changelist_features/min_distance.py
+++ b/appengine/findit/crash/loglinear/changelist_features/min_distance.py
@@ -5,10 +5,12 @@
import logging
import math
+from crash.loglinear.changelist_classifier import StackInfo
from crash.loglinear.feature import ChangedFile
from crash.loglinear.feature import Feature
from crash.loglinear.feature import FeatureValue
from crash.loglinear.feature import LogLinearlyScaled
+from libs.gitiles.diff import ChangeType
import libs.math.logarithms as lmath
# N.B., this must not be infinity, else we'll start getting NaN values
@@ -16,6 +18,56 @@ import libs.math.logarithms as lmath
DEFAULT_MAXIMUM = 50
+# TODO(wrengr): we should change things to use integers with None as
chanli 2017/01/23 04:37:45 Nit: since wrengr has left the team, change the TO
Sharu Jiang 2017/01/23 21:44:46 Done.
+# \"infinity\", rather than using floats.
+# TODO(http://crbug.com/644476): this class needs a better name.
+class DistanceInfo(object):
+
+ def __init__(self, distance, frame):
+ self.distance = distance
+ self.frame = frame
+
+ def Update(self, distance, frame):
+ if distance < self.distance:
+ self.distance = distance
+ self.frame = frame
+
+ def IsInfinity(self):
+ return math.isinf(self.distance)
+
+ def __str__(self): # pragma: no cover
+ return 'Min distance(distance = %f, frame = %s)' % (float(self.distance),
+ str(self.frame))
+
+ def __eq__(self, other):
+ return self.distance == other.distance and self.frame == other.frame
+
+
+def DistanceBetweenLineRanges((start1, end1), (start2, end2)):
+ """Given two ranges, compute the (unsigned) distance between them.
+
+ Args:
+ start1 (int): the first line included in the first range.
+ end1 (int): the last line included in the first range. Must be
+ greater than or equal to ``start1``.
+ start2 (int): the first line included in the second range.
+ end2 (int): the last line included in the second range. Must be
+ greater than or equal to ``start1``.
+
+ Returns:
+ If the end of the earlier range comes before the start of the later
+ range, then the difference between those points. Otherwise, returns
+ zero (because the ranges overlap).
+ """
+ if end1 < start1:
+ raise ValueError('the first range is empty: %d < %d' % (end1, start1))
+ if end2 < start2:
+ raise ValueError('the second range is empty: %d < %d' % (end2, start2))
+ # There are six possible cases, but in all the cases where the two
+ # ranges overlap, the latter two differences will be negative.
+ return max(0, start2 - end1, start1 - end2)
+
+
class MinDistanceFeature(Feature):
"""Returns the minimum min_distance scaled between -inf and 0.
@@ -34,13 +86,14 @@ class MinDistanceFeature(Feature):
between we scale the normal-domain values linearly, which means the
log-domain values are scaled exponentially.
"""
- def __init__(self, maximum=None):
+ def __init__(self, get_repository, maximum=None):
"""
Args:
maximum (float): An upper bound on the min_distance to
consider. This argument is optional and defaults to
``DEFAULT_MAXIMUM``.
"""
+ self._get_repository = get_repository
if maximum is None:
maximum = DEFAULT_MAXIMUM
self._maximum = maximum
chanli 2017/01/23 04:37:45 Nit: this can be simplified to self._maximum = max
Sharu Jiang 2017/01/23 21:44:46 They are not exactly the same. If maximum is 0, we
@@ -49,8 +102,53 @@ class MinDistanceFeature(Feature):
def name(self):
return 'MinDistance'
+ def DistanceInfoBetweenTouchedFileAndStacktrace(
+ self, revision, touched_file, stack_infos, crash_dependency):
+ """Gets ``DistanceInfo`` between touched lines and crashed lines for a file.
+
+ Args:
chanli 2017/01/23 04:37:45 Nit: add docstring for revision
Sharu Jiang 2017/01/23 21:44:46 Done.
+ touched_file (FileChangeInfo): The file touched by a suspect.
+ stack_infos (list of StackInfos): Information about frames that contains
+ this touched file and the priorities of their callstacks.
+ crash_dependency (Dependency): The depedency of crashed revision.
+
+ Returns:
+ ``DistanceInfo`` object of touched file and stacktrace.
+ """
+ # TODO(katesonia) ``GetBlame`` is called for the same file everytime
+ # there is a suspect that touched it, which can be very expensive.
+ # The blame information can either be cached through repository (cached
+ # by memcache based on repo url, revision and file path), or this
+ # function can have a static in-memory cache to cache blame for touched
+ # files, however since blame information is big, it's not a good idea to
+ # keep it in memory.
+ repository = self._get_repository(crash_dependency.repo_url)
+ blame = repository.GetBlame(touched_file.new_path,
+ crash_dependency.revision)
chanli 2017/01/23 04:37:45 Why not use revision directly? And crash_dependenc
Sharu Jiang 2017/01/23 21:44:46 We also need ``crash_dependency.repo_url`` in the
+ if not blame:
+ logging.warning('Failed to get blame information for %s',
+ touched_file.new_path)
+ return None
+
+ # Distance of this file.
+ distance_info = DistanceInfo(float('inf'), None)
+ for region in blame:
+ if region.revision != revision:
+ continue
+
+ region_start = region.start
+ region_end = region_start + region.count - 1
+ for stack_info in stack_infos:
+ frame_start = stack_info.frame.crashed_line_numbers[0]
+ frame_end = stack_info.frame.crashed_line_numbers[-1]
+ distance = DistanceBetweenLineRanges((frame_start, frame_end),
+ (region_start, region_end))
+ distance_info.Update(distance, stack_info.frame)
+
+ return distance_info
+
def __call__(self, report):
- """Returns the scaled min ``AnalysisInfo.min_distance`` across all files.
+ """Returns the scaled min ``DistanceInfo.distance`` across all files.
Args:
report (CrashReport): the crash report being analyzed.
@@ -60,30 +158,66 @@ class MinDistanceFeature(Feature):
for) a stack frame in that suspect and the CL in that suspect, as a
log-domain ``float``.
"""
- def FeatureValueGivenReport(suspect):
- analyses = suspect.file_to_analysis_info
- if not analyses:
- message = 'No AnalysisInfo for any file in suspect: %s' % str(suspect)
- logging.warning(message)
- return FeatureValue(self.name, lmath.LOG_ZERO, message, None)
+ def FeatureValueGivenReport(suspect, touched_file_to_stack_infos):
+ """Function mapping suspect related data to MinDistance FeatureValue.
+
+ Args:
+ suspect (Suspect): The suspected changelog and some meta information
+ about it.
+ touched_file_to_stack_infos(dict): Dict mapping ``FileChangeInfo`` to
+ a list of ``StackInfo``s representing all the frames that the suspect
+ touched.
+
+ Returns:
+ The ``FeatureValue`` of this feature.
+ """
+ if not touched_file_to_stack_infos:
+ FeatureValue(self.name, lmath.LOG_ZERO,
+ 'No file got touched by the suspect.', None)
- min_distance = min(per_file_analysis.min_distance
- for per_file_analysis in analyses.itervalues())
+ distance_info = DistanceInfo(float('inf'), None)
+ touched_file_to_distance_info = {}
+ for touched_file, stack_infos in touched_file_to_stack_infos.iteritems():
+ distance_info_per_file = (
+ self.DistanceInfoBetweenTouchedFileAndStacktrace(
+ suspect.changelog.revision, touched_file, stack_infos,
+ report.dependencies[suspect.dep_path]))
+ # It is possible that a changelog doesn't show in the blame of a file,
+ # in this case, treat the changelog as if it didn't change the file.
+ if not distance_info_per_file:
+ logging.warning('suspect\'s change cannot be blamed due to lack of'
+ 'blame information for crashed file %s' %
+ touched_file.new_path)
+ continue
+
+ # This file change shouldn't be blamed because it's a blame cl of the
+ # crashed file.
chanli 2017/01/23 04:37:45 I don't quite understand this comment... IIUC, dis
Sharu Jiang 2017/01/23 21:44:46 That's right, wrong comment... corrected it.
+ if distance_info_per_file.IsInfinity():
+ continue
+
+ touched_file_to_distance_info[touched_file] = distance_info_per_file
+ distance_info.Update(distance_info_per_file.distance,
+ distance_info_per_file.frame)
return FeatureValue(
name = self.name,
- value = LogLinearlyScaled(float(min_distance), float(self._maximum)),
- reason = ('Minimum distance is %d' % min_distance),
- changed_files = self._ChangedFiles(suspect),
- )
+ value = LogLinearlyScaled(float(distance_info.distance),
+ float(self._maximum)),
+ reason = ('Minimum distance is %f' % float(distance_info.distance)),
+ changed_files = MinDistanceFeature.ChangedFiles(
+ suspect, touched_file_to_distance_info, report.crashed_version))
return FeatureValueGivenReport
- def _ChangedFiles(self, suspect):
- """Get all the changed files causing this feature to blame this suspect.
+ @staticmethod
+ def ChangedFiles(suspect, touched_file_to_distance_info, crashed_version):
+ """Get all the changed files causing this feature to blame this result.
Arg:
suspect (Suspect): the suspect being blamed.
+ touched_file_to_stack_infos (dict): Dict mapping file name to
+ ``StackInfo``s.
chanli 2017/01/23 04:37:45 DistanceInfo?
Sharu Jiang 2017/01/23 21:44:46 Oops. Done.
+ crashed_version (str): Crashed version.
Returns:
List of ``ChangedFile`` objects sorted by frame index. For example:
@@ -94,34 +228,29 @@ class MinDistanceFeature(Feature):
reasons = ['Minimum distance (LOC) 1, frame #5']
)]
"""
- index_to_changed_files = {}
-
- for file_path, analysis_info in suspect.file_to_analysis_info.iteritems():
- file_name = file_path.split('/')[-1]
- frame = analysis_info.min_distance_frame
- if frame is None: # pragma: no cover
- logging.warning('Missing the min_distance_frame for %s'
- % str(analysis_info))
- continue
+ frame_index_to_changed_files = {}
- # It is possible that a changelog doesn't show in the blame of a file,
- # in this case, treat the changelog as if it didn't change the file.
- if math.isinf(analysis_info.min_distance): # pragma: no cover
- logging.warning('min_distance is infinite for %s' % str(analysis_info))
+ for touched_file, distance_info in (
+ touched_file_to_distance_info.iteritems()):
+ file_name = touched_file.new_path.split('/')[-1]
+ if distance_info.frame is None: # pragma: no cover
+ logging.warning('Missing the min_distance_frame for file %s' %
+ file_name)
continue
- index_to_changed_files[frame.index] = ChangedFile(
- name = file_name,
- blame_url = frame.BlameUrl(suspect.changelog.revision),
- reasons = ['Minimum distance (LOC) %d, frame #%d' % (
- analysis_info.min_distance, frame.index)]
- )
+ frame_index_to_changed_files[distance_info.frame.index] = ChangedFile(
+ name=file_name,
+ blame_url=distance_info.frame.BlameUrl(crashed_version),
+ reasons=['Distance from touched lines and crashed lines is %d, in '
+ 'frame #%d' % (distance_info.distance,
+ distance_info.frame.index)])
- if not index_to_changed_files: # pragma: no cover
+ if not frame_index_to_changed_files: # pragma: no cover
logging.warning('Found no changed files for suspect: %s', str(suspect))
+ return []
# Sort changed file by frame index.
- _, changed_files = zip(*sorted(index_to_changed_files.items(),
+ _, changed_files = zip(*sorted(frame_index_to_changed_files.iteritems(),
key=lambda x: x[0]))
return list(changed_files)

Powered by Google App Engine
This is Rietveld 408576698