Chromium Code Reviews| Index: appengine/findit/crash/loglinear/changelist_features/min_distance.py |
| diff --git a/appengine/findit/crash/loglinear/changelist_features/min_distance.py b/appengine/findit/crash/loglinear/changelist_features/min_distance.py |
| index 480cf8961c120ad78813103382e8e11d0b78999f..88a2668c4324c355c54ae9a77ce3ae94a688ec9a 100644 |
| --- a/appengine/findit/crash/loglinear/changelist_features/min_distance.py |
| +++ b/appengine/findit/crash/loglinear/changelist_features/min_distance.py |
| @@ -5,15 +5,63 @@ |
| import logging |
| import math |
| +from crash.loglinear.changelist_classifier import StackInfo |
| from crash.loglinear.feature import ChangedFile |
| from crash.loglinear.feature import Feature |
| from crash.loglinear.feature import FeatureValue |
| from crash.loglinear.feature import LogLinearlyScaled |
| +from libs.gitiles.diff import ChangeType |
| import libs.math.logarithms as lmath |
| -# N.B., this must not be infinity, else we'll start getting NaN values |
| -# from LinearMinDistanceFeature (and SquaredMinDistanceFeature). |
| -DEFAULT_MAXIMUM = 50 |
| + |
| +# TODO(katesonia): we should change things to use integers with None as |
|
Martin Barbella
2017/01/24 22:06:40
Could you move this to IsInfinity?
Sharu Jiang
2017/01/25 19:13:31
Done.
|
| +# \"infinity\", rather than using floats. |
| +# TODO(http://crbug.com/644476): this class needs a better name. |
| +class DistanceInfo(object): |
|
Martin Barbella
2017/01/24 22:06:40
Agreed that DistanceInfo isn't the best name for t
Sharu Jiang
2017/01/25 19:13:31
Changed to ``ModifiedFrameInfo``.
|
| + |
| + def __init__(self, distance, frame): |
| + self.distance = distance |
| + self.frame = frame |
| + |
| + def Update(self, distance, frame): |
| + if distance < self.distance: |
| + self.distance = distance |
| + self.frame = frame |
| + |
| + def IsInfinity(self): |
| + return math.isinf(self.distance) |
| + |
| + def __str__(self): # pragma: no cover |
| + return 'Min distance(distance = %f, frame = %s)' % (float(self.distance), |
| + str(self.frame)) |
| + |
| + def __eq__(self, other): |
| + return self.distance == other.distance and self.frame == other.frame |
| + |
| + |
| +def DistanceBetweenLineRanges((start1, end1), (start2, end2)): |
| + """Given two ranges, compute the (unsigned) distance between them. |
| + |
| + Args: |
| + start1 (int): the first line included in the first range. |
| + end1 (int): the last line included in the first range. Must be |
| + greater than or equal to ``start1``. |
| + start2 (int): the first line included in the second range. |
| + end2 (int): the last line included in the second range. Must be |
| + greater than or equal to ``start1``. |
| + |
| + Returns: |
| + If the end of the earlier range comes before the start of the later |
| + range, then the difference between those points. Otherwise, returns |
| + zero (because the ranges overlap). |
| + """ |
| + if end1 < start1: |
| + raise ValueError('the first range is empty: %d < %d' % (end1, start1)) |
| + if end2 < start2: |
| + raise ValueError('the second range is empty: %d < %d' % (end2, start2)) |
| + # There are six possible cases, but in all the cases where the two |
| + # ranges overlap, the latter two differences will be negative. |
| + return max(0, start2 - end1, start1 - end2) |
| class MinDistanceFeature(Feature): |
| @@ -34,23 +82,69 @@ class MinDistanceFeature(Feature): |
| between we scale the normal-domain values linearly, which means the |
| log-domain values are scaled exponentially. |
| """ |
| - def __init__(self, maximum=None): |
| + def __init__(self, get_repository, maximum): |
| """ |
| Args: |
| - maximum (float): An upper bound on the min_distance to |
| - consider. This argument is optional and defaults to |
| - ``DEFAULT_MAXIMUM``. |
| + maximum (float): An upper bound on the min_distance to consider. |
| """ |
| - if maximum is None: |
| - maximum = DEFAULT_MAXIMUM |
| + self._get_repository = get_repository |
| self._maximum = maximum |
| @property |
| def name(self): |
| return 'MinDistance' |
| + def DistanceInfoBetweenTouchedFileAndStacktrace( |
|
Martin Barbella
2017/01/24 22:06:40
"Info" seems unnecessary to me, especially since i
Sharu Jiang
2017/01/25 19:13:31
Deleted it.
|
| + self, revision, touched_file, stack_infos, crash_dependency): |
| + """Gets ``DistanceInfo`` between touched lines and crashed lines for a file. |
| + |
| + Args: |
| + revision (str): The revision of the suspect. |
| + touched_file (FileChangeInfo): The file touched by the suspect. |
| + stack_infos (list of StackInfos): List of information of frames in the |
| + stacktrace which contains ``touched_file``. |
| + crash_dependency (Dependency): The depedency of crashed revision. N.B. The |
| + crashed revision is the revision where crash happens, however the |
| + first parameter ``revision`` is the revision of the suspect cl, which |
| + must be before the crashed revision. |
| + |
| + Returns: |
| + ``DistanceInfo`` object of touched file and stacktrace. |
| + """ |
| + # TODO(katesonia) ``GetBlame`` is called for the same file everytime |
| + # there is a suspect that touched it, which can be very expensive. |
| + # The blame information can either be cached through repository (cached |
| + # by memcache based on repo url, revision and file path), or this |
| + # function can have a static in-memory cache to cache blame for touched |
| + # files, however since blame information is big, it's not a good idea to |
| + # keep it in memory. |
| + repository = self._get_repository(crash_dependency.repo_url) |
| + blame = repository.GetBlame(touched_file.new_path, |
| + crash_dependency.revision) |
| + if not blame: |
| + logging.warning('Failed to get blame information for %s', |
| + touched_file.new_path) |
| + return None |
| + |
| + # Distance of this file. |
| + distance_info = DistanceInfo(float('inf'), None) |
| + for region in blame: |
| + if region.revision != revision: |
| + continue |
| + |
| + region_start = region.start |
| + region_end = region_start + region.count - 1 |
| + for stack_info in stack_infos: |
| + frame_start = stack_info.frame.crashed_line_numbers[0] |
| + frame_end = stack_info.frame.crashed_line_numbers[-1] |
| + distance = DistanceBetweenLineRanges((frame_start, frame_end), |
| + (region_start, region_end)) |
| + distance_info.Update(distance, stack_info.frame) |
| + |
| + return distance_info |
| + |
| def __call__(self, report): |
| - """Returns the scaled min ``AnalysisInfo.min_distance`` across all files. |
| + """Returns the scaled min ``DistanceInfo.distance`` across all files. |
| Args: |
| report (CrashReport): the crash report being analyzed. |
| @@ -60,30 +154,65 @@ class MinDistanceFeature(Feature): |
| for) a stack frame in that suspect and the CL in that suspect, as a |
| log-domain ``float``. |
| """ |
| - def FeatureValueGivenReport(suspect): |
| - analyses = suspect.file_to_analysis_info |
| - if not analyses: |
| - message = 'No AnalysisInfo for any file in suspect: %s' % str(suspect) |
| - logging.warning(message) |
| - return FeatureValue(self.name, lmath.LOG_ZERO, message, None) |
| - |
| - min_distance = min(per_file_analysis.min_distance |
| - for per_file_analysis in analyses.itervalues()) |
| + def FeatureValueGivenReport(suspect, touched_file_to_stack_infos): |
| + """Function mapping suspect related data to MinDistance FeatureValue. |
| + |
| + Args: |
| + suspect (Suspect): The suspected changelog and some meta information |
| + about it. |
| + touched_file_to_stack_infos(dict): Dict mapping ``FileChangeInfo`` to |
| + a list of ``StackInfo``s representing all the frames that the suspect |
| + touched. |
| + |
| + Returns: |
| + The ``FeatureValue`` of this feature. |
| + """ |
| + if not touched_file_to_stack_infos: |
| + FeatureValue(self.name, lmath.LOG_ZERO, |
| + 'No file got touched by the suspect.', None) |
| + |
| + distance_info = DistanceInfo(float('inf'), None) |
| + touched_file_to_distance_info = {} |
| + for touched_file, stack_infos in touched_file_to_stack_infos.iteritems(): |
| + distance_info_per_file = ( |
| + self.DistanceInfoBetweenTouchedFileAndStacktrace( |
| + suspect.changelog.revision, touched_file, stack_infos, |
| + report.dependencies[suspect.dep_path])) |
| + # Failed to get blame information of a file. |
| + if not distance_info_per_file: |
| + logging.warning('suspect\'s change cannot be blamed due to lack of' |
| + 'blame information for crashed file %s' % |
| + touched_file.new_path) |
| + continue |
| + |
| + # It is possible that a changelog doesn't show in the blame of a file, |
| + # in this case, treat the changelog as if it didn't change the file. |
| + if distance_info_per_file.IsInfinity(): |
| + continue |
| + |
| + touched_file_to_distance_info[touched_file] = distance_info_per_file |
| + distance_info.Update(distance_info_per_file.distance, |
| + distance_info_per_file.frame) |
| return FeatureValue( |
| name = self.name, |
| - value = LogLinearlyScaled(float(min_distance), float(self._maximum)), |
| - reason = ('Minimum distance is %d' % min_distance), |
| - changed_files = self._ChangedFiles(suspect), |
| - ) |
| + value = LogLinearlyScaled(float(distance_info.distance), |
| + float(self._maximum)), |
| + reason = ('Minimum distance is %f' % float(distance_info.distance)), |
| + changed_files = MinDistanceFeature.ChangedFiles( |
| + suspect, touched_file_to_distance_info, report.crashed_version)) |
| return FeatureValueGivenReport |
| - def _ChangedFiles(self, suspect): |
| - """Get all the changed files causing this feature to blame this suspect. |
| + @staticmethod |
| + def ChangedFiles(suspect, touched_file_to_distance_info, crashed_version): |
| + """Get all the changed files causing this feature to blame this result. |
| Arg: |
| suspect (Suspect): the suspect being blamed. |
| + touched_file_to_distance_info (dict): Dict mapping file name to |
| + ``DistanceInfo``s. |
| + crashed_version (str): Crashed version. |
| Returns: |
| List of ``ChangedFile`` objects sorted by frame index. For example: |
| @@ -94,34 +223,29 @@ class MinDistanceFeature(Feature): |
| reasons = ['Minimum distance (LOC) 1, frame #5'] |
| )] |
| """ |
| - index_to_changed_files = {} |
| - |
| - for file_path, analysis_info in suspect.file_to_analysis_info.iteritems(): |
| - file_name = file_path.split('/')[-1] |
| - frame = analysis_info.min_distance_frame |
| - if frame is None: # pragma: no cover |
| - logging.warning('Missing the min_distance_frame for %s' |
| - % str(analysis_info)) |
| - continue |
| - |
| - # It is possible that a changelog doesn't show in the blame of a file, |
| - # in this case, treat the changelog as if it didn't change the file. |
| - if math.isinf(analysis_info.min_distance): # pragma: no cover |
| - logging.warning('min_distance is infinite for %s' % str(analysis_info)) |
| + frame_index_to_changed_files = {} |
| + |
| + for touched_file, distance_info in ( |
| + touched_file_to_distance_info.iteritems()): |
| + file_name = touched_file.new_path.split('/')[-1] |
| + if distance_info.frame is None: # pragma: no cover |
| + logging.warning('Missing the min_distance_frame for file %s' % |
| + file_name) |
| continue |
| - index_to_changed_files[frame.index] = ChangedFile( |
| - name = file_name, |
| - blame_url = frame.BlameUrl(suspect.changelog.revision), |
| - reasons = ['Minimum distance (LOC) %d, frame #%d' % ( |
| - analysis_info.min_distance, frame.index)] |
| - ) |
| + frame_index_to_changed_files[distance_info.frame.index] = ChangedFile( |
| + name=file_name, |
| + blame_url=distance_info.frame.BlameUrl(crashed_version), |
| + reasons=['Distance from touched lines and crashed lines is %d, in ' |
| + 'frame #%d' % (distance_info.distance, |
| + distance_info.frame.index)]) |
| - if not index_to_changed_files: # pragma: no cover |
| + if not frame_index_to_changed_files: # pragma: no cover |
| logging.warning('Found no changed files for suspect: %s', str(suspect)) |
| + return [] |
| # Sort changed file by frame index. |
| - _, changed_files = zip(*sorted(index_to_changed_files.items(), |
| + _, changed_files = zip(*sorted(frame_index_to_changed_files.iteritems(), |
| key=lambda x: x[0])) |
| return list(changed_files) |