Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(86)

Side by Side Diff: appengine/findit/crash/loglinear/changelist_features/min_distance.py

Issue 2613153006: [Predator] Add TouchCrashedFileMetaFeature. (Closed)
Patch Set: Fix nits. Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2016 The Chromium Authors. All rights reserved. 1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import logging 5 import logging
6 import math 6 import math
7 7
8 from crash.loglinear.changelist_classifier import StackInfo
8 from crash.loglinear.feature import ChangedFile 9 from crash.loglinear.feature import ChangedFile
9 from crash.loglinear.feature import Feature 10 from crash.loglinear.feature import Feature
10 from crash.loglinear.feature import FeatureValue 11 from crash.loglinear.feature import FeatureValue
11 from crash.loglinear.feature import LogLinearlyScaled 12 from crash.loglinear.feature import LogLinearlyScaled
13 from libs.gitiles.diff import ChangeType
12 import libs.math.logarithms as lmath 14 import libs.math.logarithms as lmath
13 15
14 # N.B., this must not be infinity, else we'll start getting NaN values 16
15 # from LinearMinDistanceFeature (and SquaredMinDistanceFeature). 17 # TODO(katesonia): we should change things to use integers with None as
Martin Barbella 2017/01/24 22:06:40 Could you move this to IsInfinity?
Sharu Jiang 2017/01/25 19:13:31 Done.
16 DEFAULT_MAXIMUM = 50 18 # \"infinity\", rather than using floats.
19 # TODO(http://crbug.com/644476): this class needs a better name.
20 class DistanceInfo(object):
Martin Barbella 2017/01/24 22:06:40 Agreed that DistanceInfo isn't the best name for t
Sharu Jiang 2017/01/25 19:13:31 Changed to ``ModifiedFrameInfo``.
21
22 def __init__(self, distance, frame):
23 self.distance = distance
24 self.frame = frame
25
26 def Update(self, distance, frame):
27 if distance < self.distance:
28 self.distance = distance
29 self.frame = frame
30
31 def IsInfinity(self):
32 return math.isinf(self.distance)
33
34 def __str__(self): # pragma: no cover
35 return 'Min distance(distance = %f, frame = %s)' % (float(self.distance),
36 str(self.frame))
37
38 def __eq__(self, other):
39 return self.distance == other.distance and self.frame == other.frame
40
41
42 def DistanceBetweenLineRanges((start1, end1), (start2, end2)):
43 """Given two ranges, compute the (unsigned) distance between them.
44
45 Args:
46 start1 (int): the first line included in the first range.
47 end1 (int): the last line included in the first range. Must be
48 greater than or equal to ``start1``.
49 start2 (int): the first line included in the second range.
50 end2 (int): the last line included in the second range. Must be
51 greater than or equal to ``start1``.
52
53 Returns:
54 If the end of the earlier range comes before the start of the later
55 range, then the difference between those points. Otherwise, returns
56 zero (because the ranges overlap).
57 """
58 if end1 < start1:
59 raise ValueError('the first range is empty: %d < %d' % (end1, start1))
60 if end2 < start2:
61 raise ValueError('the second range is empty: %d < %d' % (end2, start2))
62 # There are six possible cases, but in all the cases where the two
63 # ranges overlap, the latter two differences will be negative.
64 return max(0, start2 - end1, start1 - end2)
17 65
18 66
19 class MinDistanceFeature(Feature): 67 class MinDistanceFeature(Feature):
20 """Returns the minimum min_distance scaled between -inf and 0. 68 """Returns the minimum min_distance scaled between -inf and 0.
21 69
22 That is, the normal-domain value is scaled linearly between 0 and 1, 70 That is, the normal-domain value is scaled linearly between 0 and 1,
23 but since we want to return a log-domain value we take the logarithm 71 but since we want to return a log-domain value we take the logarithm
24 of that (hence -inf to 0). This ensures that when a suspect has a 72 of that (hence -inf to 0). This ensures that when a suspect has a
25 linearly-scaled value of 0 (aka log-scaled value of -inf) we absolutely 73 linearly-scaled value of 0 (aka log-scaled value of -inf) we absolutely
26 refuse to blame that suspect. This heuristic behavior is intended. Before 74 refuse to blame that suspect. This heuristic behavior is intended. Before
27 changing it to be less aggressive about refusing to blame the suspect, 75 changing it to be less aggressive about refusing to blame the suspect,
28 we should delta test to be sure the new heuristic acts as indented. 76 we should delta test to be sure the new heuristic acts as indented.
29 77
30 When the actual minimum min_distance is zero, we return the log-domain 78 When the actual minimum min_distance is zero, we return the log-domain
31 value 0 (aka normal-domain value of 1). When the suspect has no files 79 value 0 (aka normal-domain value of 1). When the suspect has no files
32 or the actual minimum min_distance is greater than the ``maximum``, 80 or the actual minimum min_distance is greater than the ``maximum``,
33 we return the log-domain value -inf (aka normal-domain value of 0). In 81 we return the log-domain value -inf (aka normal-domain value of 0). In
34 between we scale the normal-domain values linearly, which means the 82 between we scale the normal-domain values linearly, which means the
35 log-domain values are scaled exponentially. 83 log-domain values are scaled exponentially.
36 """ 84 """
37 def __init__(self, maximum=None): 85 def __init__(self, get_repository, maximum):
38 """ 86 """
39 Args: 87 Args:
40 maximum (float): An upper bound on the min_distance to 88 maximum (float): An upper bound on the min_distance to consider.
41 consider. This argument is optional and defaults to
42 ``DEFAULT_MAXIMUM``.
43 """ 89 """
44 if maximum is None: 90 self._get_repository = get_repository
45 maximum = DEFAULT_MAXIMUM
46 self._maximum = maximum 91 self._maximum = maximum
47 92
48 @property 93 @property
49 def name(self): 94 def name(self):
50 return 'MinDistance' 95 return 'MinDistance'
51 96
97 def DistanceInfoBetweenTouchedFileAndStacktrace(
Martin Barbella 2017/01/24 22:06:40 "Info" seems unnecessary to me, especially since i
Sharu Jiang 2017/01/25 19:13:31 Deleted it.
98 self, revision, touched_file, stack_infos, crash_dependency):
99 """Gets ``DistanceInfo`` between touched lines and crashed lines for a file.
100
101 Args:
102 revision (str): The revision of the suspect.
103 touched_file (FileChangeInfo): The file touched by the suspect.
104 stack_infos (list of StackInfos): List of information of frames in the
105 stacktrace which contains ``touched_file``.
106 crash_dependency (Dependency): The depedency of crashed revision. N.B. The
107 crashed revision is the revision where crash happens, however the
108 first parameter ``revision`` is the revision of the suspect cl, which
109 must be before the crashed revision.
110
111 Returns:
112 ``DistanceInfo`` object of touched file and stacktrace.
113 """
114 # TODO(katesonia) ``GetBlame`` is called for the same file everytime
115 # there is a suspect that touched it, which can be very expensive.
116 # The blame information can either be cached through repository (cached
117 # by memcache based on repo url, revision and file path), or this
118 # function can have a static in-memory cache to cache blame for touched
119 # files, however since blame information is big, it's not a good idea to
120 # keep it in memory.
121 repository = self._get_repository(crash_dependency.repo_url)
122 blame = repository.GetBlame(touched_file.new_path,
123 crash_dependency.revision)
124 if not blame:
125 logging.warning('Failed to get blame information for %s',
126 touched_file.new_path)
127 return None
128
129 # Distance of this file.
130 distance_info = DistanceInfo(float('inf'), None)
131 for region in blame:
132 if region.revision != revision:
133 continue
134
135 region_start = region.start
136 region_end = region_start + region.count - 1
137 for stack_info in stack_infos:
138 frame_start = stack_info.frame.crashed_line_numbers[0]
139 frame_end = stack_info.frame.crashed_line_numbers[-1]
140 distance = DistanceBetweenLineRanges((frame_start, frame_end),
141 (region_start, region_end))
142 distance_info.Update(distance, stack_info.frame)
143
144 return distance_info
145
52 def __call__(self, report): 146 def __call__(self, report):
53 """Returns the scaled min ``AnalysisInfo.min_distance`` across all files. 147 """Returns the scaled min ``DistanceInfo.distance`` across all files.
54 148
55 Args: 149 Args:
56 report (CrashReport): the crash report being analyzed. 150 report (CrashReport): the crash report being analyzed.
57 151
58 Returns: 152 Returns:
59 A function from ``Suspect`` to the minimum distance between (the code 153 A function from ``Suspect`` to the minimum distance between (the code
60 for) a stack frame in that suspect and the CL in that suspect, as a 154 for) a stack frame in that suspect and the CL in that suspect, as a
61 log-domain ``float``. 155 log-domain ``float``.
62 """ 156 """
63 def FeatureValueGivenReport(suspect): 157 def FeatureValueGivenReport(suspect, touched_file_to_stack_infos):
64 analyses = suspect.file_to_analysis_info 158 """Function mapping suspect related data to MinDistance FeatureValue.
65 if not analyses:
66 message = 'No AnalysisInfo for any file in suspect: %s' % str(suspect)
67 logging.warning(message)
68 return FeatureValue(self.name, lmath.LOG_ZERO, message, None)
69 159
70 min_distance = min(per_file_analysis.min_distance 160 Args:
71 for per_file_analysis in analyses.itervalues()) 161 suspect (Suspect): The suspected changelog and some meta information
162 about it.
163 touched_file_to_stack_infos(dict): Dict mapping ``FileChangeInfo`` to
164 a list of ``StackInfo``s representing all the frames that the suspect
165 touched.
166
167 Returns:
168 The ``FeatureValue`` of this feature.
169 """
170 if not touched_file_to_stack_infos:
171 FeatureValue(self.name, lmath.LOG_ZERO,
172 'No file got touched by the suspect.', None)
173
174 distance_info = DistanceInfo(float('inf'), None)
175 touched_file_to_distance_info = {}
176 for touched_file, stack_infos in touched_file_to_stack_infos.iteritems():
177 distance_info_per_file = (
178 self.DistanceInfoBetweenTouchedFileAndStacktrace(
179 suspect.changelog.revision, touched_file, stack_infos,
180 report.dependencies[suspect.dep_path]))
181 # Failed to get blame information of a file.
182 if not distance_info_per_file:
183 logging.warning('suspect\'s change cannot be blamed due to lack of'
184 'blame information for crashed file %s' %
185 touched_file.new_path)
186 continue
187
188 # It is possible that a changelog doesn't show in the blame of a file,
189 # in this case, treat the changelog as if it didn't change the file.
190 if distance_info_per_file.IsInfinity():
191 continue
192
193 touched_file_to_distance_info[touched_file] = distance_info_per_file
194 distance_info.Update(distance_info_per_file.distance,
195 distance_info_per_file.frame)
72 196
73 return FeatureValue( 197 return FeatureValue(
74 name = self.name, 198 name = self.name,
75 value = LogLinearlyScaled(float(min_distance), float(self._maximum)), 199 value = LogLinearlyScaled(float(distance_info.distance),
76 reason = ('Minimum distance is %d' % min_distance), 200 float(self._maximum)),
77 changed_files = self._ChangedFiles(suspect), 201 reason = ('Minimum distance is %f' % float(distance_info.distance)),
78 ) 202 changed_files = MinDistanceFeature.ChangedFiles(
203 suspect, touched_file_to_distance_info, report.crashed_version))
79 204
80 return FeatureValueGivenReport 205 return FeatureValueGivenReport
81 206
82 def _ChangedFiles(self, suspect): 207 @staticmethod
83 """Get all the changed files causing this feature to blame this suspect. 208 def ChangedFiles(suspect, touched_file_to_distance_info, crashed_version):
209 """Get all the changed files causing this feature to blame this result.
84 210
85 Arg: 211 Arg:
86 suspect (Suspect): the suspect being blamed. 212 suspect (Suspect): the suspect being blamed.
213 touched_file_to_distance_info (dict): Dict mapping file name to
214 ``DistanceInfo``s.
215 crashed_version (str): Crashed version.
87 216
88 Returns: 217 Returns:
89 List of ``ChangedFile`` objects sorted by frame index. For example: 218 List of ``ChangedFile`` objects sorted by frame index. For example:
90 219
91 [ChangedFile( 220 [ChangedFile(
92 file = 'render_frame_impl.cc', 221 file = 'render_frame_impl.cc',
93 blame_url = 'https://chr.com/../render_frame_impl.cc#1586', 222 blame_url = 'https://chr.com/../render_frame_impl.cc#1586',
94 reasons = ['Minimum distance (LOC) 1, frame #5'] 223 reasons = ['Minimum distance (LOC) 1, frame #5']
95 )] 224 )]
96 """ 225 """
97 index_to_changed_files = {} 226 frame_index_to_changed_files = {}
98 227
99 for file_path, analysis_info in suspect.file_to_analysis_info.iteritems(): 228 for touched_file, distance_info in (
100 file_name = file_path.split('/')[-1] 229 touched_file_to_distance_info.iteritems()):
101 frame = analysis_info.min_distance_frame 230 file_name = touched_file.new_path.split('/')[-1]
102 if frame is None: # pragma: no cover 231 if distance_info.frame is None: # pragma: no cover
103 logging.warning('Missing the min_distance_frame for %s' 232 logging.warning('Missing the min_distance_frame for file %s' %
104 % str(analysis_info)) 233 file_name)
105 continue 234 continue
106 235
107 # It is possible that a changelog doesn't show in the blame of a file, 236 frame_index_to_changed_files[distance_info.frame.index] = ChangedFile(
108 # in this case, treat the changelog as if it didn't change the file. 237 name=file_name,
109 if math.isinf(analysis_info.min_distance): # pragma: no cover 238 blame_url=distance_info.frame.BlameUrl(crashed_version),
110 logging.warning('min_distance is infinite for %s' % str(analysis_info)) 239 reasons=['Distance from touched lines and crashed lines is %d, in '
111 continue 240 'frame #%d' % (distance_info.distance,
241 distance_info.frame.index)])
112 242
113 index_to_changed_files[frame.index] = ChangedFile( 243 if not frame_index_to_changed_files: # pragma: no cover
114 name = file_name,
115 blame_url = frame.BlameUrl(suspect.changelog.revision),
116 reasons = ['Minimum distance (LOC) %d, frame #%d' % (
117 analysis_info.min_distance, frame.index)]
118 )
119
120 if not index_to_changed_files: # pragma: no cover
121 logging.warning('Found no changed files for suspect: %s', str(suspect)) 244 logging.warning('Found no changed files for suspect: %s', str(suspect))
245 return []
122 246
123 # Sort changed file by frame index. 247 # Sort changed file by frame index.
124 _, changed_files = zip(*sorted(index_to_changed_files.items(), 248 _, changed_files = zip(*sorted(frame_index_to_changed_files.iteritems(),
125 key=lambda x: x[0])) 249 key=lambda x: x[0]))
126 250
127 return list(changed_files) 251 return list(changed_files)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698