Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(670)

Side by Side Diff: appengine/findit/crash/loglinear/changelist_features/min_distance.py

Issue 2613153006: [Predator] Add TouchCrashedFileMetaFeature. (Closed)
Patch Set: Add comments. Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2016 The Chromium Authors. All rights reserved. 1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import logging 5 import logging
6 import math 6 import math
7 7
8 from crash.loglinear.changelist_classifier import StackInfo
8 from crash.loglinear.feature import ChangedFile 9 from crash.loglinear.feature import ChangedFile
9 from crash.loglinear.feature import Feature 10 from crash.loglinear.feature import Feature
10 from crash.loglinear.feature import FeatureValue 11 from crash.loglinear.feature import FeatureValue
11 from crash.loglinear.feature import LogLinearlyScaled 12 from crash.loglinear.feature import LogLinearlyScaled
13 from libs.gitiles.diff import ChangeType
12 import libs.math.logarithms as lmath 14 import libs.math.logarithms as lmath
13 15
14 # N.B., this must not be infinity, else we'll start getting NaN values 16
15 # from LinearMinDistanceFeature (and SquaredMinDistanceFeature). 17
16 DEFAULT_MAXIMUM = 50 18 class ModifiedFrameInfo(object):
19 """Represents the closest frame to a changelog which modified it.
20
21 The "closest" means that the distance between crashed lines in the frame and
22 touched lines in a changelog is minimum.
23
24 Properties:
25 distance (int or float('inf')): The distance between crashed lines and
26 touched lines, if a changelog doesn't show in blame of the crashed file of
27 the crashed version (either it didn't touch the crashed file or it got
28 overwritten by other cls), the distance would be infinite.
29 frame (StackFrame): The frame which got modified.
30 """
31
32 def __init__(self, distance, frame):
33 self.distance = distance
34 self.frame = frame
35
36 def Update(self, distance, frame):
37 if distance < self.distance:
38 self.distance = distance
39 self.frame = frame
40
41 # TODO(katesonia): we should change things to use integers with None as
42 # \"infinity\", rather than using floats.
43 def IsInfinity(self):
44 return math.isinf(self.distance)
45
46 def __str__(self): # pragma: no cover
47 return 'Min distance(distance = %f, frame = %s)' % (float(self.distance),
48 str(self.frame))
49
50 def __eq__(self, other):
51 return self.distance == other.distance and self.frame == other.frame
52
53
54 def DistanceBetweenLineRanges((start1, end1), (start2, end2)):
55 """Given two ranges, compute the (unsigned) distance between them.
56
57 Args:
58 start1 (int): the first line included in the first range.
59 end1 (int): the last line included in the first range. Must be
60 greater than or equal to ``start1``.
61 start2 (int): the first line included in the second range.
62 end2 (int): the last line included in the second range. Must be
63 greater than or equal to ``start1``.
64
65 Returns:
66 If the end of the earlier range comes before the start of the later
67 range, then the difference between those points. Otherwise, returns
68 zero (because the ranges overlap).
69 """
70 if end1 < start1:
71 raise ValueError('the first range is empty: %d < %d' % (end1, start1))
72 if end2 < start2:
73 raise ValueError('the second range is empty: %d < %d' % (end2, start2))
74 # There are six possible cases, but in all the cases where the two
75 # ranges overlap, the latter two differences will be negative.
76 return max(0, start2 - end1, start1 - end2)
17 77
18 78
19 class MinDistanceFeature(Feature): 79 class MinDistanceFeature(Feature):
20 """Returns the minimum min_distance scaled between -inf and 0. 80 """Returns the minimum min_distance scaled between -inf and 0.
21 81
22 That is, the normal-domain value is scaled linearly between 0 and 1, 82 That is, the normal-domain value is scaled linearly between 0 and 1,
23 but since we want to return a log-domain value we take the logarithm 83 but since we want to return a log-domain value we take the logarithm
24 of that (hence -inf to 0). This ensures that when a suspect has a 84 of that (hence -inf to 0). This ensures that when a suspect has a
25 linearly-scaled value of 0 (aka log-scaled value of -inf) we absolutely 85 linearly-scaled value of 0 (aka log-scaled value of -inf) we absolutely
26 refuse to blame that suspect. This heuristic behavior is intended. Before 86 refuse to blame that suspect. This heuristic behavior is intended. Before
27 changing it to be less aggressive about refusing to blame the suspect, 87 changing it to be less aggressive about refusing to blame the suspect,
28 we should delta test to be sure the new heuristic acts as indented. 88 we should delta test to be sure the new heuristic acts as indented.
29 89
30 When the actual minimum min_distance is zero, we return the log-domain 90 When the actual minimum min_distance is zero, we return the log-domain
31 value 0 (aka normal-domain value of 1). When the suspect has no files 91 value 0 (aka normal-domain value of 1). When the suspect has no files
32 or the actual minimum min_distance is greater than the ``maximum``, 92 or the actual minimum min_distance is greater than the ``maximum``,
33 we return the log-domain value -inf (aka normal-domain value of 0). In 93 we return the log-domain value -inf (aka normal-domain value of 0). In
34 between we scale the normal-domain values linearly, which means the 94 between we scale the normal-domain values linearly, which means the
35 log-domain values are scaled exponentially. 95 log-domain values are scaled exponentially.
36 """ 96 """
37 def __init__(self, maximum=None): 97 def __init__(self, get_repository, maximum):
38 """ 98 """
39 Args: 99 Args:
40 maximum (float): An upper bound on the min_distance to 100 maximum (float): An upper bound on the min_distance to consider.
41 consider. This argument is optional and defaults to
42 ``DEFAULT_MAXIMUM``.
43 """ 101 """
44 if maximum is None: 102 self._get_repository = get_repository
45 maximum = DEFAULT_MAXIMUM
46 self._maximum = maximum 103 self._maximum = maximum
47 104
48 @property 105 @property
49 def name(self): 106 def name(self):
50 return 'MinDistance' 107 return 'MinDistance'
51 108
109 def DistanceBetweenTouchedFileAndStacktrace(
110 self, revision, touched_file, stack_infos, crash_dependency):
111 """Gets ``ModifiedFrameInfo`` between touched and crashed lines in a file.
112
113 Args:
114 revision (str): The revision of the suspect.
115 touched_file (FileChangeInfo): The file touched by the suspect.
116 stack_infos (list of StackInfos): List of information of frames in the
117 stacktrace which contains ``touched_file``.
118 crash_dependency (Dependency): The depedency of crashed revision. N.B. The
119 crashed revision is the revision where crash happens, however the
120 first parameter ``revision`` is the revision of the suspect cl, which
121 must be before the crashed revision.
122
123 Returns:
124 ``ModifiedFrameInfo`` object of touched file and stacktrace.
125 """
126 # TODO(katesonia) ``GetBlame`` is called for the same file everytime
127 # there is a suspect that touched it, which can be very expensive.
128 # The blame information can either be cached through repository (cached
129 # by memcache based on repo url, revision and file path), or this
130 # function can have a static in-memory cache to cache blame for touched
131 # files, however since blame information is big, it's not a good idea to
132 # keep it in memory.
133 repository = self._get_repository(crash_dependency.repo_url)
134 blame = repository.GetBlame(touched_file.new_path,
135 crash_dependency.revision)
136 if not blame:
137 logging.warning('Failed to get blame information for %s',
138 touched_file.new_path)
139 return None
140
141 # Distance of this file.
142 modified_frame_info = ModifiedFrameInfo(float('inf'), None)
143 for region in blame:
144 if region.revision != revision:
145 continue
146
147 region_start = region.start
148 region_end = region_start + region.count - 1
149 for stack_info in stack_infos:
150 frame_start = stack_info.frame.crashed_line_numbers[0]
151 frame_end = stack_info.frame.crashed_line_numbers[-1]
152 distance = DistanceBetweenLineRanges((frame_start, frame_end),
153 (region_start, region_end))
154 modified_frame_info.Update(distance, stack_info.frame)
155
156 return modified_frame_info
157
52 def __call__(self, report): 158 def __call__(self, report):
53 """Returns the scaled min ``AnalysisInfo.min_distance`` across all files. 159 """Returns the scaled min ``ModifiedFrameInfo.distance`` across all files.
54 160
55 Args: 161 Args:
56 report (CrashReport): the crash report being analyzed. 162 report (CrashReport): the crash report being analyzed.
57 163
58 Returns: 164 Returns:
59 A function from ``Suspect`` to the minimum distance between (the code 165 A function from ``Suspect`` to the minimum distance between (the code
60 for) a stack frame in that suspect and the CL in that suspect, as a 166 for) a stack frame in that suspect and the CL in that suspect, as a
61 log-domain ``float``. 167 log-domain ``float``.
62 """ 168 """
63 def FeatureValueGivenReport(suspect): 169 def FeatureValueGivenReport(suspect, touched_file_to_stack_infos):
64 analyses = suspect.file_to_analysis_info 170 """Function mapping suspect related data to MinDistance FeatureValue.
65 if not analyses:
66 message = 'No AnalysisInfo for any file in suspect: %s' % str(suspect)
67 logging.warning(message)
68 return FeatureValue(self.name, lmath.LOG_ZERO, message, None)
69 171
70 min_distance = min(per_file_analysis.min_distance 172 Args:
71 for per_file_analysis in analyses.itervalues()) 173 suspect (Suspect): The suspected changelog and some meta information
174 about it.
175 touched_file_to_stack_infos(dict): Dict mapping ``FileChangeInfo`` to
176 a list of ``StackInfo``s representing all the frames that the suspect
177 touched.
178
179 Returns:
180 The ``FeatureValue`` of this feature.
181 """
182 if not touched_file_to_stack_infos:
183 FeatureValue(self.name, lmath.LOG_ZERO,
184 'No file got touched by the suspect.', None)
185
186 modified_frame_info = ModifiedFrameInfo(float('inf'), None)
187 touched_file_to_modified_frame_info = {}
188 for touched_file, stack_infos in touched_file_to_stack_infos.iteritems():
189 # Records the closest frame (the frame has minimum distance between
190 # crashed lines and touched lines) for each touched file of the suspect.
191 modified_frame_info_per_file = (
192 self.DistanceBetweenTouchedFileAndStacktrace(
193 suspect.changelog.revision, touched_file, stack_infos,
194 report.dependencies[suspect.dep_path]))
195 # Failed to get blame information of a file.
196 if not modified_frame_info_per_file:
197 logging.warning('suspect\'s change cannot be blamed due to lack of'
198 'blame information for crashed file %s' %
199 touched_file.new_path)
200 continue
201
202 # It is possible that a changelog doesn't show in the blame of a file,
203 # in this case, treat the changelog as if it didn't change the file.
204 if modified_frame_info_per_file.IsInfinity():
205 continue
206
207 touched_file_to_modified_frame_info[
208 touched_file] = modified_frame_info_per_file
209 modified_frame_info.Update(modified_frame_info_per_file.distance,
210 modified_frame_info_per_file.frame)
72 211
73 return FeatureValue( 212 return FeatureValue(
74 name = self.name, 213 name = self.name,
75 value = LogLinearlyScaled(float(min_distance), float(self._maximum)), 214 value = LogLinearlyScaled(float(modified_frame_info.distance),
76 reason = ('Minimum distance is %d' % min_distance 215 float(self._maximum)),
77 if not math.isinf(min_distance) else 216 reason = ('Minimum distance is %d' % int(modified_frame_info.distance)
217 if not math.isinf(modified_frame_info.distance) else
78 'Minimum distance is infinity'), 218 'Minimum distance is infinity'),
79 changed_files = self._ChangedFiles(suspect), 219 changed_files = MinDistanceFeature.ChangedFiles(
80 ) 220 suspect, touched_file_to_modified_frame_info,
221 report.crashed_version))
81 222
82 return FeatureValueGivenReport 223 return FeatureValueGivenReport
83 224
84 def _ChangedFiles(self, suspect): 225 @staticmethod
85 """Get all the changed files causing this feature to blame this suspect. 226 def ChangedFiles(suspect, touched_file_to_modified_frame_info,
227 crashed_version):
228 """Get all the changed files causing this feature to blame this result.
86 229
87 Arg: 230 Arg:
88 suspect (Suspect): the suspect being blamed. 231 suspect (Suspect): the suspect being blamed.
232 touched_file_to_modified_frame_info (dict): Dict mapping file name to
233 ``ModifiedFrameInfo``s.
234 crashed_version (str): Crashed version.
89 235
90 Returns: 236 Returns:
91 List of ``ChangedFile`` objects sorted by frame index. For example: 237 List of ``ChangedFile`` objects sorted by frame index. For example:
92 238
93 [ChangedFile( 239 [ChangedFile(
94 file = 'render_frame_impl.cc', 240 file = 'render_frame_impl.cc',
95 blame_url = 'https://chr.com/../render_frame_impl.cc#1586', 241 blame_url = 'https://chr.com/../render_frame_impl.cc#1586',
96 reasons = ['Minimum distance (LOC) 1, frame #5'] 242 reasons = ['Minimum distance (LOC) 1, frame #5']
97 )] 243 )]
98 """ 244 """
99 index_to_changed_files = {} 245 frame_index_to_changed_files = {}
100 246
101 for file_path, analysis_info in suspect.file_to_analysis_info.iteritems(): 247 for touched_file, modified_frame_info in (
102 file_name = file_path.split('/')[-1] 248 touched_file_to_modified_frame_info.iteritems()):
103 frame = analysis_info.min_distance_frame 249 file_name = touched_file.new_path.split('/')[-1]
104 if frame is None: # pragma: no cover 250 if modified_frame_info.frame is None: # pragma: no cover
105 logging.warning('Missing the min_distance_frame for %s' 251 logging.warning('Missing the min_distance_frame for file %s' %
106 % str(analysis_info)) 252 file_name)
107 continue 253 continue
108 254
109 # It is possible that a changelog doesn't show in the blame of a file, 255 frame_index_to_changed_files[
110 # in this case, treat the changelog as if it didn't change the file. 256 modified_frame_info.frame.index] = ChangedFile(
111 if math.isinf(analysis_info.min_distance): # pragma: no cover 257 name=file_name,
112 logging.warning('min_distance is infinite for %s' % str(analysis_info)) 258 blame_url=modified_frame_info.frame.BlameUrl(crashed_version),
113 continue 259 reasons=['Distance from touched lines and crashed lines is %d, in'
260 ' frame #%d' % (modified_frame_info.distance,
261 modified_frame_info.frame.index)])
114 262
115 index_to_changed_files[frame.index] = ChangedFile( 263 if not frame_index_to_changed_files: # pragma: no cover
116 name = file_name,
117 blame_url = frame.BlameUrl(suspect.changelog.revision),
118 reasons = ['Minimum distance (LOC) %d, frame #%d' % (
119 analysis_info.min_distance, frame.index)]
120 )
121
122 if not index_to_changed_files: # pragma: no cover
123 logging.warning('Found no changed files for suspect: %s', str(suspect)) 264 logging.warning('Found no changed files for suspect: %s', str(suspect))
265 return []
124 266
125 # Sort changed file by frame index. 267 # Sort changed file by frame index.
126 _, changed_files = zip(*sorted(index_to_changed_files.items(), 268 _, changed_files = zip(*sorted(frame_index_to_changed_files.iteritems(),
127 key=lambda x: x[0])) 269 key=lambda x: x[0]))
128 270
129 return list(changed_files) 271 return list(changed_files)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698