Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(201)

Side by Side Diff: appengine/findit/crash/loglinear/changelist_features/min_distance.py

Issue 2613153006: [Predator] Add TouchCrashedFileMetaFeature. (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2016 The Chromium Authors. All rights reserved. 1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import logging 5 import logging
6 import math 6 import math
7 7
8 from crash.loglinear.changelist_classifier import StackInfo
8 from crash.loglinear.feature import ChangedFile 9 from crash.loglinear.feature import ChangedFile
9 from crash.loglinear.feature import Feature 10 from crash.loglinear.feature import Feature
10 from crash.loglinear.feature import FeatureValue 11 from crash.loglinear.feature import FeatureValue
11 from crash.loglinear.feature import LogLinearlyScaled 12 from crash.loglinear.feature import LogLinearlyScaled
13 from libs.gitiles.diff import ChangeType
12 import libs.math.logarithms as lmath 14 import libs.math.logarithms as lmath
13 15
14 # N.B., this must not be infinity, else we'll start getting NaN values 16 # N.B., this must not be infinity, else we'll start getting NaN values
15 # from LinearMinDistanceFeature (and SquaredMinDistanceFeature). 17 # from LinearMinDistanceFeature (and SquaredMinDistanceFeature).
16 DEFAULT_MAXIMUM = 50 18 DEFAULT_MAXIMUM = 50
17 19
18 20
21 # TODO(wrengr): we should change things to use integers with None as
chanli 2017/01/23 04:37:45 Nit: since wrengr has left the team, change the TO
Sharu Jiang 2017/01/23 21:44:46 Done.
22 # \"infinity\", rather than using floats.
23 # TODO(http://crbug.com/644476): this class needs a better name.
24 class DistanceInfo(object):
25
26 def __init__(self, distance, frame):
27 self.distance = distance
28 self.frame = frame
29
30 def Update(self, distance, frame):
31 if distance < self.distance:
32 self.distance = distance
33 self.frame = frame
34
35 def IsInfinity(self):
36 return math.isinf(self.distance)
37
38 def __str__(self): # pragma: no cover
39 return 'Min distance(distance = %f, frame = %s)' % (float(self.distance),
40 str(self.frame))
41
42 def __eq__(self, other):
43 return self.distance == other.distance and self.frame == other.frame
44
45
46 def DistanceBetweenLineRanges((start1, end1), (start2, end2)):
47 """Given two ranges, compute the (unsigned) distance between them.
48
49 Args:
50 start1 (int): the first line included in the first range.
51 end1 (int): the last line included in the first range. Must be
52 greater than or equal to ``start1``.
53 start2 (int): the first line included in the second range.
54 end2 (int): the last line included in the second range. Must be
55 greater than or equal to ``start1``.
56
57 Returns:
58 If the end of the earlier range comes before the start of the later
59 range, then the difference between those points. Otherwise, returns
60 zero (because the ranges overlap).
61 """
62 if end1 < start1:
63 raise ValueError('the first range is empty: %d < %d' % (end1, start1))
64 if end2 < start2:
65 raise ValueError('the second range is empty: %d < %d' % (end2, start2))
66 # There are six possible cases, but in all the cases where the two
67 # ranges overlap, the latter two differences will be negative.
68 return max(0, start2 - end1, start1 - end2)
69
70
19 class MinDistanceFeature(Feature): 71 class MinDistanceFeature(Feature):
20 """Returns the minimum min_distance scaled between -inf and 0. 72 """Returns the minimum min_distance scaled between -inf and 0.
21 73
22 That is, the normal-domain value is scaled linearly between 0 and 1, 74 That is, the normal-domain value is scaled linearly between 0 and 1,
23 but since we want to return a log-domain value we take the logarithm 75 but since we want to return a log-domain value we take the logarithm
24 of that (hence -inf to 0). This ensures that when a suspect has a 76 of that (hence -inf to 0). This ensures that when a suspect has a
25 linearly-scaled value of 0 (aka log-scaled value of -inf) we absolutely 77 linearly-scaled value of 0 (aka log-scaled value of -inf) we absolutely
26 refuse to blame that suspect. This heuristic behavior is intended. Before 78 refuse to blame that suspect. This heuristic behavior is intended. Before
27 changing it to be less aggressive about refusing to blame the suspect, 79 changing it to be less aggressive about refusing to blame the suspect,
28 we should delta test to be sure the new heuristic acts as indented. 80 we should delta test to be sure the new heuristic acts as indented.
29 81
30 When the actual minimum min_distance is zero, we return the log-domain 82 When the actual minimum min_distance is zero, we return the log-domain
31 value 0 (aka normal-domain value of 1). When the suspect has no files 83 value 0 (aka normal-domain value of 1). When the suspect has no files
32 or the actual minimum min_distance is greater than the ``maximum``, 84 or the actual minimum min_distance is greater than the ``maximum``,
33 we return the log-domain value -inf (aka normal-domain value of 0). In 85 we return the log-domain value -inf (aka normal-domain value of 0). In
34 between we scale the normal-domain values linearly, which means the 86 between we scale the normal-domain values linearly, which means the
35 log-domain values are scaled exponentially. 87 log-domain values are scaled exponentially.
36 """ 88 """
37 def __init__(self, maximum=None): 89 def __init__(self, get_repository, maximum=None):
38 """ 90 """
39 Args: 91 Args:
40 maximum (float): An upper bound on the min_distance to 92 maximum (float): An upper bound on the min_distance to
41 consider. This argument is optional and defaults to 93 consider. This argument is optional and defaults to
42 ``DEFAULT_MAXIMUM``. 94 ``DEFAULT_MAXIMUM``.
43 """ 95 """
96 self._get_repository = get_repository
44 if maximum is None: 97 if maximum is None:
45 maximum = DEFAULT_MAXIMUM 98 maximum = DEFAULT_MAXIMUM
46 self._maximum = maximum 99 self._maximum = maximum
chanli 2017/01/23 04:37:45 Nit: this can be simplified to self._maximum = max
Sharu Jiang 2017/01/23 21:44:46 They are not exactly the same. If maximum is 0, we
47 100
48 @property 101 @property
49 def name(self): 102 def name(self):
50 return 'MinDistance' 103 return 'MinDistance'
51 104
105 def DistanceInfoBetweenTouchedFileAndStacktrace(
106 self, revision, touched_file, stack_infos, crash_dependency):
107 """Gets ``DistanceInfo`` between touched lines and crashed lines for a file.
108
109 Args:
chanli 2017/01/23 04:37:45 Nit: add docstring for revision
Sharu Jiang 2017/01/23 21:44:46 Done.
110 touched_file (FileChangeInfo): The file touched by a suspect.
111 stack_infos (list of StackInfos): Information about frames that contains
112 this touched file and the priorities of their callstacks.
113 crash_dependency (Dependency): The depedency of crashed revision.
114
115 Returns:
116 ``DistanceInfo`` object of touched file and stacktrace.
117 """
118 # TODO(katesonia) ``GetBlame`` is called for the same file everytime
119 # there is a suspect that touched it, which can be very expensive.
120 # The blame information can either be cached through repository (cached
121 # by memcache based on repo url, revision and file path), or this
122 # function can have a static in-memory cache to cache blame for touched
123 # files, however since blame information is big, it's not a good idea to
124 # keep it in memory.
125 repository = self._get_repository(crash_dependency.repo_url)
126 blame = repository.GetBlame(touched_file.new_path,
127 crash_dependency.revision)
chanli 2017/01/23 04:37:45 Why not use revision directly? And crash_dependenc
Sharu Jiang 2017/01/23 21:44:46 We also need ``crash_dependency.repo_url`` in the
128 if not blame:
129 logging.warning('Failed to get blame information for %s',
130 touched_file.new_path)
131 return None
132
133 # Distance of this file.
134 distance_info = DistanceInfo(float('inf'), None)
135 for region in blame:
136 if region.revision != revision:
137 continue
138
139 region_start = region.start
140 region_end = region_start + region.count - 1
141 for stack_info in stack_infos:
142 frame_start = stack_info.frame.crashed_line_numbers[0]
143 frame_end = stack_info.frame.crashed_line_numbers[-1]
144 distance = DistanceBetweenLineRanges((frame_start, frame_end),
145 (region_start, region_end))
146 distance_info.Update(distance, stack_info.frame)
147
148 return distance_info
149
52 def __call__(self, report): 150 def __call__(self, report):
53 """Returns the scaled min ``AnalysisInfo.min_distance`` across all files. 151 """Returns the scaled min ``DistanceInfo.distance`` across all files.
54 152
55 Args: 153 Args:
56 report (CrashReport): the crash report being analyzed. 154 report (CrashReport): the crash report being analyzed.
57 155
58 Returns: 156 Returns:
59 A function from ``Suspect`` to the minimum distance between (the code 157 A function from ``Suspect`` to the minimum distance between (the code
60 for) a stack frame in that suspect and the CL in that suspect, as a 158 for) a stack frame in that suspect and the CL in that suspect, as a
61 log-domain ``float``. 159 log-domain ``float``.
62 """ 160 """
63 def FeatureValueGivenReport(suspect): 161 def FeatureValueGivenReport(suspect, touched_file_to_stack_infos):
64 analyses = suspect.file_to_analysis_info 162 """Function mapping suspect related data to MinDistance FeatureValue.
65 if not analyses:
66 message = 'No AnalysisInfo for any file in suspect: %s' % str(suspect)
67 logging.warning(message)
68 return FeatureValue(self.name, lmath.LOG_ZERO, message, None)
69 163
70 min_distance = min(per_file_analysis.min_distance 164 Args:
71 for per_file_analysis in analyses.itervalues()) 165 suspect (Suspect): The suspected changelog and some meta information
166 about it.
167 touched_file_to_stack_infos(dict): Dict mapping ``FileChangeInfo`` to
168 a list of ``StackInfo``s representing all the frames that the suspect
169 touched.
170
171 Returns:
172 The ``FeatureValue`` of this feature.
173 """
174 if not touched_file_to_stack_infos:
175 FeatureValue(self.name, lmath.LOG_ZERO,
176 'No file got touched by the suspect.', None)
177
178 distance_info = DistanceInfo(float('inf'), None)
179 touched_file_to_distance_info = {}
180 for touched_file, stack_infos in touched_file_to_stack_infos.iteritems():
181 distance_info_per_file = (
182 self.DistanceInfoBetweenTouchedFileAndStacktrace(
183 suspect.changelog.revision, touched_file, stack_infos,
184 report.dependencies[suspect.dep_path]))
185 # It is possible that a changelog doesn't show in the blame of a file,
186 # in this case, treat the changelog as if it didn't change the file.
187 if not distance_info_per_file:
188 logging.warning('suspect\'s change cannot be blamed due to lack of'
189 'blame information for crashed file %s' %
190 touched_file.new_path)
191 continue
192
193 # This file change shouldn't be blamed because it's a blame cl of the
194 # crashed file.
chanli 2017/01/23 04:37:45 I don't quite understand this comment... IIUC, dis
Sharu Jiang 2017/01/23 21:44:46 That's right, wrong comment... corrected it.
195 if distance_info_per_file.IsInfinity():
196 continue
197
198 touched_file_to_distance_info[touched_file] = distance_info_per_file
199 distance_info.Update(distance_info_per_file.distance,
200 distance_info_per_file.frame)
72 201
73 return FeatureValue( 202 return FeatureValue(
74 name = self.name, 203 name = self.name,
75 value = LogLinearlyScaled(float(min_distance), float(self._maximum)), 204 value = LogLinearlyScaled(float(distance_info.distance),
76 reason = ('Minimum distance is %d' % min_distance), 205 float(self._maximum)),
77 changed_files = self._ChangedFiles(suspect), 206 reason = ('Minimum distance is %f' % float(distance_info.distance)),
78 ) 207 changed_files = MinDistanceFeature.ChangedFiles(
208 suspect, touched_file_to_distance_info, report.crashed_version))
79 209
80 return FeatureValueGivenReport 210 return FeatureValueGivenReport
81 211
82 def _ChangedFiles(self, suspect): 212 @staticmethod
83 """Get all the changed files causing this feature to blame this suspect. 213 def ChangedFiles(suspect, touched_file_to_distance_info, crashed_version):
214 """Get all the changed files causing this feature to blame this result.
84 215
85 Arg: 216 Arg:
86 suspect (Suspect): the suspect being blamed. 217 suspect (Suspect): the suspect being blamed.
218 touched_file_to_stack_infos (dict): Dict mapping file name to
219 ``StackInfo``s.
chanli 2017/01/23 04:37:45 DistanceInfo?
Sharu Jiang 2017/01/23 21:44:46 Oops. Done.
220 crashed_version (str): Crashed version.
87 221
88 Returns: 222 Returns:
89 List of ``ChangedFile`` objects sorted by frame index. For example: 223 List of ``ChangedFile`` objects sorted by frame index. For example:
90 224
91 [ChangedFile( 225 [ChangedFile(
92 file = 'render_frame_impl.cc', 226 file = 'render_frame_impl.cc',
93 blame_url = 'https://chr.com/../render_frame_impl.cc#1586', 227 blame_url = 'https://chr.com/../render_frame_impl.cc#1586',
94 reasons = ['Minimum distance (LOC) 1, frame #5'] 228 reasons = ['Minimum distance (LOC) 1, frame #5']
95 )] 229 )]
96 """ 230 """
97 index_to_changed_files = {} 231 frame_index_to_changed_files = {}
98 232
99 for file_path, analysis_info in suspect.file_to_analysis_info.iteritems(): 233 for touched_file, distance_info in (
100 file_name = file_path.split('/')[-1] 234 touched_file_to_distance_info.iteritems()):
101 frame = analysis_info.min_distance_frame 235 file_name = touched_file.new_path.split('/')[-1]
102 if frame is None: # pragma: no cover 236 if distance_info.frame is None: # pragma: no cover
103 logging.warning('Missing the min_distance_frame for %s' 237 logging.warning('Missing the min_distance_frame for file %s' %
104 % str(analysis_info)) 238 file_name)
105 continue 239 continue
106 240
107 # It is possible that a changelog doesn't show in the blame of a file, 241 frame_index_to_changed_files[distance_info.frame.index] = ChangedFile(
108 # in this case, treat the changelog as if it didn't change the file. 242 name=file_name,
109 if math.isinf(analysis_info.min_distance): # pragma: no cover 243 blame_url=distance_info.frame.BlameUrl(crashed_version),
110 logging.warning('min_distance is infinite for %s' % str(analysis_info)) 244 reasons=['Distance from touched lines and crashed lines is %d, in '
111 continue 245 'frame #%d' % (distance_info.distance,
246 distance_info.frame.index)])
112 247
113 index_to_changed_files[frame.index] = ChangedFile( 248 if not frame_index_to_changed_files: # pragma: no cover
114 name = file_name,
115 blame_url = frame.BlameUrl(suspect.changelog.revision),
116 reasons = ['Minimum distance (LOC) %d, frame #%d' % (
117 analysis_info.min_distance, frame.index)]
118 )
119
120 if not index_to_changed_files: # pragma: no cover
121 logging.warning('Found no changed files for suspect: %s', str(suspect)) 249 logging.warning('Found no changed files for suspect: %s', str(suspect))
250 return []
122 251
123 # Sort changed file by frame index. 252 # Sort changed file by frame index.
124 _, changed_files = zip(*sorted(index_to_changed_files.items(), 253 _, changed_files = zip(*sorted(frame_index_to_changed_files.iteritems(),
125 key=lambda x: x[0])) 254 key=lambda x: x[0]))
126 255
127 return list(changed_files) 256 return list(changed_files)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698