Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(543)

Side by Side Diff: tools/findit/findit_for_crash.py

Issue 468823003: [Findit] findit algorithms and a cluster-fuzz implementation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: addressed codereview Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import os
6 from threading import Lock, Thread
7
8 import blame
9 from common import utils
10 import component_dictionary
11 import crash_utils
12 import git_repository_parser
13 import match_set
14 import svn_repository_parser
15
16
17 LINE_CHANGE_PRIORITY = 1
18 FILE_CHANGE_PRIORITY = 2
19 _THIS_DIR = os.path.abspath(os.path.dirname(__file__))
20 PARSED_CONFIG = crash_utils.ParseURLsFromConfig(os.path.join(_THIS_DIR,
21 'config.ini'))
22
23
24 def GenerateMatchEntry(
25 matches, revision, cl, file_path, file_name, function, component_path,
stgao 2014/08/18 22:37:09 |revision| -> |revision_info|? |cl| -> |cl_revisio
jeun 2014/08/18 23:52:15 Done.
26 component_name, crashed_lines, stack_frame_indices, file_change_type,
27 repository_parser):
28 """Generates a match object and adds it to the match set.
29
30 Args:
31 matches: A matchset object, a map from CL to a match object.
32 revision: The revision information, as returned from a function
33 ParseChangelog in parser.
34 cl: The SVN revision number or git hash.
35 file_path: The path of the file.
36 file_name: The name of the file that this CL might be the culprit cl.
37 function: The function that caused an crash.
38 component_path: The path of the component this file is from.
39 component_name: The name of the component the file is from.
40 crashed_lines: The list of the lines in the file that caused the crash.
stgao 2014/08/18 22:37:08 Line number or line content?
jeun 2014/08/18 23:52:15 Changed to crashed_line_numbers
41 stack_frame_indices: The list of positions of this file within a stack.
42 file_change_type: Whether file is modified, added or deleted.
43 repository_parser: The parser object to parse line diff.
44 """
45 # Check if this CL should be ignored.
46 with matches.matches_lock:
47 if cl in matches.cls_to_ignore:
48 return
49
50 # If this CL is not already identified as suspected, create a new entry.
51 if cl not in matches.matches:
52 match = match_set.Match(revision, component_name)
53 message = revision['message']
54 # TODO(jeun): Don't hold lock while issuing http request.
55 match.ParseMessage(message, matches.codereview_api_url)
56
57 # If this match is a revert, add to the set of CLs to be ignored.
58 if match.is_reverted:
59 matches.cls_to_ignore.add(cl)
60
61 # If this match has info on what CL it is reverted from, add that CL.
62 if match.revert_of:
63 matches.cls_to_ignore.add(match.revert_of)
stgao 2014/08/18 22:37:07 What if |match.revert_of| is None? It is not true
jeun 2014/08/18 23:52:14 I check in line 62 if match.revert_of is None
64
65 # Return because we do not need to look at this CL anymore.
66 return
67
68 # Add this CL to the set of matches.
69 matches.matches[cl] = match
70
71 # Else, bring in the existing result.
72 else:
73 match = matches.matches[cl]
74
75 # Parse line diff information for this file.
76 (diff_url, changed_line_numbers, changed_line_contents) = (
77 repository_parser.ParseLineDiff(
78 file_path, component_path, file_change_type, cl))
79
80 # Ignore this match if the component is not supported for svn.
81 if not diff_url:
82 return
83
84 # Find the intersection between the lines that this file crashed on and
85 # the changed lines.
86 (line_intersection, stack_frame_index_intersection) = (
stgao 2014/08/18 22:37:07 line_number_intersection?
jeun 2014/08/18 23:52:14 Done.
87 crash_utils.Intersection(
88 crashed_lines, stack_frame_indices, changed_line_numbers))
89
90 # Find the minimum distance between the changed lines and crashed lines.
91 min_distance = crash_utils.FindMinLineDistance(crashed_lines,
92 changed_line_numbers)
93
94 # Check whether this CL changes the crashed lines or not.
95 if line_intersection:
96 priority = LINE_CHANGE_PRIORITY
97 else:
98 priority = FILE_CHANGE_PRIORITY
99
100 # Add the parsed information to the object.
101 with matches.matches_lock:
102 match.crashed_line_numbers.append(line_intersection)
103 match.changed_files.append(file_name)
104
105 # Update the min distance only if it is less than the current one.
106 if min_distance < match.min_distance:
107 match.min_distance = min_distance
108
109 # If this CL does not change the crashed line, all occurrence of this
110 # file in the stack has the same priority.
111 if not stack_frame_index_intersection:
112 stack_frame_index_intersection = stack_frame_indices
113 match.stack_frame_indices.append(stack_frame_index_intersection)
114 match.changed_file_urls.append(diff_url)
115
116 # Only record the highest rank of this CL.
117 if priority < match.rank:
stgao 2014/08/18 22:37:08 If priority and rank are the same, could we just u
jeun 2014/08/18 23:52:14 match.rank is same as the highest priority. I will
118 match.rank = priority
119 match.priorities.append(priority)
120 match.function_list.append(function)
121
122
123 def FindMatch(revisions, file_map, file_dict, component_path, component_name,
124 repository_parser, codereview_api_url):
125 """Finds a CL that modifies file in the stacktrace.
126
127 Args:
128 revisions: A dictionary mapping revision number to the CL information.
stgao 2014/08/18 22:37:09 revision_info_map?
jeun 2014/08/18 23:52:14 Done.
129 file_map: A dictionary mapping file to the revision that modifies it.
stgao 2014/08/18 22:37:08 file_to_revision_info?
jeun 2014/08/18 23:52:14 Done.
130 file_dict: A dictionary mapping file to its occurrence in stacktrace.
stgao 2014/08/18 22:37:08 file_to_frames?
jeun 2014/08/18 23:52:14 Renamed to file_to_crash_map
131 component_path: The path of the component to search for.
132 component_name: The name of the component to search for.
133 repository_parser: The parser object to parse the line diff.
134 codereview_api_url: A code review url to retrieve data from.
135
136 Returns:
137 Matches, a set of match objects.
138 """
139 matches = match_set.MatchSet(codereview_api_url)
140 threads = []
141
142 # For all files in stacktrace, if it is not header file, check against
143 # all CLs to see if any CL changes this file. If so, add to the set of
144 # suspected CLs.
145 for file_name in file_dict:
146 # Ignore header file.
147 if file_name.endswith('.h'):
148 continue
149
150 # If the file in the stacktrace is not changed in any commits, continue.
151 if file_name not in file_map:
152 continue
153
154 path_dict = file_dict.GetPathDic(file_name)
155 # There can be two files that are in different path but have same
156 # names, so also has to check path.
157 for path in path_dict:
158 crashed_lines = file_dict.GetCrashedLineNumbers(path)
159 stack_frame_nums = file_dict.GetCrashStackFrameindex(path)
160 function = file_dict.GetCrashFunction(path)
161
162 # Iterate through the revisions that this file is changed.
163 for (cl, file_change_type, file_path) in file_map[file_name]:
164 # If path of two files are different or if a commit deletes a file,
165 # continue.
166 if not crash_utils.GuessIfSameSubPath(path, file_path) or (
167 file_change_type == 'D'):
168 continue
169
170 revision = revisions[cl]
171
172 # Create a new thread for adding this CL.
173 match_thread = Thread(
174 target=GenerateMatchEntry,
175 args=[matches, revision, cl, file_path, file_name, function,
176 component_path, component_name, crashed_lines,
177 stack_frame_nums, file_change_type,
178 repository_parser])
179 threads.append(match_thread)
180 match_thread.start()
181
182 # Make sure all threads finish before returning.
183 for match_thread in threads:
184 match_thread.join()
185
186 # Remove CLs that are revert.
187 matches.RemoveRevertedCLs()
188
189 return matches
190
191
192 def FindMatchForComponent(component_path, file_dict, changelog,
193 callstack_priority, results, results_lock):
194 """Parses changelog and finds suspected CLs for a given component.
195
196 Args:
197 component_path: The path of component to look for the culprit CL.
198 file_dict: A dictionary mapping file to its occurrence in stackframe.
stgao 2014/08/18 22:37:08 file_to_frames? Same for other occurrences of thi
jeun 2014/08/18 23:52:15 Done.
199 changelog: The parsed changelog for this component.
200 callstack_priority: The priority of this call stack, 0 if from crash stack,
201 1 if from freed, 2 if from previously allocated.
202 results: A dictionary to store the result.
203 results_lock: A lock that guards results.
204 """
205 (repository_parser, component_name, revisions, file_to_revision_map) = \
206 changelog
207
208 # Find match for this component.
209 codereview_api_url = PARSED_CONFIG['codereview']['review_url']
210 component_result = FindMatch(
211 revisions, file_to_revision_map, file_dict, component_path,
212 component_name, repository_parser, codereview_api_url)
213 matches = component_result.matches
214
215 # For all the match results in a dictionary, add to the list so that it
216 # can be sorted.
217 with results_lock:
218 for cl in matches:
219 match = matches[cl]
220 results.append((callstack_priority, cl, match))
221
222
223 def FindMatchForCallstack(
224 callstack, components, component_to_changelog_map, results,
225 results_lock):
226 """Finds culprit cl for a stack within a stacktrace.
227
228 For each components to look for, create new thread that computes the matches
229 and join the results at the end.
230 Args:
stgao 2014/08/18 22:37:09 One empty line before this.
jeun 2014/08/18 23:52:14 Done.
231 callstack: A callstack in a stacktrace to find the result for.
232 components: A set of components to look for.
233 component_to_changelog_map: A map from component to its parsed changelog.
234 results: A list to aggregrate results from all stacktraces.
235 results_lock: A lock that guards results.
236 """
237 # Create component dictionary from the component and call stack.
238 component_dict = component_dictionary.ComponentDictionary(callstack,
239 components)
240 callstack_priority = callstack.priority
241
242 # Iterate through all components and create new thread for each component.
243 threads = []
244 for component_path in component_dict:
245 file_dict = component_dict.GetFileDict(component_path)
stgao 2014/08/18 22:37:08 Not clear for var name |file_dict|. Please rename
jeun 2014/08/18 23:52:14 Done.
246
247 if component_path not in component_to_changelog_map:
248 continue
stgao 2014/08/18 22:37:08 Why continue? Please add a comment.
jeun 2014/08/18 23:52:15 Done.
249 changelog = component_to_changelog_map[component_path]
250
251 t = Thread(
252 target=FindMatchForComponent,
253 args=[component_path, file_dict, changelog,
254 callstack_priority, results, results_lock])
255 threads.append(t)
256 t.start()
257
258 # Join the threads before returning.
259 for t in threads:
260 t.join()
261
262
263 def FindMatchForStacktrace(stacktrace, components, regression_dict):
stgao 2014/08/18 22:37:07 stack_trace?
jeun 2014/08/18 23:52:15 I will go consistent with the variable name in oth
264 """Finds the culprit CL for stacktrace.
265
266 The passed stacktrace is either from release build stacktrace
267 or debug build stacktrace.
268
269 Args:
270 stacktrace: A list of parsed stacks within a stacktrace.
271 components: A set of components to look for.
272 regression_dict: A dictionary mapping component path to its regression.
273
274 Returns:
275 A list of match results from all stacks.
276 """
277 # A list to aggregate results from the whole stacktrace.
stgao 2014/08/18 22:37:07 multiple call stacks in the whole stack trace?
jeun 2014/08/18 23:52:14 Done.
278 results = []
279 results_lock = Lock()
280
281 # Parse changelog for each of the components and store it.
282 svn_parser = svn_repository_parser.SVNParser(PARSED_CONFIG['svn'])
283 git_parser = git_repository_parser.GitParser(regression_dict,
284 PARSED_CONFIG['git'])
285
286 component_to_changelog_map = {}
287 for component_path in components:
288 component_object = regression_dict[component_path]
289 range_start = component_object['old_revision']
290 range_end = component_object['new_revision']
291 if range_start == '0':
292 continue
stgao 2014/08/18 22:37:09 Why continue? Should we just return a message "Reg
jeun 2014/08/18 23:52:15 We are continuing because this loop is going over
293
294 component_name = regression_dict[component_path]['name']
295
296 # Check if the regression is git hash or a revision number, and create
297 # appropriate parser object.
298 is_git = utils.IsGitHash(range_start)
299 if is_git:
300 repository_parser = git_parser
301 else:
302 repository_parser = svn_parser
303
304 # Parse revisions and changed files in regression.
305 (revisions, file_to_revision_map) = repository_parser.ParseChangelog(
306 component_path, range_start, range_end)
307
308 if not (revisions and file_to_revision_map):
309 continue
stgao 2014/08/18 22:37:07 Why continue? Please add a comment. For other co
jeun 2014/08/18 23:52:15 Done.
310
311 component_to_changelog_map[component_path] = (repository_parser,
312 component_name,
313 revisions,
314 file_to_revision_map)
315
316 # Create separate threads for each of the call stack in the stacktrace.
317 threads = []
318 for callstack in stacktrace.stack_list:
319 t = Thread(
320 target=FindMatchForCallstack,
321 args=[callstack, components, component_to_changelog_map,
322 results, results_lock])
323 threads.append(t)
324 t.start()
325
326 # Join the threads before returning.
327 for t in threads:
328 t.join()
329
330 return results
331
332
333 def SortAndFilterMatches(matches):
334 """Filters the list of potential culprit CLs to remove noise.
335
336 Args:
337 matches: A list containing match results.
338
339 Returns:
340 Filtered match results.
341 """
342 new_matches = []
343 line_changed = False
344 in_crash_state = False
stgao 2014/08/18 22:37:09 What's crash state? Do you need this concept? If s
jeun 2014/08/18 23:52:14 Added additional parameter to the function and exp
345 highest_priority_stack = crash_utils.INFINITY
346
347 # Sort the matches in specific order.
stgao 2014/08/18 22:37:09 Please add more details on what order it is and wh
jeun 2014/08/18 23:52:15 Done.
348 matches.sort(key=lambda (stack_index, cl, match):
349 (match.rank, stack_index,
350 crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
351 match.priorities),
352 -len(match.changed_files), match.min_distance))
353
354 # Iterate through the matches to find out what results are significant.
355 for stack_index, cl, match in matches:
356 # Check if the current match changes crashed line.
357 is_line_change = (match.rank == LINE_CHANGE_PRIORITY)
358
359 # Check which thread this match is from.
360 current_stack = stack_index
361 if current_stack < highest_priority_stack:
stgao 2014/08/18 22:37:07 Will two stacks have the same priority? If so, wha
jeun 2014/08/18 23:52:15 Added more detailed comments. Two stacks having sa
362 highest_priority_stack = current_stack
363
364 # Check if current match changes a file that occurs in crash state.
365 flattened_stack_frame_indices = [i for sl in match.stack_frame_indices
stgao 2014/08/18 22:37:09 Use full name for |sl|.
jeun 2014/08/18 23:52:15 Done.
366 for i in sl]
367 current_in_crash_state = min(flattened_stack_frame_indices) < 5
368
369 # This match and anything lower than this should be ignored if:
370 # - Current match does not change crashed lines but there are matches
371 # that do so.
372 # - Current match is not in crash state but there are matches in it.
373 # - There are other matches that came from higher priority stack.
374 if (line_changed and not is_line_change) or (
375 in_crash_state and not current_in_crash_state) or (
376 current_stack > highest_priority_stack):
377 break
378
379 # Update the variables.
380 if is_line_change:
381 line_changed = True
382 if current_in_crash_state:
383 in_crash_state = True
384
385 # Add current match to the filtered result.
386 new_matches.append((stack_index, cl, match))
387
388 return new_matches
389
390
391 def GenerateReasonForMatches(matches):
392 """Generates a reason that a match (CL) is a culprit cl.
393
394 Args:
395 matches: A list of match objects.
396 """
397 # Iterate through the matches in the list.
398 for i, _, match in matches:
399 reason = []
400
401 # Zip the files in the match by the reason they are suspected
402 # (how the file is modified) and sort it in specific order.
stgao 2014/08/18 22:37:08 What order it is and why?
jeun 2014/08/18 23:52:14 Done.
403 match_by_priority = zip(
404 match.priorities, match.crashed_line_numbers, match.changed_files,
405 match.changed_file_urls)
406
407 match_by_priority.sort(
408 key=lambda (priority, crashed_line_number, file_name, file_url):
409 (priority, len(file_name)))
410
411 # Iterate through the sorted match.
412 for i in range(len(match_by_priority)):
413 (priority, crashed_line_number, file_name, file_url) = \
414 match_by_priority[i]
415
416 # If the file in the match is a line change, append a explanation.
417 if priority == LINE_CHANGE_PRIORITY:
418 reason.append(
419 ' Line %s of file %s which caused the crash according '
stgao 2014/08/18 22:37:09 Why blank space at the beginning? Formatting like
jeun 2014/08/18 23:52:14 Done.
420 'to the stacktrace, is changed in this cl.\n' %
421 (crash_utils.PrettifyList(crashed_line_number),
422 crash_utils.PrettifyFiles([(file_name, file_url)])))
423
424 else:
425 # Get all the files that are not line change.
426 rest_of_the_files = match_by_priority[i:]
427
428 # Take care of single/multiple files in string.
429 if len(rest_of_the_files) == 1:
430 file_string = ' File %s is changed in this cl.\n'
stgao 2014/08/18 22:37:07 Same as above.
jeun 2014/08/18 23:52:15 Done.
431 else:
432 file_string = ' Files %s are changed in this cl.\n'
433
434 # Create a list of file name and its url, and prettify the list.
435 file_name_with_url = [(file_name, file_url)
436 for (_, _, file_name, file_url)
437 in rest_of_the_files]
438 pretty_file_name_url = crash_utils.PrettifyFiles(file_name_with_url)
439
440 # Add the reason, break because we took care of the rest of the files.
441 reason.append(file_string % pretty_file_name_url)
442 break
443
444 # Set the reason as string.
445 match.reason = ''.join(reason)
446
447
448 def CombineMatches(matches):
stgao 2014/08/18 22:37:08 CombineReasonsForMatches?
jeun 2014/08/18 23:52:15 This is not only combining reasons, but also combi
449 """Combine possible duplicates in matches.
450
451 Args:
452 matches: A list of matches object, along with its callstack priority and
453 CL it is from.
454 Returns:
455 A combined list of matches.
456 """
457 combined_matches = []
458
459 for stack_index, cl, match in matches:
460 found_match = None
461
462 # Iterate through the list of combined matches.
463 for _, cl_combined, match_combined in combined_matches:
464 # Check for if current match is already higher up in the result.
465 if cl == cl_combined:
466 found_match = match_combined
stgao 2014/08/18 22:37:09 add a break?
jeun 2014/08/18 23:52:14 Done.
467
468 # If current match is not already in, add it to the list of matches.
469 if not found_match:
470 combined_matches.append((stack_index, cl, match))
471 continue
472
473 # Combine the reason if the current match is already in there.
474 found_match.reason += match.reason
475
476 return combined_matches
477
478
479 def FilterAndGenerateReasonForMatches(result):
480 """A wrapper function.
481
482 It generates reasons for the matches and returns string representation
483 of filtered results.
484
485 Args:
486 result: A list of match objects.
487
488 Returns:
489 A string representation of filtered results.
490 """
491 new_result = SortAndFilterMatches(result)
492 GenerateReasonForMatches(new_result)
493 combined_matches = CombineMatches(new_result)
494 return crash_utils.MatchListToResultList(combined_matches)
495
496
497 def ParseCrashComponents(main_stack, top_n_frames=3):
stgao 2014/08/18 22:37:09 Why default is 3? Would you mind a comment and exp
jeun 2014/08/18 23:52:14 It is an arbitrary number (it is because clusterfu
498 """Parses the crashing component.
499
500 Args:
501 main_stack: Main stack from the stacktrace.
502 top_n_frames: A number of frames to regard as crashing frame.
503 Returns:
504 A set of components.
505 """
506 frame_list = main_stack.GetTopNFrames(top_n_frames)
507 components = set()
508
509 for frame in frame_list:
510 components.add(frame.component_path)
511
512 return components
513
514
515 def GenerateAndFilterBlameList(callstack, crash_revision_dict, regression_dict):
516 """A wrapper function.
517
518 Finds blame information for stack and returns string representation.
519
520 Args:
521 callstack: A callstack to find the blame information.
522 crash_revision_dict: A dictionary mapping component to its crash revision.
stgao 2014/08/18 22:37:08 component_to_revision_info?
jeun 2014/08/18 23:52:14 Done.
523 regression_dict: A dictionary mapping component to its regression.
524
525 Returns:
526 A list of blame results.
527 """
528 # Setup parser objects to use for parsing blame information.
529 svn_parser = svn_repository_parser.SVNParser(PARSED_CONFIG['svn'])
530 git_parser = git_repository_parser.GitParser(regression_dict,
531 PARSED_CONFIG['git'])
532 parsers = {}
533 parsers['svn'] = svn_parser
534 parsers['git'] = git_parser
535
536 # Create and generate the blame objects from the callstack.
537 blame_list = blame.BlameList()
538 blame_list.FindBlame(callstack, crash_revision_dict, regression_dict,
539 parsers)
540
541 blame_list.FilterAndSortBlameList()
542 return crash_utils.BlameListToResultList(blame_list)
543
544
545 def FindItForCrash(stacktrace_list,
546 callstack,
547 parsed_regression,
548 parsed_crash_revision):
549 """Finds the culprit CL from the list of stacktrace.
550
551 Args:
552 stacktrace_list: A list of stacktraces to look for, in the order of
553 decreasing significance.
554 callstack: A callstack object to show blame information for, if there are
555 no results for all stacktraces in the stacktrace_list.
556 parsed_regression: A parsed regression information as a result of parsing
stgao 2014/08/18 22:37:08 component_to_regression_info?
jeun 2014/08/18 23:52:14 Done.
557 DEPS file.
558 parsed_crash_revision: A parsed crash revision information.
stgao 2014/08/18 22:37:09 Prefix "parsed" is not needed. Same for other occ
jeun 2014/08/18 23:52:14 Done.
559
560 Returns:
561 A list of result objects, with the message how the result is created.
562 """
563 # If regression information is not available, return blame information.
564 if not parsed_regression:
565 return_message = (
566 'Regression information is not available. The result is '
567 'the blame information.')
568 result = GenerateAndFilterBlameList(callstack, parsed_crash_revision,
569 parsed_regression)
570 return (return_message, result)
571
572 for stacktrace in stacktrace_list:
573 # Check the next stacktrace if current one is empty.
574 if not stacktrace.stack_list:
575 continue
576
577 # Get the crash stack for this stacktrace, and extract crashing components
578 # from it.
579 main_stack = stacktrace.GetCrashStack()
580 components = ParseCrashComponents(main_stack)
stgao 2014/08/18 22:37:08 Why do we handle the first 3 frames only? Add an e
jeun 2014/08/18 23:52:15 Done.
581
582 result_for_stacktrace = FindMatchForStacktrace(stacktrace, components,
583 parsed_regression)
584
585 # If the result is empty, check the next stacktrace. Else, return the
586 # filtered result.
587 if not result_for_stacktrace:
588 continue
589
590 return_message = (
591 'The result is a list of CLs that change the crashed files.')
stgao 2014/08/18 22:37:08 Also add "Wrong result? Please email findit-for-me
jeun 2014/08/18 23:52:15 I will add it after discussing with Abhishek/Marty
592 result = FilterAndGenerateReasonForMatches(result_for_stacktrace)
593 return (return_message, result)
594
595 # If not match is found, return the blame information for the input
stgao 2014/08/18 22:37:08 "not" -> "no"?
jeun 2014/08/18 23:52:14 Done.
596 # callstack.
597 return_message = (
598 'There are no CLs that change the crashed files. The result is the '
599 'blame information.')
600 result = GenerateAndFilterBlameList(callstack, parsed_crash_revision,
601 parsed_regression)
602 return (return_message, result)
603
OLDNEW
« tools/findit/findit_for_clusterfuzz.py ('K') | « tools/findit/findit_for_clusterfuzz.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698