Chromium Code Reviews

Side by Side Diff: tools/findit/findit_for_crash.py

Issue 468823003: [Findit] findit algorithms and a cluster-fuzz implementation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: addressed code review comments and changed the algorithm to look for file path rather than file name. Created 6 years, 4 months ago
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import os
6 from threading import Lock, Thread
7
8 import blame
9 from common import utils
10 import component_dictionary
11 import crash_utils
12 import git_repository_parser
13 import match_set
14 import svn_repository_parser
15
16
17 LINE_CHANGE_PRIORITY = 1
18 FILE_CHANGE_PRIORITY = 2
19 _THIS_DIR = os.path.abspath(os.path.dirname(__file__))
20 CONFIG = crash_utils.ParseURLsFromConfig(os.path.join(_THIS_DIR,
21 'config.ini'))
22
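For readers following the CONFIG lookups later in this file (CONFIG['codereview']['review_url'], CONFIG['svn'], CONFIG['git']), here is a hedged sketch of the structure that ParseURLsFromConfig presumably returns. The section and key names are taken from the usages below; the URL value is a placeholder, not the real endpoint, and the actual contents live in tools/findit/config.ini.

```python
# Hypothetical shape of CONFIG as it is consumed in this file; illustrative only.
CONFIG = {
    'codereview': {'review_url': 'https://codereview.example.com/api'},
    'svn': {},  # URL templates consumed by svn_repository_parser.SVNParser
    'git': {},  # URL templates consumed by git_repository_parser.GitParser
}
```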
23
24 def GenerateMatchEntry(
25 matches, revision_info, revision_number, file_path, function,
26 component_path, component_name, crashed_line_numbers, stack_frame_indices,
27 file_change_type, repository_parser):
28 """Generates a match object and adds it to the match set.
29
30 Args:
31 matches: A matchset object, a map from CL to a match object.
32 revision_info: The revision information, a map from fields (message,
33 changed files, etc.) to their values.
34 revision_number: The SVN revision number or git hash.
35 file_path: The path of the file.
36 function: The function that caused the crash.
37 component_path: The path of the component this file is from.
38 component_name: The name of the component the file is from.
39 crashed_line_numbers: The list of the lines in the file that caused
40 the crash.
41 stack_frame_indices: The list of positions of this file within a stack.
42 file_change_type: Whether the file is modified, added, or deleted.
43 repository_parser: The parser object to parse the line diff.
44 """
45 # Check if this CL should be ignored.
46 with matches.matches_lock:
47 if revision_number in matches.cls_to_ignore:
48 return
49
50 # If this CL is not already identified as suspected, create a new entry.
51 if revision_number not in matches.matches:
52 match = match_set.Match(revision_info, component_name)
53 message = revision_info['message']
54 # TODO(jeun): Don't hold lock while issuing http request.
55 match.ParseMessage(message, matches.codereview_api_url)
56
57 # If this match is a revert, add to the set of CLs to be ignored.
58 if match.is_reverted:
Martin Barbella 2014/08/19 16:11:33 s/is_reverted/is_revert/
jeun 2014/08/19 18:30:06 Done.
59 matches.cls_to_ignore.add(revision_number)
60
61 # If this match has info on what CL it is reverted from, add that CL.
62 if match.revert_of:
63 matches.cls_to_ignore.add(match.revert_of)
64
65 # Return because we do not need to look at this CL anymore.
66 return
67
68 # Add this CL to the set of matches.
69 matches.matches[revision_number] = match
70
71 # Else, bring in the existing result.
72 else:
73 match = matches.matches[revision_number]
74
75 # Parse line diff information for this file.
76 (diff_url, changed_line_numbers, changed_line_contents) = (
77 repository_parser.ParseLineDiff(
78 file_path, component_path, file_change_type, revision_number))
79
80 # Ignore this match if the component is not supported for svn.
Martin Barbella 2014/08/19 16:11:33 Is this correct? If so, is there more work that needs to be done here?
jeun 2014/08/19 18:30:06 Yes this is correct, for SVN we need to write a parser.
81 if not diff_url:
82 return
83
84 # Find the intersection between the lines that this file crashed on and
85 # the changed lines.
86 (line_number_intersection, stack_frame_index_intersection) = (
87 crash_utils.Intersection(
88 crashed_line_numbers, stack_frame_indices, changed_line_numbers))
89
90 # Find the minimum distance between the changed lines and crashed lines.
91 min_distance = crash_utils.FindMinLineDistance(crashed_line_numbers,
92 changed_line_numbers)
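crash_utils.Intersection and crash_utils.FindMinLineDistance are defined elsewhere in tools/findit and are not part of this diff. Purely as a sketch of what this step is computing (the helper names below are made up, and the real functions also thread the stack frame indices through), the idea is:

```python
# Illustrative only -- the real helpers live in crash_utils and may differ.
def _changed_crashed_lines(crashed_line_numbers, changed_line_numbers):
  # Lines that both appear in the stacktrace and were touched by the CL.
  return sorted(set(crashed_line_numbers) & set(changed_line_numbers))

def _min_line_distance(crashed_line_numbers, changed_line_numbers):
  # Smallest gap between any crashed line and any changed line.
  if not crashed_line_numbers or not changed_line_numbers:
    return float('inf')
  return min(abs(c - d) for c in crashed_line_numbers
             for d in changed_line_numbers)
```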
93
94 # Check whether this CL changes the crashed lines or not.
95 if line_number_intersection:
96 priority = LINE_CHANGE_PRIORITY
97 else:
98 priority = FILE_CHANGE_PRIORITY
99
100 # Add the parsed information to the object.
101 with matches.matches_lock:
102 match.crashed_line_numbers.append(line_number_intersection)
103
104 file_name = os.path.basename(file_path)
Martin Barbella 2014/08/19 16:11:33 Could you confirm that this works on Windows? The path separators are different there.
jeun 2014/08/19 18:30:06 Changed all occurrences to file_path.split('/')[-1].
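For context on the exchange above: the difference between os.path.basename and split('/') only shows up when a path uses backslashes, e.g. a Windows-style path processed on a POSIX machine, while repository paths here always use forward slashes. A small illustration (not part of the change):

```python
import ntpath
import posixpath

path = 'third_party/libfoo/bar.cc'
posixpath.basename(path)      # 'bar.cc'
ntpath.basename(path)         # 'bar.cc' -- ntpath splits on both '/' and '\\'
path.split('/')[-1]           # 'bar.cc' -- same result on every platform

win_path = 'third_party\\libfoo\\bar.cc'
posixpath.basename(win_path)  # 'third_party\\libfoo\\bar.cc' (no split on '\\')
win_path.split('/')[-1]       # 'third_party\\libfoo\\bar.cc' (same caveat)
```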
105 match.changed_files.append(file_name)
106
107 # Update the min distance only if it is less than the current one.
108 if min_distance < match.min_distance:
109 match.min_distance = min_distance
110
111 # If this CL does not change the crashed line, all occurrences of this
112 # file in the stack have the same priority.
113 if not stack_frame_index_intersection:
114 stack_frame_index_intersection = stack_frame_indices
115 match.stack_frame_indices.append(stack_frame_index_intersection)
116 match.changed_file_urls.append(diff_url)
117 match.priorities.append(priority)
118 match.function_list.append(function)
119
120
121 def FindMatch(revisions_info_map, file_to_revision_info, file_to_crash_info,
122 component_path, component_name, repository_parser,
123 codereview_api_url):
124 """Finds a CL that modifies file in the stacktrace.
125
126 Args:
127 revisions_info_map: A dictionary mapping revision number to the CL
128 information.
129 file_to_revision_info: A dictionary mapping a file to the revisions that
130 modify it.
131 file_to_crash_info: A dictionary mapping a file to its occurrences in
132 the stacktrace.
133 component_path: The path of the component to search for.
134 component_name: The name of the component to search for.
135 repository_parser: The parser object to parse the line diff.
136 codereview_api_url: A code review url to retrieve data from.
137
138 Returns:
139 Matches, a set of match objects.
140 """
141 matches = match_set.MatchSet(codereview_api_url)
142 threads = []
143
144 # For each file in the stacktrace, if it is not a header file, check against
145 # all CLs to see if any CL changes this file. If so, add it to the set of
146 # suspected CLs.
147 for crashed_file_path in file_to_crash_info:
148 # Ignore header files.
149 if crashed_file_path.endswith('.h'):
150 continue
151
152 # If the file in the stacktrace is not changed in any commits, continue.
153 for changed_file_path in file_to_revision_info:
154 changed_file_name = os.path.basename(changed_file_path)
155 crashed_file_name = os.path.basename(crashed_file_path)
156
157 # If the two file paths do not have the same file name, continue.
158 if changed_file_name != crashed_file_name:
159 continue
160
161 # If the paths of the two files are not the same, continue.
162 if not crash_utils.GuessIfSameSubPath(
163 changed_file_path, crashed_file_path):
164 continue
165
166 crashed_line_numbers = file_to_crash_info.GetCrashedLineNumbers(
167 crashed_file_path)
168 stack_frame_nums = file_to_crash_info.GetCrashStackFrameIndices(
169 crashed_file_path)
170 functions = file_to_crash_info.GetCrashFunctions(crashed_file_path)
171
172 # Iterate through the CLs in which this file path is changed.
173 for (cl, file_change_type) in file_to_revision_info[changed_file_path]:
174 # If the file change is a delete, ignore this CL.
175 if file_change_type == 'D':
Martin Barbella 2014/08/19 16:11:32 Should this be startswith('D'), or are other parts ignored?
jeun 2014/08/19 18:30:06 Yes the other parts are ignored when the check happens.
176 continue
177
178 revision = revisions_info_map[cl]
179
180 match_thread = Thread(
181 target=GenerateMatchEntry,
182 args=[matches, revision, cl, changed_file_path, functions,
183 component_path, component_name, crashed_line_numbers,
184 stack_frame_nums, file_change_type,
185 repository_parser])
186 threads.append(match_thread)
187 match_thread.start()
188
189 # Make sure all threads finish before returning.
190 for match_thread in threads:
191 match_thread.join()
192
193 # Remove CLs that are reverts.
194 matches.RemoveRevertedCLs()
195
196 return matches
197
198
199 def FindMatchForComponent(component_path, file_to_crash_info, changelog,
200 callstack_priority, results, results_lock):
201 """Parses changelog and finds suspected CLs for a given component.
202
203 Args:
204 component_path: The path of component to look for the culprit CL.
205 file_to_crash_info: A dictionary mapping file to its occurrence in
206 stackframe.
207 changelog: The parsed changelog for this component.
208 callstack_priority: The priority of this call stack: 0 if from the crash
209 stack, 1 if from the free stack, 2 if from the previously allocated stack.
210 results: A dictionary to store the result.
211 results_lock: A lock that guards results.
212 """
213 (repository_parser, component_name, revisions, file_to_revision_map) = \
214 changelog
215
216 # Find match for this component.
217 codereview_api_url = CONFIG['codereview']['review_url']
218 component_result = FindMatch(
219 revisions, file_to_revision_map, file_to_crash_info, component_path,
220 component_name, repository_parser, codereview_api_url)
221 matches = component_result.matches
222
223 # For all the match results in the dictionary, add them to the list so that
224 # they can be sorted.
225 with results_lock:
226 for cl in matches:
227 match = matches[cl]
228 results.append((callstack_priority, cl, match))
229
230
231 def FindMatchForCallstack(
232 callstack, components, component_to_changelog_map, results,
233 results_lock):
234 """Finds culprit cl for a stack within a stacktrace.
235
236 For each component to look for, create a new thread that computes the matches
237 and join the results at the end.
238
239 Args:
240 callstack: A callstack in a stacktrace to find the result for.
241 components: A set of components to look for.
242 component_to_changelog_map: A map from component to its parsed changelog.
243 results: A list to aggregate results from all stacktraces.
244 results_lock: A lock that guards results.
245 """
246 # Create component dictionary from the component and call stack.
247 component_dict = component_dictionary.ComponentDictionary(callstack,
248 components)
249 callstack_priority = callstack.priority
250
251 # Iterate through all components and create a new thread for each component.
252 threads = []
253 for component_path in component_dict:
254 # If the component to consider in this callstack is not in the parsed list
255 # of components, ignore this one.
256 if component_path not in component_to_changelog_map:
257 continue
258
259 changelog = component_to_changelog_map[component_path]
260 file_to_crash_info = component_dict.GetFileDict(component_path)
261 t = Thread(
262 target=FindMatchForComponent,
263 args=[component_path, file_to_crash_info, changelog,
264 callstack_priority, results, results_lock])
265 threads.append(t)
266 t.start()
267
268 # Join the threads before returning.
269 for t in threads:
270 t.join()
271
272
273 def FindMatchForStacktrace(stacktrace, components,
274 component_to_regression_dict):
275 """Finds the culprit CL for stacktrace.
276
277 The passed stacktrace is either from release build stacktrace
278 or debug build stacktrace.
279
280 Args:
281 stacktrace: A list of parsed stacks within a stacktrace.
282 components: A set of components to look for.
283 component_to_regression_dict: A dictionary mapping component path to
284 its regression.
285
286 Returns:
287 A list of match results from all stacks.
288 """
289 # A list to aggregate results from all the callstacks in the stacktrace.
290 results = []
291 results_lock = Lock()
292
293 # Parse changelog for each of the components and store it.
294 svn_parser = svn_repository_parser.SVNParser(CONFIG['svn'])
295 git_parser = git_repository_parser.GitParser(component_to_regression_dict,
296 CONFIG['git'])
297
298 component_to_changelog_map = {}
299 for component_path in components:
300 component_object = component_to_regression_dict[component_path]
301 range_start = component_object['old_revision']
302 range_end = component_object['new_revision']
303
304 # If range start is 0, it means regression is not reliable, so ignore.
Martin Barbella 2014/08/19 16:11:32 Again, 0 isn't about reliability. It just means th
jeun 2014/08/19 18:30:06 Done.
305 if range_start == '0':
306 continue
307
308 component_name = component_to_regression_dict[component_path]['name']
309
310 # Check if the regression is a git hash or a revision number, and create
311 # the appropriate parser object.
312 is_git = utils.IsGitHash(range_start)
313 if is_git:
314 repository_parser = git_parser
315 else:
316 repository_parser = svn_parser
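utils.IsGitHash is the project's own helper in common/utils and is not shown in this diff. Purely as an illustration of the distinction being drawn here (an SVN revision is a decimal number, while a full git hash is a 40-character hex string), a standalone check might look like the following sketch; it is not the actual implementation.

```python
import re

def _looks_like_git_hash(revision):
  # Full git object names are 40 hex characters; SVN revisions are decimal.
  return bool(re.match(r'^[0-9a-fA-F]{40}$', str(revision)))
```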
317
318 # Parse revisions and changed files in regression.
319 (revisions, file_to_revision_map) = repository_parser.ParseChangelog(
320 component_path, range_start, range_end)
321
322 # If the returned map from ParseChangelog is empty, we don't need to look
323 # further because either the parsing failed or the changelog is empty.
324 if not (revisions and file_to_revision_map):
325 continue
326
327 component_to_changelog_map[component_path] = (repository_parser,
328 component_name,
329 revisions,
330 file_to_revision_map)
331
332 # Create separate threads for each of the call stacks in the stacktrace.
333 threads = []
334 for callstack in stacktrace.stack_list:
335 t = Thread(
336 target=FindMatchForCallstack,
337 args=[callstack, components, component_to_changelog_map,
338 results, results_lock])
339 threads.append(t)
340 t.start()
341
342 # Join the threads before returning.
343 for t in threads:
344 t.join()
345
346 return results
347
348
349 def SortAndFilterMatches(matches, num_important_frames=5):
350 """Filters the list of potential culprit CLs to remove noise.
351
352 Args:
353 matches: A list containing match results.
354 num_important_frames: The number of frames at the top of the stack to check
355 when filtering the results. A match with a file
356 that is in the top num_important_frames of the stacktrace
357 is regarded as more probable than others.
358
359 Returns:
360 Filtered match results.
361 """
362 new_matches = []
363 line_changed = False
364 is_important_frame = False
365 highest_priority_stack = crash_utils.INFINITY
366
367 # Sort the matches in a specific order. They are sorted by
368 # 1) The highest priority file change in the CL (changing the crashed line is
369 # higher priority than just changing the file)
370 # 2) The callstack this match is computed from (crash stack, freed, allocation)
Martin Barbella 2014/08/19 16:11:33 Was this updated so that free/allocation stacks have lower priority?
jeun 2014/08/19 18:30:06 Yes it has been updated. I changed the name to stack_index.
371 # 3) The minimum stack frame number of the changed file in the match
372 # 4) The number of files this CL changes (the higher the better)
373 # 5) The minimum distance between the lines that the CL changes and the
374 # crashed lines
375 matches.sort(key=lambda (stack_index, cl, match):
Martin Barbella 2014/08/19 16:11:33 Using a lambda here is pretty ugly. Please just define a function.
jeun 2014/08/19 18:30:06 Done.
376 (min(match.priorities), stack_index,
377 crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
378 match.priorities),
379 -len(match.changed_files), match.min_distance))
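The review comment above asks for a named function instead of the tuple-unpacking lambda (which, incidentally, is Python 2-only syntax). A hedged sketch of an equivalent key function, mirroring the five criteria listed in the comment; the helper name is made up for illustration and is not what landed in the later patch set.

```python
def _match_sort_key(result):
  # result is a (stack_index, cl, match) tuple, as appended in
  # FindMatchForComponent.
  stack_index, _cl, match = result
  return (min(match.priorities),
          stack_index,
          crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
                                              match.priorities),
          -len(match.changed_files),
          match.min_distance)

# matches.sort(key=_match_sort_key)
```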
380
381 # Iterate through the matches to find out what results are significant.
382 for stack_index, cl, match in matches:
383 # Check if the current match changes the crashed line.
384 is_line_change = (min(match.priorities) == LINE_CHANGE_PRIORITY)
385
386 # Check which stack this match is from, and find the highest priority
387 # callstack up to this point.
388 current_stack = stack_index
389 if current_stack < highest_priority_stack:
390 highest_priority_stack = current_stack
391
392 # Check if the current match changes a file that occurs in the crash state.
393 flattened_stack_frame_indices = [frame for frame_indices in
394 match.stack_frame_indices
395 for frame in frame_indices]
396 current_is_important = (
397 min(flattened_stack_frame_indices) < num_important_frames)
398
399 # This match and anything lower than it should be ignored if:
400 # - The current match does not change crashed lines but there are matches
401 # that do so.
402 # - The current match is not in the crash state but there are matches that are.
403 # - There are other matches that came from a higher priority stack.
404 if (line_changed and not is_line_change) or (
405 is_important_frame and not current_is_important) or (
406 current_stack > highest_priority_stack):
407 break
408
409 # Update the variables.
410 if is_line_change:
411 line_changed = True
412 if current_is_important:
413 is_important_frame = True
414
415 # Add current match to the filtered result.
416 new_matches.append((stack_index, cl, match))
417
418 return new_matches
419
420
421 def GenerateReasonForMatches(matches):
422 """Generates a reason that a match (CL) is a culprit cl.
423
424 Args:
425 matches: A list of match objects.
426 """
427 # Iterate through the matches in the list.
428 for i, _, match in matches:
429 reason = []
430
431 # Zip the files in the match by the reason they are suspected
432 # (how the file is modified).
433 match_by_priority = zip(
434 match.priorities, match.crashed_line_numbers, match.changed_files,
435 match.changed_file_urls)
436
438 # Sort the zipped changed files in the match by their priority so that the
439 # changed lines come first in the reason.
439 match_by_priority.sort(
440 key=lambda (priority, crashed_line_number, file_name, url): priority)
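The same tuple-unpacking lambda pattern appears here; since the key is just the first element of each zipped tuple, operator.itemgetter(0) would be an equivalent, Python 3-compatible spelling. This is noted as an aside, not a change in this patch set.

```python
import operator

# Equivalent to: key=lambda (priority, crashed_line_number, file_name, url): priority
match_by_priority.sort(key=operator.itemgetter(0))
```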
441
442 # Iterate through the sorted match.
443 for i in range(len(match_by_priority)):
444 (priority, crashed_line_number, file_name, file_url) = \
445 match_by_priority[i]
446
447 # If the file in the match is a line change, append an explanation.
448 if priority == LINE_CHANGE_PRIORITY:
449 reason.append(
450 'Line %s of file %s which caused the crash according '
Martin Barbella 2014/08/19 16:11:33 I'm not sure about saying "caused the crash" here.
jeun 2014/08/19 18:30:06 Done.
451 'to the stacktrace, is changed in this cl.\n' %
452 (crash_utils.PrettifyList(crashed_line_number),
453 crash_utils.PrettifyFiles([(file_name, file_url)])))
454
455 else:
456 # Get all the files that are not line change.
457 rest_of_the_files = match_by_priority[i:]
458
459 # Take care of single/multiple files in string.
460 if len(rest_of_the_files) == 1:
461 file_string = 'File %s is changed in this cl.\n'
462 else:
463 file_string = 'Files %s are changed in this cl.\n'
464
465 # Create a list of file name and its url, and prettify the list.
466 file_name_with_url = [(file_name, file_url)
467 for (_, _, file_name, file_url)
468 in rest_of_the_files]
469 pretty_file_name_url = crash_utils.PrettifyFiles(file_name_with_url)
470
471 # Add the reason, break because we took care of the rest of the files.
472 reason.append(file_string % pretty_file_name_url)
473 break
474
475 # Set the reason as string.
476 match.reason = ''.join(reason)
477
478
479 def CombineMatches(matches):
480 """Combine possible duplicates in matches.
481
482 Args:
483 matches: A list of match objects, along with their callstack priorities and
484 the CLs they are from.
485 Returns:
486 A combined list of matches.
487 """
488 combined_matches = []
489
490 for stack_index, cl, match in matches:
491 found_match = None
492
493 # Iterate through the list of combined matches.
494 for _, cl_combined, match_combined in combined_matches:
495 # Check if the current match is already higher up in the result.
496 if cl == cl_combined:
497 found_match = match_combined
498 break
499
500 # If the current match is not already in the list, add it.
Martin Barbella 2014/08/19 16:11:33 In general, I think this file had a lot of unnecessary comments.
jeun 2014/08/19 18:30:06 I removed a lot of self-explanatory comments.
501 if not found_match:
502 combined_matches.append((stack_index, cl, match))
503 continue
504
505 # Combine the reasons if the current match is already in the list.
506 found_match.reason += match.reason
507
508 return combined_matches
509
510
511 def FilterAndGenerateReasonForMatches(result):
512 """A wrapper function.
513
514 It generates reasons for the matches and returns a string representation
515 of the filtered results.
516
517 Args:
518 result: A list of match objects.
519
520 Returns:
521 A string representation of filtered results.
522 """
523 new_result = SortAndFilterMatches(result)
524 GenerateReasonForMatches(new_result)
525 combined_matches = CombineMatches(new_result)
526 return crash_utils.MatchListToResultList(combined_matches)
527
528
529 def ParseCrashComponents(main_stack, top_n_frames=3):
530 """Parses the crashing component.
531
532 A crashing component is a component that the top_n_frames of the stacktrace
533 are from.
534
535 Args:
536 main_stack: Main stack from the stacktrace.
537 top_n_frames: The number of frames to regard as crashing frames.
538
539 Returns:
540 A set of components.
541 """
542 frame_list = main_stack.GetTopNFrames(top_n_frames)
543 components = set()
544
545 for frame in frame_list:
546 components.add(frame.component_path)
547
548 return components
549
550
551 def GenerateAndFilterBlameList(callstack, component_to_crash_revision_dict,
552 component_to_regression_dict):
553 """A wrapper function.
554
555 Finds blame information for the stack and returns a string representation.
556
557 Args:
558 callstack: A callstack to find the blame information.
559 component_to_crash_revision_dict: A dictionary mapping component to its
560 crash revision.
561 component_to_regression_dict: A dictionary mapping component to its
562 regression.
563
564 Returns:
565 A list of blame results.
566 """
567 # Set up parser objects to use for parsing blame information.
568 svn_parser = svn_repository_parser.SVNParser(CONFIG['svn'])
569 git_parser = git_repository_parser.GitParser(component_to_regression_dict,
570 CONFIG['git'])
571 parsers = {}
572 parsers['svn'] = svn_parser
573 parsers['git'] = git_parser
574
575 # Create and generate the blame objects from the callstack.
576 blame_list = blame.BlameList()
577 blame_list.FindBlame(callstack, component_to_crash_revision_dict,
578 component_to_regression_dict,
579 parsers)
580
581 blame_list.FilterAndSortBlameList()
582 return crash_utils.BlameListToResultList(blame_list)
583
584
585 def FindItForCrash(stacktrace_list,
586 callstack,
587 component_to_regression_dict,
588 component_to_crash_revision_dict):
589 """Finds the culprit CL from the list of stacktrace.
590
591 Args:
592 stacktrace_list: A list of stacktraces to look for, in the order of
593 decreasing significance.
594 callstack: A callstack object to show blame information for, if there are
595 no results for all stacktraces in the stacktrace_list.
596 component_to_regression_dict: Parsed regression information obtained
597 from parsing the DEPS file.
598 component_to_crash_revision_dict: Parsed crash revision information.
599
600 Returns:
601 A list of result objects, with a message describing how the result was created.
602 """
603 # If regression information is not available, return blame information.
604 if not component_to_regression_dict:
605 return_message = (
606 'Regression information is not available. The result is '
607 'the blame information.')
608 result = GenerateAndFilterBlameList(callstack,
609 component_to_crash_revision_dict,
610 component_to_regression_dict)
611 return (return_message, result)
612
613 for stacktrace in stacktrace_list:
614 # Check the next stacktrace if the current one is empty.
615 if not stacktrace.stack_list:
616 continue
617
618 # Get the crash stack for this stacktrace, and extract crashing components
619 # from it.
620 main_stack = stacktrace.GetCrashStack()
621 components = ParseCrashComponents(main_stack)
622
623 result_for_stacktrace = FindMatchForStacktrace(
624 stacktrace, components, component_to_regression_dict)
625
626 # If the result is empty, check the next stacktrace. Else, return the
627 # filtered result.
628 if not result_for_stacktrace:
629 continue
630
631 return_message = (
632 'The result is a list of CLs that change the crashed files.')
633 result = FilterAndGenerateReasonForMatches(result_for_stacktrace)
634 return (return_message, result)
635
636 # If no match is found, return the blame information for the input
637 # callstack.
638 return_message = (
639 'There are no CLs that change the crashed files. The result is the '
640 'blame information.')
641 result = GenerateAndFilterBlameList(
642 callstack, component_to_crash_revision_dict,
643 component_to_regression_dict)
644 return (return_message, result)
645