Chromium Code Reviews

Side by Side Diff: tools/findit/findit_for_crash.py

Issue 468823003: [Findit] findit algorithms and a cluster-fuzz implementation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addressed codereview. Created 6 years, 4 months ago
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import os
6 from threading import Lock, Thread
7
8 import blame
9 from common import utils
10 import component_dictionary
11 import crash_utils
12 import git_repository_parser
13 import match_set
14 import svn_repository_parser
15
16
17 LINE_CHANGE_PRIORITY = 1
18 FILE_CHANGE_PRIORITY = 2
19 _THIS_DIR = os.path.abspath(os.path.dirname(__file__))
20 CONFIG = crash_utils.ParseURLsFromConfig(os.path.join(_THIS_DIR,
21 'config.ini'))
22
23
24 def GenerateMatchEntry(
25 matches, revision_info, revision_number, file_path, function,
26 component_path, component_name, crashed_line_numbers, stack_frame_indices,
27 file_change_type, repository_parser):
28 """Generates a match object and adds it to the match set.
29
30 Args:
31 matches: A matchset object, a map from CL to a match object.
32 revision_info: The revision information, a map from fields (message,
33 changed files, etc) to their values.
34 revision_number: The SVN revision number or git hash.
35 file_path: The path of the file.
36 function: The function that caused a crash.
37 component_path: The path of the component this file is from.
38 component_name: The name of the component the file is from.
39 crashed_line_numbers: The list of the lines in the file that caused
40 the crash.
41 stack_frame_indices: The list of positions of this file within a stack.
42 file_change_type: Whether the file is modified, added or deleted.
43 repository_parser: The parser object to parse the line diff.
44 """
45 # Check if this CL should be ignored.
46 with matches.matches_lock:
47 if revision_number in matches.cls_to_ignore:
48 return
49
50 # If this CL is not already identified as suspected, create a new entry.
51 if revision_number not in matches.matches:
52 match = match_set.Match(revision_info, component_name)
53 message = revision_info['message']
54 # TODO(jeun): Don't hold lock while issuing http request.
55 match.ParseMessage(message, matches.codereview_api_url)
56
57 # If this match is a revert, add to the set of CLs to be ignored.
58 if match.is_revert:
59 matches.cls_to_ignore.add(revision_number)
60
61 # If this match has info on what CL it is reverted from, add that CL.
62 if match.revert_of:
63 matches.cls_to_ignore.add(match.revert_of)
64
65 return
66
67 matches.matches[revision_number] = match
68
69 else:
70 match = matches.matches[revision_number]
71
72 (diff_url, changed_line_numbers, changed_line_contents) = (
73 repository_parser.ParseLineDiff(
74 file_path, component_path, file_change_type, revision_number))
75
76 # Ignore this match if the component is not supported for SVN.
77 if not diff_url:
78 return
79
80 # Find the intersection between the lines that this file crashed on and
81 # the changed lines.
82 (line_number_intersection, stack_frame_index_intersection) = (
83 crash_utils.Intersection(
84 crashed_line_numbers, stack_frame_indices, changed_line_numbers))
85
86 # Find the minimum distance between the changed lines and crashed lines.
87 min_distance = crash_utils.FindMinLineDistance(crashed_line_numbers,
88 changed_line_numbers)
89
90 # Check whether this CL changes the crashed lines or not.
91 if line_number_intersection:
92 priority = LINE_CHANGE_PRIORITY
93 else:
94 priority = FILE_CHANGE_PRIORITY
95
96 # Add the parsed information to the object.
97 with matches.matches_lock:
98 match.crashed_line_numbers.append(line_number_intersection)
99
100 file_name = file_path.split('/')[-1]
101 match.changed_files.append(file_name)
102
103 # Update the min distance only if it is less than the current one.
104 if min_distance < match.min_distance:
105 match.min_distance = min_distance
106
107 # If this CL does not change the crashed line, all occurrences of this
108 # file in the stack have the same priority.
109 if not stack_frame_index_intersection:
110 stack_frame_index_intersection = stack_frame_indices
111 match.stack_frame_indices.append(stack_frame_index_intersection)
112 match.changed_file_urls.append(diff_url)
113 match.priorities.append(priority)
114 match.function_list.append(function)
115
116
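# Illustrative sketch (assumption): the helper below only approximates what
# crash_utils.FindMinLineDistance is expected to compute for the ranking in
# GenerateMatchEntry above -- the smallest gap between any crashed line and
# any changed line. The real implementation lives in crash_utils and may
# differ in detail.
def _FindMinLineDistanceSketch(crashed_line_numbers, changed_line_numbers):
  distances = [abs(crashed - changed)
               for crashed in crashed_line_numbers
               for changed in changed_line_numbers]
  return min(distances) if distances else float('inf')

# Example (hypothetical values): crashed lines [120, 121] against changed
# lines [118, 300] give a minimum distance of 2, so such a CL ranks ahead of
# one whose nearest change is hundreds of lines away from the crash.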
117 def FindMatch(revisions_info_map, file_to_revision_info, file_to_crash_info,
118 component_path, component_name, repository_parser,
119 codereview_api_url):
120 """Finds a CL that modifies file in the stacktrace.
121
122 Args:
123 revisions_info_map: A dictionary mapping revision number to the CL
124 information.
125 file_to_revision_info: A dictionary mapping a file to the revisions that
126 modify it.
127 file_to_crash_info: A dictionary mapping a file to its occurrences in
128 the stacktrace.
129 component_path: The path of the component to search for.
130 component_name: The name of the component to search for.
131 repository_parser: The parser object to parse the line diff.
132 codereview_api_url: A code review url to retrieve data from.
133
134 Returns:
135 Matches, a set of match objects.
136 """
137 matches = match_set.MatchSet(codereview_api_url)
138 threads = []
139
140 # Iterate through the crashed files in the stacktrace.
141 for crashed_file_path in file_to_crash_info:
142 # Ignore header files.
143 if crashed_file_path.endswith('.h'):
144 continue
145
146 # If the file in the stacktrace is not changed in any commits, continue.
147 for changed_file_path in file_to_revision_info:
148 changed_file_name = changed_file_path.split('/')[-1]
149 crashed_file_name = crashed_file_path.split('/')[-1]
150
151 if changed_file_name != crashed_file_name:
152 continue
153
154 if not crash_utils.GuessIfSameSubPath(
155 changed_file_path, crashed_file_path):
156 continue
157
158 crashed_line_numbers = file_to_crash_info.GetCrashedLineNumbers(
159 crashed_file_path)
160 stack_frame_nums = file_to_crash_info.GetCrashStackFrameIndices(
161 crashed_file_path)
162 functions = file_to_crash_info.GetCrashFunctions(crashed_file_path)
163
164 # Iterate through the CLs in which this file path is changed.
165 for (cl, file_change_type) in file_to_revision_info[changed_file_path]:
166 # If the file change is a deletion, ignore this CL.
167 if file_change_type == 'D':
168 continue
169
170 revision = revisions_info_map[cl]
171
172 match_thread = Thread(
173 target=GenerateMatchEntry,
174 args=[matches, revision, cl, changed_file_path, functions,
175 component_path, component_name, crashed_line_numbers,
176 stack_frame_nums, file_change_type,
177 repository_parser])
178 threads.append(match_thread)
179 match_thread.start()
180
181 for match_thread in threads:
182 match_thread.join()
183
184 matches.RemoveRevertedCLs()
185
186 return matches
187
188
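# Illustrative data shapes (hypothetical values) inferred from how FindMatch
# iterates over its arguments; the real mappings are built by the changelog
# parsers, and the path and revision numbers below are made up.
example_file_to_revision_info = {
    'content/browser/foo.cc': [(289123, 'M'), (289200, 'D')],
}
# Each value is a list of (revision, change_type) pairs. Entries whose change
# type is 'D' (deleted) are skipped; the remaining revisions are looked up in
# revisions_info_map and handed to GenerateMatchEntry on a worker thread.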
189 def FindMatchForComponent(component_path, file_to_crash_info, changelog,
190 callstack_priority, results, results_lock):
191 """Parses changelog and finds suspected CLs for a given component.
192
193 Args:
194 component_path: The path of the component in which to look for the culprit CL.
195 file_to_crash_info: A dictionary mapping a file to its occurrences in
196 the stack frames.
197 changelog: The parsed changelog for this component.
198 callstack_priority: The priority of this call stack: 0 if from the crash
199 stack, 1 if from the freed stack, 2 if from the previously allocated stack.
200 results: A dictionary to store the result.
201 results_lock: A lock that guards results.
202 """
203 (repository_parser, component_name, revisions, file_to_revision_map) = \
204 changelog
205
206 # Find match for this component.
207 codereview_api_url = CONFIG['codereview']['review_url']
208 component_result = FindMatch(
209 revisions, file_to_revision_map, file_to_crash_info, component_path,
210 component_name, repository_parser, codereview_api_url)
211 matches = component_result.matches
212
213 # Add all the match results in the dictionary to the list so that they
214 # can be sorted.
215 with results_lock:
216 for cl in matches:
217 match = matches[cl]
218 results.append((callstack_priority, cl, match))
219
220
221 def FindMatchForCallstack(
222 callstack, components, component_to_changelog_map, results,
223 results_lock):
224 """Finds culprit cl for a stack within a stacktrace.
225
226 For each component to look for, create a new thread that computes the matches,
227 and join the results at the end.
228
229 Args:
230 callstack: A callstack in a stacktrace to find the result for.
231 components: A set of components to look for.
232 component_to_changelog_map: A map from component to its parsed changelog.
233 results: A list to aggregate results from all stacktraces.
234 results_lock: A lock that guards results.
235 """
236 # Create a component dictionary from the components and the call stack.
237 component_dict = component_dictionary.ComponentDictionary(callstack,
238 components)
239 callstack_priority = callstack.priority
240
241 # Iterate through all components and create a new thread for each component.
242 threads = []
243 for component_path in component_dict:
244 # If the component to consider in this callstack is not in the parsed list
245 # of components, ignore this one.
246 if component_path not in component_to_changelog_map:
247 continue
248
249 changelog = component_to_changelog_map[component_path]
250 file_to_crash_info = component_dict.GetFileDict(component_path)
251 t = Thread(
252 target=FindMatchForComponent,
253 args=[component_path, file_to_crash_info, changelog,
254 callstack_priority, results, results_lock])
255 threads.append(t)
256 t.start()
257
258 for t in threads:
259 t.join()
260
261
262 def FindMatchForStacktrace(stacktrace, components,
263 component_to_regression_dict):
264 """Finds the culprit CL for stacktrace.
265
266 The passed stacktrace is from either a release build
267 or a debug build.
268
269 Args:
270 stacktrace: A list of parsed stacks within a stacktrace.
271 components: A set of components to look for.
272 component_to_regression_dict: A dictionary mapping component path to
273 its regression.
274
275 Returns:
276 A list of match results from all stacks.
277 """
278 # A list to aggregate results from all the callstacks in the stacktrace.
279 results = []
280 results_lock = Lock()
281
282 # Setup parsers.
283 svn_parser = svn_repository_parser.SVNParser(CONFIG['svn'])
284 git_parser = git_repository_parser.GitParser(component_to_regression_dict,
285 CONFIG['git'])
286
287 # Create a cache of parsed revisions.
288 component_to_changelog_map = {}
289 for component_path in components:
290 component_object = component_to_regression_dict[component_path]
291 range_start = component_object['old_revision']
292 range_end = component_object['new_revision']
293
294 # If range start is 0, the range is too large and the crash was
295 # introduced before the archived build, so ignore this case.
296 if range_start == '0':
297 continue
298
299 component_name = component_to_regression_dict[component_path]['name']
300
301 is_git = utils.IsGitHash(range_start)
302 if is_git:
303 repository_parser = git_parser
304 else:
305 repository_parser = svn_parser
306
307 (revisions, file_to_revision_map) = repository_parser.ParseChangelog(
308 component_path, range_start, range_end)
309
310 # If the returned map from ParseChangelog is empty, we don't need to look
311 # further because either the parsing failed or the changelog is empty.
312 if not (revisions and file_to_revision_map):
313 continue
314
315 component_to_changelog_map[component_path] = (repository_parser,
316 component_name,
317 revisions,
318 file_to_revision_map)
319
320 # Create separate threads for each of the call stacks in the stacktrace.
321 threads = []
322 for callstack in stacktrace.stack_list:
323 t = Thread(
324 target=FindMatchForCallstack,
325 args=[callstack, components, component_to_changelog_map,
326 results, results_lock])
327 threads.append(t)
328 t.start()
329
330 for t in threads:
331 t.join()
332
333 return results
334
335
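# Illustrative sketch (assumption): utils.IsGitHash above is only assumed to
# distinguish a 40-character hex git hash from a numeric SVN revision; the
# real check may accept other forms (e.g. abbreviated hashes).
import re

def _IsGitHashSketch(revision):
  return bool(re.match(r'^[0-9a-fA-F]{40}$', str(revision)))

# Example: '364770' -> False (treated as an SVN revision, use svn_parser);
# a 40-character hex string -> True (treated as a git hash, use git_parser).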
336 def SortMatchesFunction(match_with_stack_priority):
337 """A function to sort the match triple.
338
339 Currently, it sorts the list by:
340 1) The highest priority file change in the CL (changing crashed line is
341 higher priority than just changing the file).
342 2) The callstack this match is computed from (crash stack, freed, allocation).
343 3) The minimum stack frame number of the changed file in the match.
344 4) The number of files this CL changes (the higher, the better).
345 5) The minimum distance between the lines that the CL changes and crashed
346 lines.
347
348 Args:
349 match_with_stack_priority: A match object, with the CL it is from and what
350 callstack it is from.
351
352 Returns:
353 A sort key.
354 """
355 (stack_priority, _, match) = match_with_stack_priority
356
357 return (min(match.priorities),
358 stack_priority,
359 crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
360 match.priorities),
361 -len(match.changed_files), match.min_distance)
362
363
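# Hypothetical illustration of the ordering produced by the key above (all
# values made up). A CL that changes a crashed line (priority 1) in the crash
# stack (stack priority 0) sorts ahead of one that merely touches the crashed
# file (priority 2), no matter how the remaining tie-breakers compare:
key_line_change = (1, 0, 3, -2, 0)
key_file_only = (2, 0, 1, -5, 40)
assert sorted([key_file_only, key_line_change]) == [key_line_change,
                                                    key_file_only]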
364 def SortAndFilterMatches(matches, num_important_frames=5):
365 """Filters the list of potential culprit CLs to remove noise.
366
367 Args:
368 matches: A list containing match results.
369 num_important_frames: The number of frames at the top of the stack to check
370 when filtering the results. A match with a file
371 that is in the top num_important_frames of the stacktrace
372 is regarded as more probable than others.
373
374 Returns:
375 Filtered match results.
376 """
377 new_matches = []
378 line_changed = False
379 is_important_frame = False
380 highest_priority_stack = crash_utils.INFINITY
381
382 matches.sort(key=SortMatchesFunction)
383
384 # Iterate through the matches to find out what results are significant.
385 for stack_priority, cl, match in matches:
386 # Check if the current match changes a crashed line.
387 is_line_change = (min(match.priorities) == LINE_CHANGE_PRIORITY)
388
389 # Check which stack this match is from, and find the highest-priority
390 # callstack up to this point.
391 current_stack = stack_priority
392 if current_stack < highest_priority_stack:
393 highest_priority_stack = current_stack
394
395 # Check if the current match changes a file that occurs in the crash state.
396 flattened_stack_frame_indices = [frame for frame_indices in
397 match.stack_frame_indices
398 for frame in frame_indices]
399 current_is_important = (
400 min(flattened_stack_frame_indices) < num_important_frames)
401
402 # This match and everything after it should be ignored if:
403 # - The current match does not change crashed lines but earlier matches
404 # do.
405 # - The current match is not in the crash state but earlier matches are.
406 # - Other matches came from a higher-priority stack.
407 if (line_changed and not is_line_change) or (
408 is_important_frame and not current_is_important) or (
409 current_stack > highest_priority_stack):
410 break
411
412 # Update the variables.
413 if is_line_change:
414 line_changed = True
415 if current_is_important:
416 is_important_frame = True
417
418 # Add current match to the filtered result.
419 new_matches.append((stack_priority, cl, match))
420
421 return new_matches
422
423
424 def GenerateReasonForMatches(matches):
425 """Generates a reason that a match (CL) is a culprit cl.
426
427 Args:
428 matches: A list of match objects.
429 """
430 # Iterate through the matches in the list.
431 for i, _, match in matches:
432 reason = []
433
434 # Zip the files in the match by the reason they are suspected
435 # (how the file is modified).
436 match_by_priority = zip(
437 match.priorities, match.crashed_line_numbers, match.changed_files,
438 match.changed_file_urls)
439
440 # Sort the zipped changed files in the match by their priority so that the
441 # changed lines come first in the reason.
442 match_by_priority.sort(
443 key=lambda (priority, crashed_line_number, file_name, url): priority)
444
445 # Iterate through the sorted list.
446 for i in range(len(match_by_priority)):
447 (priority, crashed_line_number, file_name, file_url) = \
448 match_by_priority[i]
449
450 # If the file in the match is a line change, append an explanation.
451 if priority == LINE_CHANGE_PRIORITY:
452 reason.append(
453 'Line %s of file %s which potentially caused the crash '
454 'according to the stacktrace, is changed in this cl.\n' %
455 (crash_utils.PrettifyList(crashed_line_number),
456 crash_utils.PrettifyFiles([(file_name, file_url)])))
457
458 else:
459 # Get all the files that are not line changes.
460 rest_of_the_files = match_by_priority[i:]
461
462 if len(rest_of_the_files) == 1:
463 file_string = 'File %s is changed in this cl.\n'
464 else:
465 file_string = 'Files %s are changed in this cl.\n'
466
467 # Create a list of file names and their URLs, and prettify the list.
468 file_name_with_url = [(file_name, file_url)
469 for (_, _, file_name, file_url)
470 in rest_of_the_files]
471 pretty_file_name_url = crash_utils.PrettifyFiles(file_name_with_url)
472
473 # Add the reason, then break because the rest of the files are handled here.
474 reason.append(file_string % pretty_file_name_url)
475 break
476
477 # Set the reason as a string.
478 match.reason = ''.join(reason)
479
480
481 def CombineMatches(matches):
482 """Combine possible duplicates in matches.
483
484 Args:
485 matches: A list of match objects, each with its callstack priority and
486 the CL it is from.
487 Returns:
488 A combined list of matches.
489 """
490 combined_matches = []
491
492 for stack_index, cl, match in matches:
493 found_match = None
494
495 # Iterate through the list of combined matches.
496 for _, cl_combined, match_combined in combined_matches:
497 # Check if the current CL is already higher up in the result.
498 if cl == cl_combined:
499 found_match = match_combined
500 break
501
502 # If the current match is not already present, add it to the list of matches.
503 if not found_match:
504 combined_matches.append((stack_index, cl, match))
505 continue
506
507 # Combine the reasons if the current match is already in the list.
508 found_match.reason += match.reason
509
510 return combined_matches
511
512
513 def FilterAndGenerateReasonForMatches(result):
514 """A wrapper function.
515
516 It generates reasons for the matches and returns a string representation
517 of the filtered results.
518
519 Args:
520 result: A list of match objects.
521
522 Returns:
523 A string representation of filtered results.
524 """
525 new_result = SortAndFilterMatches(result)
526 GenerateReasonForMatches(new_result)
527 combined_matches = CombineMatches(new_result)
528 return crash_utils.MatchListToResultList(combined_matches)
529
530
531 def ParseCrashComponents(main_stack, top_n_frames=3):
Martin Barbella 2014/08/19 19:52:12 I discussed this with Shuotao a bit and we think i
jeun 2014/08/19 21:37:43 Done.
532 """Parses the crashing component.
533
534 A crashing component is a component that the top_n_frames of the stacktrace
535 are from.
536
537 Args:
538 main_stack: Main stack from the stacktrace.
539 top_n_frames: The number of frames to regard as crashing frames.
540
541 Returns:
542 A set of components.
543 """
544 frame_list = main_stack.GetTopNFrames(top_n_frames)
545 components = set()
546
547 for frame in frame_list:
548 components.add(frame.component_path)
549
550 return components
551
552
553 def GenerateAndFilterBlameList(callstack, component_to_crash_revision_dict,
554 component_to_regression_dict):
555 """A wrapper function.
556
557 Finds blame information for the stack and returns a string representation.
558
559 Args:
560 callstack: A callstack to find the blame information.
561 component_to_crash_revision_dict: A dictionary mapping component to its
562 crash revision.
563 component_to_regression_dict: A dictionary mapping component to its
564 regression.
565
566 Returns:
567 A list of blame results.
568 """
569 # Setup parser objects to use for parsing blame information.
570 svn_parser = svn_repository_parser.SVNParser(CONFIG['svn'])
571 git_parser = git_repository_parser.GitParser(component_to_regression_dict,
572 CONFIG['git'])
573 parsers = {}
574 parsers['svn'] = svn_parser
575 parsers['git'] = git_parser
576
577 # Create and generate the blame objects from the callstack.
578 blame_list = blame.BlameList()
579 blame_list.FindBlame(callstack, component_to_crash_revision_dict,
580 component_to_regression_dict,
581 parsers)
582
583 blame_list.FilterAndSortBlameList()
584 return crash_utils.BlameListToResultList(blame_list)
585
586
587 def FindItForCrash(stacktrace_list,
588 callstack,
589 component_to_regression_dict,
590 component_to_crash_revision_dict):
591 """Finds the culprit CL from the list of stacktrace.
592
593 Args:
594 stacktrace_list: A list of stacktraces to look through, in the order of
595 decreasing significance.
596 callstack: A callstack object to show blame information for, if there are
597 no results for any stacktrace in the stacktrace_list.
598 component_to_regression_dict: Parsed regression information obtained as a
599 result of parsing the DEPS file.
600 component_to_crash_revision_dict: Parsed crash revision information.
601
602 Returns:
603 A list of result objects, with a message describing how the result was created.
604 """
605 # If regression information is not available, return blame information.
606 if not component_to_regression_dict:
607 return_message = (
608 'Regression information is not available. The result is '
609 'the blame information.')
610 result = GenerateAndFilterBlameList(callstack,
611 component_to_crash_revision_dict,
612 component_to_regression_dict)
613 return (return_message, result)
614
615 for stacktrace in stacktrace_list:
616 # Check the next stacktrace if current one is empty.
617 if not stacktrace.stack_list:
618 continue
619
620 # Get the crash stack for this stacktrace, and extract crashing components
621 # from it.
622 main_stack = stacktrace.GetCrashStack()
623 components = ParseCrashComponents(main_stack)
624
625 result_for_stacktrace = FindMatchForStacktrace(
626 stacktrace, components, component_to_regression_dict)
627
628 # If the result is empty, check the next stacktrace. Else, return the
629 # filtered result.
630 if not result_for_stacktrace:
631 continue
632
633 return_message = (
634 'The result is a list of CLs that change the crashed files.')
635 result = FilterAndGenerateReasonForMatches(result_for_stacktrace)
636 return (return_message, result)
637
638 # If no match is found, return the blame information for the input
639 # callstack.
640 return_message = (
641 'There are no CLs that change the crashed files. The result is the '
642 'blame information.')
643 result = GenerateAndFilterBlameList(
644 callstack, component_to_crash_revision_dict,
645 component_to_regression_dict)
646 return (return_message, result)
647
