Chromium Code Reviews

Unified Diff: tools/findit/findit_for_crash.py

Issue 468823003: [Findit] findit algorithms and a cluster-fuzz implementation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addressed code review. Created 6 years, 4 months ago
Index: tools/findit/findit_for_crash.py
diff --git a/tools/findit/findit_for_crash.py b/tools/findit/findit_for_crash.py
new file mode 100644
index 0000000000000000000000000000000000000000..f63b9a20bb61691b6a47c9df3e02fbda8a048d57
--- /dev/null
+++ b/tools/findit/findit_for_crash.py
@@ -0,0 +1,601 @@
+# Copyright (c) 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import logging
+import os
+from threading import Lock, Thread
+
+import blame
+import component_dictionary
+import crash_utils
+import git_repository_parser
+import match_set
+import svn_repository_parser
+import utils
+
+
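+# Priority constants for ranking matches: a lower value means a stronger
+# signal, so a CL that touches the exact crashed line (LINE_CHANGE_PRIORITY)
+# outranks one that merely touches the crashed file (FILE_CHANGE_PRIORITY).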
+LINE_CHANGE_PRIORITY = 1
+FILE_CHANGE_PRIORITY = 2
+URL_MAP = crash_utils.ParseURLsFromConfig('urls.config')
stgao 2014/08/18 22:37:06 The logic to handle the absolute path of the confi
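+# Judging from how URL_MAP is consumed below, the config is assumed to
+# parse into a nested dictionary keyed by repository type, e.g.
+# (hypothetical layout):
+#   {'svn': {...}, 'git': {...}, 'codereview': {'review_url': ...}}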
+
+
+def GenerateMatchEntry(
+ matches, revisions, cl, file_path, file_name, function, component_path,
+ component_name, crashed_lines, stack_frame_indices, file_action,
+ repository_parser):
+ """Generates a match object and adds it to the match set.
+
+ Args:
+ matches: A matchset object, a map from CL to a match object.
+    revisions: A dictionary mapping a CL's revision number to its revision
+      information, as returned by ParseChangelog in the repository parser.
+ cl: The SVN revision number or git hash.
stgao 2014/08/18 22:37:06 What's the difference between cls in |revisions| a
+ file_path: The path of the file.
+    file_name: The name of the file for which this CL might be the culprit.
+    function: The function that caused the crash.
+ component_path: The path of the component this file is from.
+ component_name: The name of the component the file is from.
+ crashed_lines: The list of the lines in the file that caused the crash.
+ stack_frame_indices: The list of positions of this file within a stack.
+    file_action: Whether the file is modified, added or deleted.
stgao 2014/08/18 22:37:06 file_change_type?
+ repository_parser: The parser object to parse line diff.
+ """
+ # Check if this CL should be ignored.
+ with matches.matches_lock:
+ if cl in matches.cls_to_ignore:
+ return
+
+ # If this CL is not already identified as suspected, create a new entry.
+ if cl not in matches.matches:
+ revision = revisions[cl]
+ match = match_set.Match(revision, component_name)
+    message = revision['message']
+ # TODO(jeun): Don't hold lock while issuing http request.
+ match.ParseMessage(message, matches.codereview_api_url)
+
+ # If this match is a revert, add to the set of CLs to be ignored.
+ if match.is_reverted:
+ matches.cls_to_ignore.add(cl)
+
+ # If this match has info on what CL it is reverted from, add that CL.
+ if match.revert_of:
+ matches.cls_to_ignore.add(match.revert_of)
+
+ # Return because we do not need to look at this CL anymore.
+ return
+
+ # Add this CL to the set of matches.
+ matches.matches[cl] = match
+
+ # Else, bring in the existing result.
+ else:
+ match = matches.matches[cl]
+
+ # Parse line diff information for this file.
+ (diff_url, changed_line_numbers, changed_line_contents) = (
+ repository_parser.ParseLineDiff(
+ file_path, component_path, file_action, cl))
+
+  # Ignore this match if the component is not supported for SVN.
+ if not diff_url:
+ return
+
+ # Find the intersection between the lines that this file crashed on and
+ # the changed lines.
+ (line_intersection, stack_frame_index_intersection) = (
+ crash_utils.Intersection(
+ crashed_lines, stack_frame_indices, changed_line_numbers))
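+  # For example (hypothetical values): if crashed_lines is [15, 28] at
+  # stack_frame_indices [0, 3] and changed_line_numbers is [27, 28], the
+  # intersection would presumably be ([28], [3]): the CL touches crashed
+  # line 28, which occurs at stack frame 3.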
+
+ # Find the minimum distance between the changed lines and crashed lines.
+ min_distance = crash_utils.FindMinLineDistance(crashed_lines,
+ changed_line_numbers)
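+  # E.g. (hypothetical values): crashed_lines [15, 28] against
+  # changed_line_numbers [20, 21] would give a minimum distance of 5,
+  # from |15 - 20|.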
+
+ # Check whether this CL changes the crashed lines or not.
+ if line_intersection:
+ priority = LINE_CHANGE_PRIORITY
+ else:
+ priority = FILE_CHANGE_PRIORITY
+
+ # Add the parsed information to the object.
+ with matches.matches_lock:
+ match.line_of_crash.append(line_intersection)
+ match.files.append(file_name)
+
+ # Update the min distance only if it is less than the current one.
+ if min_distance < match.min_distance:
+ match.min_distance = min_distance
+
+    # If this CL does not change the crashed lines, all occurrences of this
+    # file in the stack have the same priority.
+ if not stack_frame_index_intersection:
+ stack_frame_index_intersection = stack_frame_indices
+ match.stack_frame_indices.append(stack_frame_index_intersection)
+ match.file_urls.append(diff_url)
+
+    # Only record the best rank (lowest priority value) of this CL.
+ if priority < match.rank:
+ match.rank = priority
+ match.priorities.append(priority)
+ match.function.append(function)
+
+
+def FindMatch(revisions, file_map, file_dict, component_path, component_name,
+ repository_parser, codereview_api_url):
+ """Finds a CL that modifies file in the stacktrace.
+
+ Args:
+ revisions: A dictionary mapping revision number to the CL information.
+    file_map: A dictionary mapping each file to the revisions that modify it.
+ file_dict: A dictionary mapping file to its occurrence in stacktrace.
+ component_path: The path of the component to search for.
+ component_name: The name of the component to search for.
+ repository_parser: The parser object to parse the line diff.
+ codereview_api_url: A code review url to retrieve data from.
+
+ Returns:
+ Matches, a set of match objects.
+ """
+ matches = match_set.MatchSet(codereview_api_url)
+ threads = []
+
+  # For each file in the stacktrace that is not a header file, check
+  # against all CLs to see if any CL changes this file. If so, add the CL
+  # to the set of suspected CLs.
+  for file_name in file_dict:
+    # Ignore header files.
+    if file_name.endswith('.h'):
+ continue
+
+ # If the file in the stacktrace is not changed in any commits, continue.
+ if file_name not in file_map:
+ continue
+
+ path_dict = file_dict.GetPathDic(file_name)
+    # Two files in different paths can have the same name, so the path has
+    # to be checked as well.
+ for path in path_dict:
+ crashed_lines = file_dict.GetCrashedLineNumbers(path)
+ stack_frame_nums = file_dict.GetCrashStackFrameindex(path)
+ function = file_dict.GetCrashFunction(path)
+
+      # Iterate through the revisions in which this file is changed.
+ for (cl, file_action, file_path) in file_map[file_name]:
+        # Skip if the paths of the two files differ or if the commit
+        # deletes the file.
+ if not crash_utils.GuessIfSameSubPath(path, file_path) or (
+ file_action == 'D'):
+ continue
+
+ # Create a new thread for adding this CL.
+ match_thread = Thread(
+ target=GenerateMatchEntry,
+ args=[matches, revisions, cl, file_path, file_name, function,
+ component_path, component_name, crashed_lines,
+ stack_frame_nums, file_action,
+ repository_parser])
+ threads.append(match_thread)
+ match_thread.start()
+
+ # Make sure all threads finish before returning.
+ for match_thread in threads:
+ match_thread.join()
+
+  # Remove CLs that are reverts.
+ matches.RemoveReverts()
+
+ return matches
+
+
+def FindMatchForComponent(component_path, file_dict, changelog,
+ callstack_priority, results, results_lock):
+ """Parses changelog and finds suspected CLs for a given component.
+
+ Args:
+ component_path: The path of component to look for the culprit CL.
+ file_dict: A dictionary mapping file to its occurrence in stackframe.
+ changelog: The parsed changelog for this component.
+    callstack_priority: The priority of this call stack: 0 if from the
+      crash stack, 1 if from the freed stack, 2 if from the previously
+      allocated stack.
+ results: A dictionary to store the result.
+ results_lock: A lock that guards results.
+ """
+ (repository_parser, component_name, revisions, file_to_revision_map) = \
+ changelog
+
+ # Find match for this component.
+ codereview_api_url = URL_MAP['codereview']['review_url']
+ component_result = FindMatch(
+ revisions, file_to_revision_map, file_dict, component_path,
+ component_name, repository_parser, codereview_api_url)
+ matches = component_result.matches
+
+  # Add all the match results from the dictionary to the list so that they
+  # can be sorted.
+ with results_lock:
+ for cl in matches:
+ match = matches[cl]
+ results.append((callstack_priority, cl, match))
+
+
+def FindMatchForCallstack(
+ callstack, components, component_to_changelog_map, results,
+ results_lock):
+ """Finds culprit cl for a stack within a stacktrace.
+
+ For each components to look for, create new thread that computes the matches
+ and join the results at the end.
+ Args:
+ callstack: A callstack in a stacktrace to find the result for.
+ components: A set of components to look for.
+ component_to_changelog_map: A map from component to its parsed changelog.
+    results: A list to aggregate results from all stacktraces.
+ results_lock: A lock that guards results.
+ """
+ # Create component dictionary from the component and call stack.
+ component_dict = component_dictionary.ComponentDictionary(callstack,
+ components)
+ callstack_priority = callstack.priority
+
+  # Iterate through all components, creating a new thread for each.
+ threads = []
+ for component_path in component_dict:
+ file_dict = component_dict.GetFileDict(component_path)
+
+ if component_path not in component_to_changelog_map:
+ continue
+ changelog = component_to_changelog_map[component_path]
+
+ t = Thread(
+ target=FindMatchForComponent,
+ args=[component_path, file_dict, changelog,
+ callstack_priority, results, results_lock])
+ threads.append(t)
+ t.start()
+
+ # Join the threads before returning.
+ for t in threads:
+ t.join()
+
+
+def FindMatchForStacktrace(stacktrace, components, regression_dict):
+ """Finds the culprit CL for stacktrace.
+
+ The passed stacktrace is either from release build stacktrace
+ or debug build stacktrace.
+
+ Args:
+ stacktrace: A list of parsed stacks within a stacktrace.
+ components: A set of components to look for.
+ regression_dict: A dictionary mapping component path to its regression.
+
+ Returns:
+ A list of match results from all stacks.
+ """
+ # A list to aggregate results from the whole stacktrace.
+ results = []
+ results_lock = Lock()
+
+ # Parse changelog for each of the components and store it.
+ svn_parser = svn_repository_parser.SVNParser(URL_MAP['svn'])
+ git_parser = git_repository_parser.GitParser(regression_dict, URL_MAP['git'])
+
+ component_to_changelog_map = {}
+ for component_path in components:
+ component_object = regression_dict[component_path]
+ range_start = component_object['old_revision']
+ range_end = component_object['new_revision']
+ if range_start == '0':
+ continue
+
+ component_name = regression_dict[component_path]['name']
+
+    # Check whether the regression range is a git hash or an SVN revision
+    # number, and create the appropriate parser object.
+ is_git = utils.IsGitHash(range_start)
+ if is_git:
+ repository_parser = git_parser
+ else:
+ repository_parser = svn_parser
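+    # utils.IsGitHash is assumed to distinguish a 40-character hex git hash
+    # from a purely numeric SVN revision number (hypothetical examples:
+    # 'f63b9a2...' vs. '123456').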
+
+ # Parse revisions and changed files in regression.
+ (revisions, file_to_revision_map) = repository_parser.ParseChangelog(
+ component_path, range_start, range_end)
+
+ if not (revisions and file_to_revision_map):
+ continue
+
+ component_to_changelog_map[component_path] = (repository_parser,
+ component_name,
+ revisions,
+ file_to_revision_map)
+
+  # Create a separate thread for each call stack in the stacktrace.
+ threads = []
+ for callstack in stacktrace.stack_list:
+ t = Thread(
+ target=FindMatchForCallstack,
+ args=[callstack, components, component_to_changelog_map,
+ results, results_lock])
+ threads.append(t)
+ t.start()
+
+ # Join the threads before returning.
+ for t in threads:
+ t.join()
+
+ return results
+
+
+def SortAndFilterMatches(matches):
+ """Filters the list of potential culprit CLs to remove noise.
+
+ Args:
+ matches: A list containing match results.
+
+ Returns:
+ Filtered match results.
+ """
+ new_matches = []
+ line_changed = False
+ in_crash_state = False
+ highest_priority_stack = crash_utils.INFINITY
+
+  # Sort the matches by rank, then stack priority, then position in the
+  # stack, then number of changed files (more files first) and minimum
+  # line distance.
+ matches.sort(key=lambda (stack_index, cl, match):
+ (match.rank, stack_index,
+ crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
+ match.priorities),
+ -len(match.files), match.min_distance))
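+  # The sort is lexicographic over the key tuple, so e.g. (hypothetical
+  # case) a rank-1 match from the crash stack (stack_index 0) sorts ahead
+  # of a rank-1 match from the freed stack (stack_index 1), regardless of
+  # their min_distance values.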
+
+ # Iterate through the matches to find out what results are significant.
+ for stack_index, cl, match in matches:
+    # Check if the current match changes the crashed line.
+ is_line_change = (match.rank == LINE_CHANGE_PRIORITY)
+
+    # Check which stack this match is from.
+ current_stack = stack_index
+ if current_stack < highest_priority_stack:
+ highest_priority_stack = current_stack
+
+    # Check if the current match changes a file that occurs in the crash
+    # state.
+ flattened_stack_frame_indices = [i for sl in match.stack_frame_indices
+ for i in sl]
+ current_in_crash_state = min(flattened_stack_frame_indices) < 5
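+    # The crash state is taken here to be the top five frames (indices
+    # 0 through 4) of the stack.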
+
+    # This match and everything sorted after it should be ignored if:
+    # - The current match does not change crashed lines, but earlier
+    #   matches do.
+    # - The current match is not in the crash state, but earlier matches
+    #   are.
+    # - The current match comes from a lower-priority stack than an
+    #   earlier match.
+ if (line_changed and not is_line_change) or (
+ in_crash_state and not current_in_crash_state) or (
+ current_stack > highest_priority_stack):
+ break
+
+ # Update the variables.
+ if is_line_change:
+ line_changed = True
+ if current_in_crash_state:
+ in_crash_state = True
+
+ # Add current match to the filtered result.
+ new_matches.append((stack_index, cl, match))
+
+ return new_matches
+
+
+def GenerateReasonForMatches(matches):
+ """Generates a reason that a match (CL) is a culprit cl.
+
+ Args:
+ matches: A list of match objects.
+ """
+ # Iterate through the matches in the list.
+  for _, _, match in matches:
+ reason = []
+
+    # Zip the file information in the match with the reason it is suspected
+    # (how the file is modified), and sort by priority and file-name length.
+ match_by_priority = zip(
+ match.priorities, match.line_of_crash, match.files,
+ match.file_urls)
+
+ match_by_priority.sort(
+ key=lambda (priority, crashed_line_number, file_name, file_url):
+ (priority, len(file_name)))
+
+    # Iterate through the sorted matches.
+ for i in range(len(match_by_priority)):
+ (priority, crashed_line_number, file_name, file_url) = \
+ match_by_priority[i]
+
+      # If the file in the match is a line change, append an explanation.
+      if priority == LINE_CHANGE_PRIORITY:
+        reason.append(
+            ' Line %s of file %s, which caused the crash according '
+            'to the stacktrace, is changed in this CL.\n' %
+            (crash_utils.PrettifyList(crashed_line_number),
+             crash_utils.PrettifyFiles([(file_name, file_url)])))
+
+ else:
+ # Get all the files that are not line change.
+ rest_of_the_files = match_by_priority[i:]
+
+        # Use singular or plural wording depending on the file count.
+        if len(rest_of_the_files) == 1:
+          file_string = ' File %s is changed in this CL.\n'
+        else:
+          file_string = ' Files %s are changed in this CL.\n'
+
+        # Create a list of (file name, url) pairs and prettify the list.
+ file_name_with_url = [(file_name, file_url)
+ for (_, _, file_name, file_url)
+ in rest_of_the_files]
+ pretty_file_name_url = crash_utils.PrettifyFiles(file_name_with_url)
+
+        # Add the reason and break, since the rest of the files have been
+        # handled.
+ reason.append(file_string % pretty_file_name_url)
+ break
+
+    # Set the reason as a string.
+ match.reason = ''.join(reason)
+
+
+def CombineMatches(matches):
+ """Combine possible duplicates in matches.
+
+ Args:
+ matches: A list of matches object, along with its callstack priority and
+ CL it is from.
+ Returns:
+ A combined list of matches.
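+    For example (hypothetical case): if the crash stack and the freed stack
+    both flag CL 123456, the two entries are combined into one whose reason
+    strings are concatenated.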
+ """
+ combined_matches = []
+
+ for stack_index, cl, match in matches:
+ found_match = None
+
+ # Iterate through the list of combined matches.
+ for _, cl_combined, match_combined in combined_matches:
+      # Check if the current match already appears higher up in the result.
+ if cl == cl_combined:
+ found_match = match_combined
+
+    # If the current match is not already in the list, add it.
+ if not found_match:
+ combined_matches.append((stack_index, cl, match))
+ continue
+
+    # Otherwise, combine the reasons of the duplicate matches.
+ found_match.reason += match.reason
+
+ return combined_matches
+
+
+def FilterAndGenerateReasonForMatches(result):
+ """A wrapper function.
+
+  It generates reasons for the matches and returns a list of the filtered
+  results.
+
+ Args:
+ result: A list of match objects.
+
+ Returns:
+    A list of the filtered results.
+ """
+ new_result = SortAndFilterMatches(result)
+ GenerateReasonForMatches(new_result)
+ combined_matches = CombineMatches(new_result)
+ return crash_utils.MatchListToResultList(combined_matches)
+
+
+def ParseCrashComponents(main_stack, top_n_frames=3):
+ """Parses the crashing component.
+
+ Args:
+ main_stack: Main stack from the stacktrace.
+    top_n_frames: The number of top frames to regard as crashing frames.
+
+  Returns:
+ A set of components.
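+    For example (hypothetical paths): if the top frames come from
+    'src/v8/src/' and 'src/content/browser/', the returned set would be
+    set(['src/v8/src/', 'src/content/browser/']).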
+ """
+ frame_list = main_stack.GetTopNFrames(top_n_frames)
+ components = set()
+
+ for frame in frame_list:
+ components.add(frame.component_path)
+
+ return components
+
+
+def GenerateAndFilterBlameList(callstack, crash_revision_dict, regression_dict):
+ """A wrapper function.
+
+  Finds blame information for the stack and returns a list of blame results.
+
+ Args:
+ callstack: A callstack to find the blame information.
+ crash_revision_dict: A dictionary mapping component to its crash revision.
+ regression_dict: A dictionary mapping component to its regression.
+
+ Returns:
+ A list of blame results.
+ """
+ # Setup parser objects to use for parsing blame information.
+ svn_parser = svn_repository_parser.SVNParser(URL_MAP['svn'])
+ git_parser = git_repository_parser.GitParser(regression_dict, URL_MAP['git'])
+ parsers = [svn_parser, git_parser]
+
+ # Create and generate the blame objects from the callstack.
+ blame_list = blame.BlameList()
+ blame_list.FindBlame(callstack, crash_revision_dict, regression_dict,
+ parsers)
+
+ blame_list.FilterAndSortBlameList()
+ return crash_utils.BlameListToResultList(blame_list)
+
+
+def FindItForCrash(stacktrace_list,
+ callstack,
+ parsed_regression,
+ parsed_crash_revision):
+ """Finds the culprit CL from the list of stacktrace.
+
+ Args:
+    stacktrace_list: A list of stacktraces to look through, in order of
+      decreasing significance.
+    callstack: A callstack object to show blame information for, if there
+      are no results for any stacktrace in the stacktrace_list.
+    parsed_regression: Parsed regression information, obtained by parsing
+      the DEPS file.
+    parsed_crash_revision: Parsed crash revision information.
+
+ Returns:
+    A tuple of (message, result list), where the message describes how the
+    result was created.
+ """
+ # If regression information is not available, return blame information.
+ if not parsed_regression:
+ return_message = \
+ """
+ Regression information is not available. The result is
+ the blame information.
+ """
+ result = GenerateAndFilterBlameList(callstack, parsed_crash_revision,
+ parsed_regression)
+ return (return_message, result)
+
+ for stacktrace in stacktrace_list:
+    # Check the next stacktrace if the current one is empty.
+ if not stacktrace.stack_list:
+ continue
+
+ # Get the crash stack for this stacktrace, and extract crashing components
+ # from it.
+ main_stack = stacktrace.GetCrashStack()
+ components = ParseCrashComponents(main_stack)
+
+ result_for_stacktrace = FindMatchForStacktrace(stacktrace, components,
+ parsed_regression)
+
+    # If the result is empty, check the next stacktrace. Otherwise, return
+    # the filtered result.
+ if not result_for_stacktrace:
+ continue
+
+ return_message = \
+ 'The result is a list of CLs that change the crashed files.'
+ result = FilterAndGenerateReasonForMatches(result_for_stacktrace)
+ return (return_message, result)
+
+  # If no match is found, return the blame information for the input
+  # callstack.
+ return_message = \
+ """
+ There are no CLs that change the crashed files. The result is the
+ blame information.
+ """
+ result = GenerateAndFilterBlameList(callstack, parsed_crash_revision,
+ parsed_regression)
+ return (return_message, result)
+