Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(7)

Side by Side Diff: tools/findit/findit_for_crash.py

Issue 468823003: [Findit] findit algorithms and a cluster-fuzz implementation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addressed code review. Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « tools/findit/findit_for_clusterfuzz.py ('k') | tools/findit/urls.config » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import logging
6 import os
7 from threading import Lock, Thread
8
9 import blame
10 import component_dictionary
11 import crash_utils
12 import git_repository_parser
13 import match_set
14 import svn_repository_parser
15 import utils
16
17
# Match priorities: a CL that changes the very lines that crashed ranks
# above (numerically below) one that merely touches a crashed file.
LINE_CHANGE_PRIORITY = 1
FILE_CHANGE_PRIORITY = 2
# Service URL map (svn/git/codereview sections), loaded once at import time.
# NOTE(review): resolves 'urls.config' relative to the current working
# directory -- confirm this works when run from outside tools/findit.
URL_MAP = crash_utils.ParseURLsFromConfig('urls.config')
stgao 2014/08/18 22:37:06 The logic to handle the absolute path of the confi
21
22
def GenerateMatchEntry(
    matches, revisions, cl, file_path, file_name, function, component_path,
    component_name, crashed_lines, stack_frame_indices, file_action,
    repository_parser):
  """Generates a match object and adds it to the match set.

  Runs on a worker thread (one per candidate CL/file pair, see FindMatch);
  every access to the shared |matches| object is guarded by
  matches.matches_lock.  The lock is released while the (potentially slow)
  line-diff request runs, then re-acquired to record the results.

  Args:
    matches: A MatchSet object, a map from CL to a match object.
    revisions: The dictionary mapping cl's number to the revision information,
        as returned from a function ParseChangelog in parser.
    cl: The SVN revision number or git hash of the candidate CL.
    file_path: The path of the file.
    file_name: The name of the file that this CL might be the culprit cl.
    function: The function that caused a crash.
    component_path: The path of the component this file is from.
    component_name: The name of the component the file is from.
    crashed_lines: The list of the lines in the file that caused the crash.
    stack_frame_indices: The list of positions of this file within a stack.
    file_action: Whether file is modified ('M'), added or deleted ('D').
    repository_parser: The parser object to parse line diff.
  """
  # Check if this CL should be ignored (it is a revert, or it is the CL
  # some other match reverts).
  with matches.matches_lock:
    if cl in matches.cls_to_ignore:
      return

    # If this CL is not already identified as suspected, create a new entry.
    if cl not in matches.matches:
      revision = revisions[cl]
      match = match_set.Match(revision, component_name)
      message = revisions[cl]['message']
      # TODO(jeun): Don't hold lock while issuing http request.
      match.ParseMessage(message, matches.codereview_api_url)

      # If this match is a revert, add to the set of CLs to be ignored.
      if match.is_reverted:
        matches.cls_to_ignore.add(cl)

        # If this match has info on what CL it is reverted from, add that CL.
        if match.revert_of:
          matches.cls_to_ignore.add(match.revert_of)

        # Return because we do not need to look at this CL anymore.
        return

      # Add this CL to the set of matches.
      matches.matches[cl] = match

    # Else, bring in the existing result.
    else:
      match = matches.matches[cl]

  # Parse line diff information for this file.  Done with the lock released,
  # since this can issue a network request.  |changed_line_contents| is
  # returned by the parser but not needed here.
  (diff_url, changed_line_numbers, changed_line_contents) = (
      repository_parser.ParseLineDiff(
          file_path, component_path, file_action, cl))

  # Ignore this match if the component is not supported for svn.
  if not diff_url:
    return

  # Find the intersection between the lines that this file crashed on and
  # the changed lines.
  (line_intersection, stack_frame_index_intersection) = (
      crash_utils.Intersection(
          crashed_lines, stack_frame_indices, changed_line_numbers))

  # Find the minimum distance between the changed lines and crashed lines.
  min_distance = crash_utils.FindMinLineDistance(crashed_lines,
                                                 changed_line_numbers)

  # Check whether this CL changes the crashed lines or not.
  if line_intersection:
    priority = LINE_CHANGE_PRIORITY
  else:
    priority = FILE_CHANGE_PRIORITY

  # Add the parsed information to the object, under the lock again since
  # other worker threads append to the same match.
  with matches.matches_lock:
    match.line_of_crash.append(line_intersection)
    match.files.append(file_name)

    # Update the min distance only if it is less than the current one.
    if min_distance < match.min_distance:
      match.min_distance = min_distance

    # If this CL does not change the crashed line, all occurrences of this
    # file in the stack have the same priority.
    if not stack_frame_index_intersection:
      stack_frame_index_intersection = stack_frame_indices
    match.stack_frame_indices.append(stack_frame_index_intersection)
    match.file_urls.append(diff_url)

    # Only record the highest rank (lowest priority value) of this CL.
    if priority < match.rank:
      match.rank = priority
    match.priorities.append(priority)
    match.function.append(function)
121
122
def FindMatch(revisions, file_map, file_dict, component_path, component_name,
              repository_parser, codereview_api_url):
  """Finds CLs that modify files appearing in the stacktrace.

  For every non-header file in the stacktrace that some CL in the regression
  range touches, a worker thread checks whether that CL is a suspect.

  Args:
    revisions: A dictionary mapping revision number to the CL information.
    file_map: A dictionary mapping file to the revisions that modify it.
    file_dict: A dictionary mapping file to its occurrence in stacktrace.
    component_path: The path of the component to search for.
    component_name: The name of the component to search for.
    repository_parser: The parser object to parse the line diff.
    codereview_api_url: A code review url to retrieve data from.

  Returns:
    A MatchSet object holding the suspected CLs.
  """
  matches = match_set.MatchSet(codereview_api_url)
  workers = []

  for file_name in file_dict:
    # Header files, and files no CL in the range ever touched, cannot be
    # culprits.
    if file_name.endswith('.h') or file_name not in file_map:
      continue

    # Two files with the same name may live at different paths, so every
    # crashed path for this name is examined separately.
    for path in file_dict.GetPathDic(file_name):
      crashed_lines = file_dict.GetCrashedLineNumbers(path)
      frame_indices = file_dict.GetCrashStackFrameindex(path)
      crashed_function = file_dict.GetCrashFunction(path)

      # Check every CL in the regression range that changes this file.
      for cl, file_action, file_path in file_map[file_name]:
        # A deleted file, or a file at an unrelated path, is not a match.
        if file_action == 'D' or not crash_utils.GuessIfSameSubPath(
            path, file_path):
          continue

        # Evaluate this candidate CL on its own thread.
        worker = Thread(
            target=GenerateMatchEntry,
            args=[matches, revisions, cl, file_path, file_name,
                  crashed_function, component_path, component_name,
                  crashed_lines, frame_indices, file_action,
                  repository_parser])
        workers.append(worker)
        worker.start()

  # Wait for every worker before post-processing the collected matches.
  for worker in workers:
    worker.join()

  # Reverted CLs (and the CLs they revert) are noise, not culprits.
  matches.RemoveReverts()

  return matches
188
189
def FindMatchForComponent(component_path, file_dict, changelog,
                          callstack_priority, results, results_lock):
  """Parses changelog and records suspected CLs for one component.

  Args:
    component_path: The path of component to look for the culprit CL.
    file_dict: A dictionary mapping file to its occurrence in stackframe.
    changelog: The parsed changelog for this component, a tuple of
        (repository_parser, component_name, revisions, file_to_revision_map).
    callstack_priority: The priority of this call stack, 0 if from crash
        stack, 1 if from freed, 2 if from previously allocated.
    results: A shared list in which (priority, cl, match) tuples are stored.
    results_lock: A lock that guards results.
  """
  repository_parser, component_name, revisions, file_to_revision_map = (
      changelog)

  # Search this component's changelog for CLs that touch crashed files.
  component_result = FindMatch(
      revisions, file_to_revision_map, file_dict, component_path,
      component_name, repository_parser,
      URL_MAP['codereview']['review_url'])
  matches = component_result.matches

  # Flatten the match dictionary into the shared result list so that it can
  # be sorted later; the lock guards appends from sibling component threads.
  with results_lock:
    for cl, match in matches.items():
      results.append((callstack_priority, cl, match))
219
220
def FindMatchForCallstack(
    callstack, components, component_to_changelog_map, results,
    results_lock):
  """Finds culprit CLs for a single stack within a stacktrace.

  Spawns one worker thread per component with a parsed changelog, and joins
  all of them before returning; results accumulate into the shared |results|.

  Args:
    callstack: A callstack in a stacktrace to find the result for.
    components: A set of components to look for.
    component_to_changelog_map: A map from component to its parsed changelog.
    results: A list to aggregate results from all stacktraces.
    results_lock: A lock that guards results.
  """
  # Group the stack's files by the component they belong to.
  component_dict = component_dictionary.ComponentDictionary(callstack,
                                                            components)
  stack_priority = callstack.priority

  workers = []
  for component_path in component_dict:
    file_dict = component_dict.GetFileDict(component_path)

    # Components without a parsed changelog cannot be searched.
    if component_path not in component_to_changelog_map:
      continue

    worker = Thread(
        target=FindMatchForComponent,
        args=[component_path, file_dict,
              component_to_changelog_map[component_path],
              stack_priority, results, results_lock])
    workers.append(worker)
    worker.start()

  for worker in workers:
    worker.join()
259
260
def FindMatchForStacktrace(stacktrace, components, regression_dict):
  """Finds the culprit CLs for one stacktrace (release or debug build).

  Args:
    stacktrace: A list of parsed stacks within a stacktrace.
    components: A set of components to look for.
    regression_dict: A dictionary mapping component path to its regression.

  Returns:
    A list of (callstack_priority, cl, match) results from all stacks.
  """
  # Shared accumulator for all callstack threads, guarded by |results_lock|.
  results = []
  results_lock = Lock()

  # Both parser flavors are prepared up front; each component picks one.
  svn_parser = svn_repository_parser.SVNParser(URL_MAP['svn'])
  git_parser = git_repository_parser.GitParser(regression_dict,
                                               URL_MAP['git'])

  # Pre-parse the changelog of every component in its regression range.
  component_to_changelog_map = {}
  for component_path in components:
    regression = regression_dict[component_path]
    range_start = regression['old_revision']
    range_end = regression['new_revision']

    # '0' marks a component without a usable regression range.
    if range_start == '0':
      continue

    component_name = regression_dict[component_path]['name']

    # A git hash means the component's regression is expressed in git;
    # otherwise it is an SVN revision number.
    repository_parser = (
        git_parser if utils.IsGitHash(range_start) else svn_parser)

    # Parse revisions and changed files in the regression range.
    revisions, file_to_revision_map = repository_parser.ParseChangelog(
        component_path, range_start, range_end)

    if not (revisions and file_to_revision_map):
      continue

    component_to_changelog_map[component_path] = (
        repository_parser, component_name, revisions, file_to_revision_map)

  # One worker thread per call stack in the stacktrace.
  workers = []
  for callstack in stacktrace.stack_list:
    worker = Thread(
        target=FindMatchForCallstack,
        args=[callstack, components, component_to_changelog_map,
              results, results_lock])
    workers.append(worker)
    worker.start()

  for worker in workers:
    worker.join()

  return results
328
329
def SortAndFilterMatches(matches):
  """Filters the list of potential culprit CLs to remove noise.

  Matches are sorted by significance, and everything after the first match
  that is strictly less significant than an earlier one is dropped.

  Args:
    matches: A list of (stack_index, cl, match) tuples.

  Returns:
    Filtered and sorted list of (stack_index, cl, match) tuples.
  """
  new_matches = []
  line_changed = False
  in_crash_state = False
  highest_priority_stack = crash_utils.INFINITY

  def _significance_key(entry):
    """Sort key: line-change CLs first, then earlier stacks, then matches
    nearer the stack top, then CLs touching more files, then smaller
    changed-to-crashed line distance.  Uses explicit unpacking instead of a
    tuple-parameter lambda, which is Python-2-only syntax (PEP 3113)."""
    stack_index, _, match = entry
    return (match.rank, stack_index,
            crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
                                                match.priorities),
            -len(match.files), match.min_distance)

  matches.sort(key=_significance_key)

  # Iterate through the matches to find out what results are significant.
  for stack_index, cl, match in matches:
    # Check if the current match changes the crashed line.
    is_line_change = (match.rank == LINE_CHANGE_PRIORITY)

    # Check which stack this match is from.
    current_stack = stack_index
    if current_stack < highest_priority_stack:
      highest_priority_stack = current_stack

    # A match is "in crash state" if any of its frames is within the top 5.
    flattened_stack_frame_indices = [frame_index
                                     for index_list in
                                     match.stack_frame_indices
                                     for frame_index in index_list]
    # Guard the empty case: a match without any frame indices is treated as
    # not in the crash state, instead of letting min() raise ValueError.
    current_in_crash_state = (
        bool(flattened_stack_frame_indices) and
        min(flattened_stack_frame_indices) < 5)

    # This match and anything lower than this should be ignored if:
    # - Current match does not change crashed lines but there are matches
    #   that do so.
    # - Current match is not in crash state but there are matches in it.
    # - There are other matches that came from a higher priority stack.
    if ((line_changed and not is_line_change) or
        (in_crash_state and not current_in_crash_state) or
        current_stack > highest_priority_stack):
      break

    # Update the filter state.
    if is_line_change:
      line_changed = True
    if current_in_crash_state:
      in_crash_state = True

    # Add current match to the filtered result.
    new_matches.append((stack_index, cl, match))

  return new_matches
386
387
def GenerateReasonForMatches(matches):
  """Generates, for each match (CL), the reason it is a suspected culprit.

  Sets |match.reason| on every match in |matches| as a side effect.

  Args:
    matches: A list of (stack_index, cl, match) tuples.
  """
  # The stack index and cl of each entry are not needed here.
  for _, _, match in matches:
    reason = []

    # Pair each suspected file with why it is suspected (how it was
    # modified), ordered line-change matches first, shorter file names
    # first.  sorted(zip(...)) with an index-based key is used instead of
    # zip(...).sort() with a tuple-parameter lambda: on Python 3, zip
    # returns an iterator without .sort() and tuple-parameter lambdas are
    # a syntax error (PEP 3113).
    match_by_priority = sorted(
        zip(match.priorities, match.line_of_crash, match.files,
            match.file_urls),
        key=lambda entry: (entry[0], len(entry[2])))

    # Note: enumerate avoids shadowing any outer loop variable, which the
    # previous index-based loop did.
    for index, (priority, crashed_line_number, file_name, file_url) in (
        enumerate(match_by_priority)):
      # If the file in the match is a line change, append an explanation.
      if priority == LINE_CHANGE_PRIORITY:
        reason.append(
            ' Line %s of file %s which caused the crash according '
            'to the stacktrace, is changed in this cl.\n' %
            (crash_utils.PrettifyList(crashed_line_number),
             crash_utils.PrettifyFiles([(file_name, file_url)])))

      else:
        # Everything from here on is a mere file change (the list is
        # sorted by priority), so report them together and stop.
        rest_of_the_files = match_by_priority[index:]

        # Take care of single/multiple files in the message.
        if len(rest_of_the_files) == 1:
          file_string = ' File %s is changed in this cl.\n'
        else:
          file_string = ' Files %s are changed in this cl.\n'

        # Create a list of file name and its url, and prettify the list.
        file_name_with_url = [(name, url)
                              for (_, _, name, url) in rest_of_the_files]
        pretty_file_name_url = crash_utils.PrettifyFiles(file_name_with_url)

        reason.append(file_string % pretty_file_name_url)
        break

    # Set the reason as one string.
    match.reason = ''.join(reason)
443
444
def CombineMatches(matches):
  """Combines entries in |matches| that refer to the same CL.

  The first (highest-ranked) entry for a CL is kept; the reasons of later
  duplicates are appended onto it.

  Args:
    matches: A list of (stack_index, cl, match) tuples, along with the
        callstack priority and CL each match is from.

  Returns:
    A combined list of (stack_index, cl, match) tuples, one per CL, in the
    order the CLs first appear.
  """
  combined_matches = []

  for stack_index, cl, match in matches:
    existing_match = None

    # Look for an earlier (higher-ranked) entry for the same CL.
    for _, combined_cl, combined_match in combined_matches:
      if cl == combined_cl:
        existing_match = combined_match
        # CLs in combined_matches are unique, so stop at the first hit
        # (the original scan needlessly kept iterating).
        break

    # If this CL is not already present, keep it as-is.
    if not existing_match:
      combined_matches.append((stack_index, cl, match))
      continue

    # Merge the duplicate's explanation into the entry already kept.
    existing_match.reason += match.reason

  return combined_matches
474
475
def FilterAndGenerateReasonForMatches(result):
  """A wrapper function.

  Sorts and filters the raw matches, attaches a human-readable reason to
  each, merges duplicate CLs, and converts everything to result form via
  crash_utils.MatchListToResultList.

  Args:
    result: A list of match objects.

  Returns:
    The filtered matches converted by crash_utils.MatchListToResultList.
  """
  filtered_matches = SortAndFilterMatches(result)
  GenerateReasonForMatches(filtered_matches)
  return crash_utils.MatchListToResultList(CombineMatches(filtered_matches))
492
493
def ParseCrashComponents(main_stack, top_n_frames=3):
  """Collects the component paths of the topmost crash frames.

  Args:
    main_stack: Main stack from the stacktrace.
    top_n_frames: How many topmost frames to regard as crashing frames.

  Returns:
    A set of component paths.
  """
  return set(frame.component_path
             for frame in main_stack.GetTopNFrames(top_n_frames))
510
511
def GenerateAndFilterBlameList(callstack, crash_revision_dict, regression_dict):
  """A wrapper function.

  Finds blame information for a stack and returns it in result form.

  Args:
    callstack: A callstack to find the blame information for.
    crash_revision_dict: A dictionary mapping component to its crash revision.
    regression_dict: A dictionary mapping component to its regression.

  Returns:
    A list of blame results.
  """
  # Both parser flavors are supplied, since components may be hosted in
  # either SVN or git.
  parsers = [
      svn_repository_parser.SVNParser(URL_MAP['svn']),
      git_repository_parser.GitParser(regression_dict, URL_MAP['git']),
  ]

  # Generate blame entries from the callstack, then filter and sort them
  # before converting to result form.
  blame_list = blame.BlameList()
  blame_list.FindBlame(callstack, crash_revision_dict, regression_dict,
                       parsers)
  blame_list.FilterAndSortBlameList()

  return crash_utils.BlameListToResultList(blame_list)
537
538
def FindItForCrash(stacktrace_list,
                   callstack,
                   parsed_regression,
                   parsed_crash_revision):
  """Finds the culprit CL from the list of stacktraces.

  Tries each stacktrace in order; falls back to blame information when no
  regression range is available or no stacktrace yields a match.

  Args:
    stacktrace_list: A list of stacktraces to look for, in the order of
      decreasing significance.
    callstack: A callstack object to show blame information for, if there are
      no results for all stacktraces in the stacktrace_list.
    parsed_regression: A parsed regression information as a result of parsing
      DEPS file; falsy when no regression range is available.
    parsed_crash_revision: A parsed crash revision information.

  Returns:
    A (return_message, result) pair: |return_message| explains how the
    result was produced, and |result| is either a list of suspected-CL
    results or a list of blame results.
  """
  # If regression information is not available, return blame information.
  if not parsed_regression:
    return_message = \
        """
        Regression information is not available. The result is
        the blame information.
        """
    result = GenerateAndFilterBlameList(callstack, parsed_crash_revision,
                                        parsed_regression)
    return (return_message, result)

  for stacktrace in stacktrace_list:
    # Check the next stacktrace if current one is empty.
    if not stacktrace.stack_list:
      continue

    # Get the crash stack for this stacktrace, and extract crashing
    # components from it.
    main_stack = stacktrace.GetCrashStack()
    components = ParseCrashComponents(main_stack)

    result_for_stacktrace = FindMatchForStacktrace(stacktrace, components,
                                                   parsed_regression)

    # If the result is empty, check the next stacktrace. Else, return the
    # filtered result.
    if not result_for_stacktrace:
      continue

    return_message = \
        'The result is a list of CLs that change the crashed files.'
    result = FilterAndGenerateReasonForMatches(result_for_stacktrace)
    return (return_message, result)

  # If no match is found, return the blame information for the input
  # callstack.
  return_message = \
      """
      There are no CLs that change the crashed files. The result is the
      blame information.
      """
  result = GenerateAndFilterBlameList(callstack, parsed_crash_revision,
                                      parsed_regression)
  return (return_message, result)
601
OLDNEW
« no previous file with comments | « tools/findit/findit_for_clusterfuzz.py ('k') | tools/findit/urls.config » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698