Chromium Code Reviews

Side by Side Diff: tools/findit/findit_for_crash.py

Issue 468823003: [Findit] findit algorithms and a cluster-fuzz implementation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 6 years, 4 months ago
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import logging
6 import os
7 from threading import Lock, Thread
8
9 import blame
10 import component_dictionary
11 import crash_utils
12 import git_repository_parser
13 import match_set
14 import svn_repository_parser
15 import utils
16
17
18 LINE_CHANGE_PRIORITY = 1
19 FILE_CHANGE_PRIORITY = 2
20 logging.basicConfig(filename='errors.log', level=logging.WARNING,
stgao 2014/08/14 18:25:42 Let's move this out. logging setup should be done
jeun 2014/08/14 20:54:12 Done.
21 filemode='w+')
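Per the review note above, logging configuration is better done once at process startup rather than at import time. A minimal sketch of what that could look like in a hypothetical entry-point script (the module name findit_main.py and the call into FindItForCrash are assumptions, not part of this CL):

  # findit_main.py -- hypothetical entry point, shown for illustration only.
  import logging

  import findit_for_crash


  def main():
    # Configure logging once at process startup instead of at import time.
    logging.basicConfig(filename='errors.log', filemode='w+',
                        level=logging.WARNING)
    # ... parse inputs here and call findit_for_crash.FindItForCrash(...) ...


  if __name__ == '__main__':
    main()
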
22 URL_MAP = crash_utils.ParseURLsFromConfig('urls.config')
23
24
25 def GenerateMatchEntry(
26 matches, revisions, cl, file_path, file_name, function, component_path,
27 component_name, crashed_lines, stack_frame_indices, file_action,
28 repository_parser):
29 """Generates a match object and adds it to the match set.
30
31 Args:
32 matches: A MatchSet object, a map from CL to a match object.
33 revisions: A dictionary mapping CL number to its revision information,
34 as returned by the ParseChangelog function of the parser.
35 cl: The SVN revision number or git hash.
36 file_path: The path of the file.
37 file_name: The name of the file for which this CL might be the culprit.
38 function: The function that caused the crash.
39 component_path: The path of the component this file is from.
40 component_name: The name of the component the file is from.
41 crashed_lines: The list of the lines in the file that caused the crash.
42 stack_frame_indices: The list of positions of this file within a stack.
43 file_action: Whether the file is modified, added or deleted.
44 repository_parser: The parser object to parse line diff.
45 """
46 # Check if this CL should be ignored.
stgao 2014/08/14 18:25:42 avoided -> ignored?
jeun 2014/08/14 20:54:13 Done.
47 with matches.matches_lock:
48 if cl in matches.cls_to_ignore:
49 return
50
51 # If this CL is not already identified as suspected, create a new entry.
52 if cl not in matches.matches:
53 revision = revisions[cl]
54 match = match_set.Match(revision, component_name)
55 message = revisions[cl]['message']
56 # TODO(jeun): Don't hold lock while issuing http request.
57 match.ParseMessage(message, matches.codereview_api_url)
58
59 # If this match is a revert, add it to the set of CLs to be ignored.
60 if match.is_reverted:
61 matches.cls_to_ignore.add(cl)
62
63 # If this match has info on which CL it is a revert of, add that CL too.
64 if match.revert_of:
65 matches.cls_to_ignore.add(match.revert_of)
66
67 # Return because we do not need to look at this CL anymore.
68 return
69
70 # Add this CL to the set of matches.
71 matches.matches[cl] = match
72
73 # Else, bring in the existing result.
74 else:
75 match = matches.matches[cl]
76
77 # Parse line diff information for this file.
78 (diff_url, changed_line_numbers, changed_line_contents) = (
79 repository_parser.ParseLineDiff(
80 file_path, component_path, file_action, cl))
81
82 # Ignore this match if the component is not supported for svn.
83 if not diff_url:
84 return
85
86 # Find the intersection between the lines that this file crashed on and
87 # the changed lines.
88 (line_intersection, stack_frame_index_intersection) = (
89 crash_utils.Intersection(
90 crashed_lines, stack_frame_indices, changed_line_numbers))
91
92 # Find the minimum distance between the changed lines and crashed lines.
93 min_distance = crash_utils.FindMinLineDistance(crashed_lines,
94 changed_line_numbers)
95
96 # Check whether this CL changes the crashed lines or not.
97 if line_intersection:
98 priority = LINE_CHANGE_PRIORITY
99 else:
100 priority = FILE_CHANGE_PRIORITY
101
102 # Add the parsed information to the object.
103 with matches.matches_lock:
104 match.line_of_crash.append(line_intersection)
105 match.files.append(file_name)
106
107 # Update the min distance only if it is less than the current one.
108 if min_distance < match.min_distance:
109 match.min_distance = min_distance
110
111 # If this CL does not change the crashed line, all occurrences of this
112 # file in the stack have the same priority.
113 if not stack_frame_index_intersection:
114 stack_frame_index_intersection = stack_frame_indices
115 match.stack_frame_indices.append(stack_frame_index_intersection)
116 match.file_urls.append(diff_url)
117
118 # Only record the highest rank of this CL.
119 if priority < match.rank:
120 match.rank = priority
121 match.priorities.append(priority)
122 match.function.append(function)
123
124
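GenerateMatchEntry above assigns LINE_CHANGE_PRIORITY only when the crashed lines intersect the changed lines, and otherwise falls back to FILE_CHANGE_PRIORITY, with the minimum line distance recorded as a later tie-breaker. The two helpers live in crash_utils and are not shown in this file; the sketch below is only the behavior the call sites appear to assume (including the assumption that crashed_lines and stack_frame_indices are parallel lists), not the actual implementation:

  # Illustrative stand-ins for the crash_utils helpers used above.
  def Intersection(crashed_lines, stack_frame_indices, changed_line_numbers):
    """Returns the crashed lines that were changed, and their frame indices."""
    line_intersection = []
    frame_intersection = []
    for line, frames in zip(crashed_lines, stack_frame_indices):
      if line in changed_line_numbers:
        line_intersection.append(line)
        frame_intersection.append(frames)
    return line_intersection, frame_intersection


  def FindMinLineDistance(crashed_lines, changed_line_numbers):
    """Returns the smallest gap between a crashed line and a changed line."""
    distances = [abs(crashed - changed)
                 for crashed in crashed_lines
                 for changed in changed_line_numbers]
    return min(distances) if distances else float('inf')
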
125 def FindMatch(revisions, file_map, file_dict, component_path, component_name,
126 repository_parser, codereview_api_url):
127 """Finds a CL that modifies file in the stacktrace.
128
129 Args:
130 revisions: A dictionary mapping revision number to the CL information.
131 file_map: A dictionary mapping a file to the revisions that modify it.
132 file_dict: A dictionary mapping file to its occurrence in stacktrace.
133 component_path: The path of the component to search for.
134 component_name: The name of the component to search for.
135 repository_parser: The parser object to parse the line diff.
136 codereview_api_url: A code review url to retrieve data from.
137
138 Returns:
139 Matches, a set of match objects.
140 """
141 matches = match_set.MatchSet(codereview_api_url)
142 threads = []
143
144 # For each file in the stacktrace that is not a header file, check against
145 # all CLs to see if any CL changes this file. If so, add it to the set of
146 # suspected CLs.
147 for file_name in file_dict:
148 # Ignore header files.
149 if file_name.endswith('.h'):
150 continue
151
152 # If the file in the stacktrace is not changed in any commits, continue.
153 if file_name not in file_map:
154 continue
155
156 path_dict = file_dict.GetPathDic(file_name)
157 # There can be two files in different paths that have the same
158 # name, so the path also has to be checked.
159 for path in path_dict:
160 crashed_lines = file_dict.GetCrashedLineNumbers(path)
161 stack_frame_nums = file_dict.GetCrashStackFrameindex(path)
162 function = file_dict.GetCrashFunction(path)
163
164 # Iterate through the revisions in which this file is changed.
165 for (cl, file_action, file_path) in file_map[file_name]:
166
stgao 2014/08/14 18:25:42 This empty line could be removed.
jeun 2014/08/14 20:54:13 Done.
167 # If the paths of the two files are different or the commit deletes the
168 # file, continue.
169 if not crash_utils.GuessIfSameSubPath(path, file_path) or (
170 file_action == 'D'):
171 continue
172
173 # Create a new thread for adding this CL.
174 match_thread = Thread(
175 target=GenerateMatchEntry,
176 args=[matches, revisions, cl, file_path, file_name, function,
177 component_path, component_name, crashed_lines,
178 stack_frame_nums, file_action,
179 repository_parser])
180 threads.append(match_thread)
181 match_thread.start()
182
183 # Make sure all threads finish before returning.
184 for match_thread in threads:
185 match_thread.join()
186
187 # Remove CLs that are reverts.
188 matches.RemoveReverts()
189
190 return matches
191
192
193 def FindMatchForComponent(component_path, file_dict, changelog,
194 callstack_priority, results, results_lock):
195 """Parses changelog and finds suspected CLs for a given component.
196
197 Args:
198 component_path: The path of component to look for the culprit CL.
199 file_dict: A dictionary mapping file to its occurrence in stackframe.
200 changelog: The parsed changelog for this component.
201 callstack_priority: The priority of this call stack: 0 if from the crash
202 stack, 1 if from the freed stack, 2 if from the previously-allocated stack.
203 results: A dictionary to store the result.
204 results_lock: A lock that guards results.
stgao 2014/08/14 18:25:42 result_map is not clear.
jeun 2014/08/14 20:54:12 Done.
205 """
206 (repository_parser, component_name, revisions, file_to_revision_map) = \
207 changelog
208
209 # Find match for this component.
210 codereview_api_url = URL_MAP['codereview']['review_url']
211 component_result = FindMatch(
212 revisions, file_to_revision_map, file_dict, component_path,
213 component_name, repository_parser, codereview_api_url)
214 matches = component_result.matches
215
216 # Add all the match results from the dictionary to the list so that they
217 # can be sorted.
218 with results_lock:
219 for cl in matches:
220 match = matches[cl]
221 results.append((callstack_priority, cl, match))
222
223
224 def FindMatchForCallstack(
225 callstack, components, component_to_changelog_map, results,
226 results_lock):
227 """Finds culprit cl for a stack within a stacktrace.
228
229 For each components to look for, create new thread that computes the matches
230 and join the results at the end.
231 Args:
232 callstack: A callstack in a stacktrace to find the result for.
233 components: A set of components to look for.
234 component_to_changelog_map: A map from component to its parsed changelog.
235 results: A list to aggregate results from all stacktraces.
236 results_lock: A lock that guards results.
stgao 2014/08/14 18:25:42 all_result is not clear.
jeun 2014/08/14 20:54:13 Done.
237 """
238 # Create component dictionary from the component and call stack.
239 component_dict = component_dictionary.ComponentDictionary(callstack,
240 components)
241 callstack_priority = callstack.priority
242
243 # Iterate through all components and create a new thread for each.
244 threads = []
245 for component_path in component_dict:
246 file_dict = component_dict.GetFileDict(component_path)
247
248 if component_path not in component_to_changelog_map:
249 continue
250 changelog = component_to_changelog_map[component_path]
251
252 t = Thread(
253 target=FindMatchForComponent,
254 args=[component_path, file_dict, changelog,
255 callstack_priority, results, results_lock])
256 threads.append(t)
257 t.start()
258
259 # Join the threads before returning.
260 for t in threads:
261 t.join()
262
263
264 def FindMatchForStacktrace(stacktrace, components, regression_dict):
265 """Finds the culprit CL for stacktrace.
266
267 The passed stacktrace is either a release-build stacktrace
268 or a debug-build stacktrace.
269
270 Args:
271 stacktrace: A list of parsed stacks within a stacktrace.
272 components: A set of components to look for.
273 regression_dict: A dictionary mapping component path to its regression.
274
275 Returns:
276 A list of match results from all stacks.
277 """
278 # A list to aggregate results from the whole stacktrace.
279 results = []
280 results_lock = Lock()
281
282 # Parse changelog for each of the components and store it.
283 svn_parser = svn_repository_parser.SVNParser(URL_MAP['svn'])
284 git_parser = git_repository_parser.GitParser(regression_dict, URL_MAP['git'])
285
286 component_to_changelog_map = {}
287 for component_path in components:
288 component_object = regression_dict[component_path]
289 range_start = component_object['old_revision']
290 range_end = component_object['new_revision']
291 if range_start == '0':
292 continue
293
294 component_name = regression_dict[component_path]['name']
295
296 # Check if the regression is a git hash or an SVN revision number, and
297 # create the appropriate parser object.
298 is_git = utils.IsGitHash(range_start)
299 if is_git:
300 repository_parser = git_parser
301 else:
302 repository_parser = svn_parser
303
304 # Parse revisions and changed files in regression.
305 (revisions, file_to_revision_map) = repository_parser.ParseChangelog(
306 component_path, range_start, range_end)
307
308 if not (revisions and file_to_revision_map):
309 continue
310
311 component_to_changelog_map[component_path] = (repository_parser,
312 component_name,
313 revisions,
314 file_to_revision_map)
315
316 # Create a separate thread for each call stack in the stacktrace.
317 threads = []
318 for callstack in stacktrace.stack_list:
319 t = Thread(
320 target=FindMatchForCallstack,
321 args=[callstack, components, component_to_changelog_map,
322 results, results_lock])
323 threads.append(t)
324 t.start()
325
326 # Join the threads before returning.
327 for t in threads:
328 t.join()
329
330 return results
331
332
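FindMatchForStacktrace above picks the git or SVN parser by calling utils.IsGitHash on the start of the regression range; the helper itself lives in utils and is not shown here. A minimal sketch of such a check, assuming a git hash is a 40-character hexadecimal string while an SVN revision is purely numeric:

  import re

  # Illustrative only; not the utils.IsGitHash implementation from this CL.
  GIT_HASH_PATTERN = re.compile(r'^[0-9a-fA-F]{40}$')


  def IsGitHash(revision):
    return bool(GIT_HASH_PATTERN.match(str(revision)))
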
333 def SortAndFilterMatches(matches):
334 """Filters the list of potential culprit CLs to remove noise.
335
336 Args:
337 matches: A list containing match results.
338
339 Returns:
340 Filtered match results.
341 """
342 new_matches = []
343 line_changed = False
344 in_crash_state = False
345 highest_priority_stack = crash_utils.INFINITY
346
347 # Sort the matches in a specific order.
348 matches.sort(key=lambda (stack_index, cl, match):
349 (match.rank, stack_index,
350 crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
351 match.priorities),
352 -len(match.files), match.min_distance))
353
354 # Iterate through the matches to find out what results are significant.
355 for stack_index, cl, match in matches:
356
stgao 2014/08/14 18:25:42 This empty line could be removed.
jeun 2014/08/14 20:54:12 Done.
357 # Check if the current match changes crashed line.
358 is_line_change = (match.rank == LINE_CHANGE_PRIORITY)
359
360 # Check which call stack this match is from.
361 current_stack = stack_index
362 if current_stack < highest_priority_stack:
363 highest_priority_stack = current_stack
364
365 # Check if current match changes a file that occurs in crash state.
366 flattened_stack_frame_indices = [i for sl in match.stack_frame_indices
367 for i in sl]
368 current_in_crash_state = min(flattened_stack_frame_indices) < 5
369
370 # Ignore this match and everything after it if any of the following hold:
371 # - The current match does not change crashed lines, but an earlier match does.
372 # - The current match is not in the crash state, but an earlier match is.
373 # - An earlier match came from a higher-priority stack than the current
374 #   one.
stgao 2014/08/14 18:25:42 Could this comment turned into bullet point?
jeun 2014/08/14 20:54:13 Done.
375 if (line_changed and not is_line_change) or (
376 in_crash_state and not current_in_crash_state) or (
377 current_stack > highest_priority_stack):
378 break
379
380 # Update the variables.
381 if is_line_change:
382 line_changed = True
383 if current_in_crash_state:
384 in_crash_state = True
385
386 # Add current match to the filtered result.
387 new_matches.append((stack_index, cl, match))
388
389 return new_matches
390
391
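The sort in SortAndFilterMatches orders matches by rank (line changes before file-only changes), then by the call stack they came from, then by the earliest stack frame the CL appears at (as computed by crash_utils.FindMinStackFrameNumber), then by the number of changed files (more files first), and finally by minimum line distance. Since Python compares tuples lexicographically, a match with a better rank wins regardless of the later fields; a tiny illustration with made-up key values:

  # Hypothetical sort keys: (rank, stack_index, min_frame, -len(files), distance).
  key_line_change = (1, 0, 3, -2, 10)   # CL that changed a crashed line.
  key_file_change = (2, 0, 0, -5, 0)    # CL that only touched the file.
  assert sorted([key_file_change, key_line_change])[0] == key_line_change
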
392 def GenerateReasonForMatches(matches):
393 """Generates a reason that a match (CL) is a culprit cl.
394
395 Args:
396 matches: A list of match objects.
397 """
398 # Iterate through the matches in the list.
399 for i, _, match in matches:
400 reason = []
stgao 2014/08/14 18:25:42 reasons?
jeun 2014/08/14 20:54:12 I would call it a reason because it is a list of s
401
402 # Zip the files in the match with the reason they are suspected
403 # (how the file is modified) and sort them in a specific order.
404 match_by_priority = zip(
405 match.priorities, match.line_of_crash, match.files,
406 match.file_urls)
407
408 match_by_priority.sort(
409 key=lambda (priority, crashed_line_number, file_name, file_url):
410 (priority, len(file_name)))
411
412 # Iterate through the sorted matches.
413 for i in range(len(match_by_priority)):
414 (priority, crashed_line_number, file_name, file_url) = \
stgao 2014/08/14 18:25:42 file_name or file_path? Same the one for above in
jeun 2014/08/14 20:54:13 It is file_name.
415 match_by_priority[i]
416
417 # If the file in the match is a line change, append an explanation.
418 if priority == LINE_CHANGE_PRIORITY:
419 reason.append(
420 ' Line %s of file %s, which caused the crash according '
421 'to the stacktrace, is changed in this cl.\n' %
422 (crash_utils.PrettifyList(crashed_line_number),
423 crash_utils.PrettifyFiles([(file_name, file_url)])))
424
425 else:
426 # Get all the files that are not line changes.
427 rest_of_the_files = match_by_priority[i:]
428
429 # Handle single vs. multiple files in the string.
430 if len(rest_of_the_files) == 1:
431 file_string = ' File %s is changed in this cl.\n'
432 else:
433 file_string = ' Files %s are changed in this cl.\n'
434
435 # Create a list of file names and their URLs, and prettify the list.
436 file_name_with_url = [(file_name, file_url)
437 for (_, _, file_name, file_url)
438 in rest_of_the_files]
439 pretty_file_name_url = crash_utils.PrettifyFiles(file_name_with_url)
440
441 # Add the reason and break, since the rest of the files are handled.
442 reason.append(file_string % pretty_file_name_url)
443 break
444
445 # Set the reason as a string.
446 match.reason = ''.join(reason)
447
448
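GenerateReasonForMatches relies on crash_utils.PrettifyList and crash_utils.PrettifyFiles to render line numbers and (file name, URL) pairs as readable strings; those helpers are defined elsewhere in this CL and are not shown here. A rough sketch of the kind of formatting the reason strings above appear to expect (the exact output format is an assumption):

  # Illustrative stand-ins; the real helpers live in crash_utils.
  def PrettifyList(items):
    """Renders a list of line numbers such as [10, 12] as '10, 12'."""
    return ', '.join(str(item) for item in items)


  def PrettifyFiles(file_name_url_pairs):
    """Renders (file_name, url) pairs as 'name (url)' strings."""
    return ', '.join('%s (%s)' % (name, url)
                     for name, url in file_name_url_pairs)
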
449 def CombineMatches(matches):
450 """Combine possible duplicates in matches.
451
452 Args:
453 matches: A list of match objects, along with the callstack priority and
454 the CL each one is from.
455 Returns:
456 A combined list of matches.
457 """
458 combined_matches = []
459
460 for stack_index, cl, match in matches:
461 found_match = None
462
463 # Iterate through the list of combined matches.
464 for _, cl_combined, match_combined in combined_matches:
465 # Check if the current match is already higher up in the result.
466 if cl == cl_combined:
467 found_match = match_combined
468
469 # If the current match is not already in the list of matches, add it.
470 if not found_match:
471 combined_matches.append((stack_index, cl, match))
472 continue
473
474 # Combine the reasons if the current match is already in the list.
475 found_match.reason += match.reason
476
477 return combined_matches
478
479
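CombineMatches keeps the first tuple seen for each CL and folds the reasons of later duplicates into it. A small usage sketch with stand-in objects (SimpleMatch and the values below are hypothetical; only the reason attribute is exercised):

  class SimpleMatch(object):
    """Minimal stand-in for match_set.Match, for illustration only."""

    def __init__(self, reason):
      self.reason = reason


  first = SimpleMatch('Changed crashed line 10 of foo.cc.\n')
  duplicate = SimpleMatch('Also touches bar.cc.\n')
  combined = CombineMatches([(0, 'r12345', first), (1, 'r12345', duplicate)])
  # One entry remains for r12345, with both reasons concatenated.
  assert len(combined) == 1
  assert combined[0][2].reason == ('Changed crashed line 10 of foo.cc.\n'
                                   'Also touches bar.cc.\n')
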
480 def FilterAndGenerateReasonForMatches(result):
481 """A wrapper function.
482
483 It generates reasons for the matches and returns a string representation
484 of the filtered results.
485
486 Args:
487 result: A list of match objects.
488
489 Returns:
490 A string representation of filtered results.
491 """
492 new_result = SortAndFilterMatches(result)
493 GenerateReasonForMatches(new_result)
494 combined_matches = CombineMatches(new_result)
495 return crash_utils.MatchListToResultList(combined_matches)
496
497
498 def ParseCrashComponents(main_stack, top_n_frames=3):
499 """Parses the crashing component.
500
501 Args:
502 main_stack: Main stack from the stacktrace.
503 top_n_frames: The number of top frames to regard as crashing frames.
504 Returns:
505 A set of components.
506 """
507 frame_list = main_stack.GetTopNFrames(top_n_frames)
508 components = set()
509
510 for frame in frame_list:
511 components.add(frame.component_path)
512
513 return components
514
515
516 def GenerateAndFilterBlameList(callstack, crash_revision_dict, regression_dict):
517 """A wrapper function.
518
519 Finds blame information for the stack and returns a list of blame results.
520
521 Args:
522 callstack: A callstack to find the blame information.
523 crash_revision_dict: A dictionary mapping component to its crash revision.
524 regression_dict: A dictionary mapping component to its regression.
525
526 Returns:
527 A list of blame results.
528 """
529 # Set up parser objects to use for parsing blame information.
530 svn_parser = svn_repository_parser.SVNParser(URL_MAP['svn'])
531 git_parser = git_repository_parser.GitParser(regression_dict, URL_MAP['git'])
532 parsers = [svn_parser, git_parser]
533
534 # Create and generate the blame objects from the callstack.
535 blame_list = blame.BlameList()
536 blame_list.FindBlame(callstack, crash_revision_dict, regression_dict,
537 parsers)
538
539 blame_list.FilterAndSortBlameList()
540 return crash_utils.BlameListToResultList(blame_list)
541
542
543 def FindItForCrash(stacktrace_list,
544 callstack,
545 parsed_regression,
546 parsed_crash_revision):
547 """Finds the culprit CL from the list of stacktrace.
548
549 Args:
550 stacktrace_list: A list of stacktraces to look for, in the order of
551 decreasing significance.
552 callstack: A callstack object to show blame information for, if there are
553 no results for all stacktraces in the stacktrace_list.
554 parsed_regression: Parsed regression information as a result of parsing
555 the DEPS file.
556 parsed_crash_revision: Parsed crash revision information.
557
558 Returns:
559 A list of result objects.
560 """
561 # If regression information is not available, return blame information.
562 if not parsed_regression:
563 return GenerateAndFilterBlameList(callstack, parsed_crash_revision,
564 parsed_regression)
565
566 for stacktrace in stacktrace_list:
567
stgao 2014/08/14 18:25:42 This empty line could be removed.
jeun 2014/08/14 20:54:12 Done.
568 # Check the next stacktrace if the current one is empty.
569 if not stacktrace.stack_list:
570 continue
571
572 # Get the crash stack for this stacktrace, and extract crashing components
573 # from it.
574 main_stack = stacktrace.GetCrashStack()
575 components = ParseCrashComponents(main_stack)
576
577 result_for_stacktrace = FindMatchForStacktrace(stacktrace, components,
578 parsed_regression)
579
580 # If the result is empty, check the next stacktrace. Else, return the
581 # filtered result.
582 if not result_for_stacktrace:
583 continue
584
585 return FilterAndGenerateReasonForMatches(result_for_stacktrace)
586
587 # If no match is found, return the blame information for the input
588 # callstack.
589 return GenerateAndFilterBlameList(callstack, parsed_crash_revision,
590 parsed_regression)
591
