Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(75)

Side by Side Diff: appengine/findit/waterfall/extract_signal_pipeline.py

Issue 1149743002: [Findit] Use step level analysis to exclude flaky test failures. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import logging 5 import logging
6 import json
6 7
7 from google.appengine.api.urlfetch import ResponseTooLargeError 8 from google.appengine.api.urlfetch import ResponseTooLargeError
8 9
9 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline 10 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline
10 11
11 from common.http_client_appengine import HttpClientAppengine as HttpClient 12 from common.http_client_appengine import HttpClientAppengine as HttpClient
12 from model.wf_step import WfStep 13 from model.wf_step import WfStep
13 from waterfall import buildbot 14 from waterfall import buildbot
14 from waterfall import extractors 15 from waterfall import extractors
15 from waterfall import lock_util 16 from waterfall import lock_util
(...skipping 22 matching lines...) Expand all
38 39
39 lines = log_data.split('\n') 40 lines = log_data.split('\n')
40 size = 0 41 size = 0
41 for line_index in reversed(range(len(lines))): 42 for line_index in reversed(range(len(lines))):
42 size += len(lines[line_index]) + 1 43 size += len(lines[line_index]) + 1
43 if size > ExtractSignalPipeline.LOG_DATA_BYTE_LIMIT: 44 if size > ExtractSignalPipeline.LOG_DATA_BYTE_LIMIT:
44 return '\n'.join(lines[line_index + 1:]) 45 return '\n'.join(lines[line_index + 1:])
45 else: 46 else:
46 return log_data # pragma: no cover - this won't be reached. 47 return log_data # pragma: no cover - this won't be reached.
47 48
49 @staticmethod
50 def _GetTestLevelFailures(step_log):
51 """Analyze the step log and extract reliable failures only.
52
53 Args:
54 step_log (file): A JSON file for failed step log.
55
56 Returns:
57 A dict like below:
58 {
59 'test_name1': [
60 {
61 "elapsed_time_ms": ..,
62 "losless_snippet": ..,
63 "output_snippet": ..,
64 "status": "FAILURE",
65 "output_snippet_base64":..
66 },
67 ..
68 ],
69 ..
70 }
71 """
72 failed_test_log = {}
73 step_failure_data = json.loads(step_log)
74
75 for iteration in step_failure_data['gtest_results']['per_iteration_data']:
76 for key in iteration.keys(): # Keys are test names.
77 is_reliable_failure = True
78
79 for test in iteration[key]:
80 # We will ignore the test if one of the attempts passes.
81 if test['status'] != 'FAILURE':
stgao 2015/05/21 00:29:56 This code doesn't match the comment. I think statu
82 is_reliable_failure = False
83 break
84
85 if is_reliable_failure: # All attempts failed, it's a reliable failure.
86 failed_test_log[key] = iteration[key]
87
88 return failed_test_log
48 89
# Arguments number differs from overridden method - pylint: disable=W0221
def run(self, failure_info):
  """Extracts failure signals for each failed step of the given build.

  For each failed step, the log is looked up in the datastore first to avoid
  re-downloading; otherwise it is pulled (archived gtest results from Google
  Storage when available, falling back to the step's stdio from the master),
  cached in a WfStep entity, and fed to the signal extractor.

  Args:
    failure_info (dict): Output of pipeline DetectFirstFailurePipeline.run().

  Returns:
    A dict like below:
    {
      'step_name1': waterfall.failure_signal.FailureSignal.ToJson(),
      ...
    }
  """
  signals = {}

  master_name = failure_info['master_name']
  builder_name = failure_info['builder_name']
  build_number = failure_info['build_number']
  for step_name in failure_info.get('failed_steps', []):
    step = WfStep.Get(master_name, builder_name, build_number, step_name)
    if step and step.log_data:
      # Reuse the log cached by a previous run of this pipeline.
      test_failure_log = step.log_data
    else:
      # Throttle concurrent downloads to avoid overloading the master.
      if not lock_util.WaitUntilDownloadAllowed(
          master_name):  # pragma: no cover
        raise pipeline.Retry('Failed to pull log of step %s of master %s'
                             % (step_name, master_name))

      # TODO: Add test level log info to signal.
      step_log = buildbot.GetGsStepLog(
          master_name, builder_name, build_number, step_name)
      if step_log:
        # Archived gtest results are available: keep only reliable
        # (non-flaky) test failures.
        test_failure_log = str(self._GetTestLevelFailures(step_log))
      else:
        try:
          test_failure_log = buildbot.GetStepStdio(
              master_name, builder_name, build_number, step_name,
              self.HTTP_CLIENT)
        except ResponseTooLargeError:  # pragma: no cover.
          logging.exception(
              'Log of step "%s" is too large for urlfetch.', step_name)
          # If the stdio log of a step is too large, we don't want to pull
          # it again in next run, because that might lead to DDoS to the
          # master.
          test_failure_log = 'Stdio log is too large for urlfetch.'

      if not test_failure_log:  # pragma: no cover
        raise pipeline.Retry('Failed to pull stdio of step %s of master %s'
                             % (step_name, master_name))

      # Save step log in datastore and avoid downloading again during retry.
      if not step:  # pragma: no cover
        step = WfStep.Create(
            master_name, builder_name, build_number, step_name)

      step.log_data = self._ExtractStorablePortionOfLog(test_failure_log)

      try:
        step.put()
      except Exception as e:  # pragma: no cover
        # Sometimes, the step log is too large to save in datastore.
        logging.exception(e)

    # TODO: save result in datastore?
    signals[step_name] = extractors.ExtractSignal(
        master_name, builder_name, step_name, None,
        str(test_failure_log)).ToJson()

  return signals
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698