appengine/findit/waterfall/extract_signal_pipeline.py - Issue 1149743002: [Findit] Use step level analysis to exclude flaky test failures.

Side by Side Diff: appengine/findit/waterfall/extract_signal_pipeline.py

Issue 1149743002: [Findit] Use step level analysis to exclude flaky test failures. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Use cStringIO to pull the reliable test failures. Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« appengine/findit/waterfall/buildbot.py ('K') | « appengine/findit/waterfall/buildbot.py ('k') | appengine/findit/waterfall/test/buildbot_test.py » ('j') | appengine/findit/waterfall/test/buildbot_test.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

	5 import cStringIO

5 import logging	6 import logging

	7 import json

6	8

7 from google.appengine.api.urlfetch import ResponseTooLargeError	9 from google.appengine.api.urlfetch import ResponseTooLargeError

8	10

9 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline	11 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline

10	12

11 from common.http_client_appengine import HttpClientAppengine as HttpClient	13 from common.http_client_appengine import HttpClientAppengine as HttpClient

12 from model.wf_step import WfStep	14 from model.wf_step import WfStep

13 from waterfall import buildbot	15 from waterfall import buildbot

14 from waterfall import extractors	16 from waterfall import extractors

15 from waterfall import lock_util	17 from waterfall import lock_util

(...skipping 22 matching lines...) Expand all Loading...
38	40

39 lines = log_data.split('\n')	41 lines = log_data.split('\n')

40 size = 0	42 size = 0

41 for line_index in reversed(range(len(lines))):	43 for line_index in reversed(range(len(lines))):

42 size += len(lines[line_index]) + 1	44 size += len(lines[line_index]) + 1

43 if size > ExtractSignalPipeline.LOG_DATA_BYTE_LIMIT:	45 if size > ExtractSignalPipeline.LOG_DATA_BYTE_LIMIT:

44 return '\n'.join(lines[line_index + 1:])	46 return '\n'.join(lines[line_index + 1:])

45 else:	47 else:

46 return log_data # pragma: no cover - this won't be reached.	48 return log_data # pragma: no cover - this won't be reached.

47	49

	50 @staticmethod

	51 def _GetTestLevelFailures(gtest_result):
	stgao 2015/05/22 01:30:37 Maybe rename it to _GetReliableTestFailureLog? or FailureOutput?
	52 """Analyze the step log and extract reliable failures only.
	stgao 2015/05/22 01:30:37 not step log.
	chanli 2015/05/22 18:43:27 On 2015/05/22 01:30:37, Shuotao wrote: > not step log. I think it actually is the 'archived' step log, isn't it?
	53

	54 Args:

	55 gtest_result (file): A JSON file for failed step log.
	stgao 2015/05/22 01:30:37 This is not a file, it is a string in JSON format.
	56

	57 Returns:

	58 A string contains the names of reliable test failures and output_snippets.

	59 """

	60
	stgao 2015/05/22 01:30:37 no empty line here.
	61 step_failure_data = json.loads(gtest_result)

	62 sio = cStringIO.StringIO()

	63 for iteration in step_failure_data['gtest_results']['per_iteration_data']:

	64 for key in iteration.keys(): # Keys are test names.
	stgao 2015/05/22 01:30:37 key -> test_name. That makes the code clearer.
	65 is_reliable_failure = True

	66

	67 for test in iteration[key]:

	68 # We will ignore the test if some of the attempts were success.

	69 if test['status'] == 'SUCCESS':

	70 is_reliable_failure = False

	71 break

	72

	73 if is_reliable_failure: # All attempts failed, it's a reliable failure.
	stgao 2015/05/22 01:30:37 comment style.
	74 for test in iteration[key]:

	75 sio.write("'%s': %s\n" % (key, test['output_snippet']))

	76

	77 failed_test_log = sio.getvalue()

	78 sio.close()

	79

	80 return failed_test_log

48	81

49 # Arguments number differs from overridden method - pylint: disable=W0221	82 # Arguments number differs from overridden method - pylint: disable=W0221

50 def run(self, failure_info):	83 def run(self, failure_info):

51 """	84 """

52 Args:	85 Args:

53 failure_info (dict): Output of pipeline DetectFirstFailurePipeline.run().	86 failure_info (dict): Output of pipeline DetectFirstFailurePipeline.run().

54	87

55 Returns:	88 Returns:

56 A dict like below:	89 A dict like below:

57 {	90 {

58 'step_name1': waterfall.failure_signal.FailureSignal.ToDict(),	91 'step_name1': waterfall.failure_signal.FailureSignal.ToDict(),

59 ...	92 ...

60 }	93 }

61 """	94 """

62 signals = {}	95 signals = {}

63	96

64 master_name = failure_info['master_name']	97 master_name = failure_info['master_name']

65 builder_name = failure_info['builder_name']	98 builder_name = failure_info['builder_name']

66 build_number = failure_info['build_number']	99 build_number = failure_info['build_number']

67 for step_name in failure_info.get('failed_steps', []):	100 for step_name in failure_info.get('failed_steps', []):

68 step = WfStep.Get(master_name, builder_name, build_number, step_name)	101 step = WfStep.Get(master_name, builder_name, build_number, step_name)

69 if step and step.log_data:	102 if step and step.log_data:

70 stdio_log = step.log_data	103 test_failure_log = step.log_data
	stgao 2015/05/22 01:30:37 failure_log? Because it could be a compile step too, not just a test step.
71 else:	104 else:

72 if not lock_util.WaitUntilDownloadAllowed(	105 # TODO: add test level log info to signal.

73 master_name): # pragma: no cover	106 gtest_result = buildbot.GetGtestResultLog(

74 raise pipeline.Retry('Failed to pull stdio of step %s of master %s'	107 master_name, builder_name, build_number, step_name)

75 % (step_name, master_name))	108 if gtest_result:

	109 test_failure_log = self._GetTestLevelFailures(gtest_result)

	110 else:

	111 if not lock_util.WaitUntilDownloadAllowed(

	112 master_name): # pragma: no cover

	113 raise pipeline.Retry('Failed to pull log of step %s of master %s'

	114 % (step_name, master_name))

	115 try:

	116 test_failure_log = buildbot.GetStepStdio(

	117 master_name, builder_name, build_number, step_name,

	118 self.HTTP_CLIENT)

	119 except ResponseTooLargeError: # pragma: no cover.

	120 logging.exception(

	121 'Log of step "%s" is too large for urlfetch.', step_name)

	122 # If the stdio log of a step is too large, we don't want to pull it

	123 # again in next run, because that might lead to DDoS to the master.
	stgao 2015/05/22 01:30:37 Please keep the original TODO here. See my last comment.
	124 test_failure_log = 'Stdio log is too large for urlfetch.'

76	125

77 # TODO: do test-level analysis instead of step-level.	126 if not test_failure_log: # pragma: no cover

78 try:	127 raise pipeline.Retry('Failed to pull stdio of step %s of master %s'
	Sharu Jiang 2015/05/22 17:54:30 I think if we fail to get a result from self._GetTestLevelFailures, test_failure_log may also be empty; should we check that?
	chanli 2015/05/22 18:43:27 On 2015/05/22 17:54:30, sharu jiang wrote: > I think if we fail to get result from self._GetTestLevelFailures, > test_failure_log may also be empty, should we check that? We will go to the stdio log if we failed retrieving data from the archived step log.
	Sharu Jiang 2015/05/22 20:09:36 On 2015/05/22 18:43:27, chanli wrote: > On 2015/05/22 17:54:30, sharu jiang wrote: > > I think if we fail to get result from self._GetTestLevelFailures, > > test_failure_log may also be empty, should we check that? > > We will go to stdio log if we failed retrieving data from archived step log. I mean even though we have gtest_result, we may not get any test_failure_log, for example if all the tests are unreliable.
79 stdio_log = buildbot.GetStepStdio(	128 % (step_name, master_name))

80 master_name, builder_name, build_number, step_name,

81 self.HTTP_CLIENT)

82 except ResponseTooLargeError: # pragma: no cover.

83 logging.exception(

84 'Log of step "%s" is too large for urlfetch.', step_name)

85 # If the stdio log of a step is too large, we don't want to pull it

86 # again in next run, because that might lead to DDoS to the master.

87 # TODO: Use archived stdio logs in Google Storage instead.

88 stdio_log = 'Stdio log is too large for urlfetch.'

89	129

90 if not stdio_log: # pragma: no cover	130 # Save step log in datastore and avoid downloading again during retry.

91 raise pipeline.Retry('Failed to pull stdio of step %s of master %s'

92 % (step_name, master_name))

93

94 # Save stdio in datastore and avoid downloading again during retry.

95 if not step: # pragma: no cover	131 if not step: # pragma: no cover

96 step = WfStep.Create(	132 step = WfStep.Create(

97 master_name, builder_name, build_number, step_name)	133 master_name, builder_name, build_number, step_name)

98	134

99 step.log_data = self._ExtractStorablePortionOfLog(stdio_log)	135 step.log_data = self._ExtractStorablePortionOfLog(test_failure_log)

	136

100 try:	137 try:

101 step.put()	138 step.put()

102 except Exception as e: # pragma: no cover	139 except Exception as e: # pragma: no cover

103 # Sometimes, the stdio log is too large to save in datastore.	140 # Sometimes, the step log is too large to save in datastore.

104 logging.exception(e)	141 logging.exception(e)

105	142

106 # TODO: save result in datastore?	143 # TODO: save result in datastore?

107 signals[step_name] = extractors.ExtractSignal(	144 signals[step_name] = extractors.ExtractSignal(

108 master_name, builder_name, step_name, None, stdio_log).ToDict()	145 master_name, builder_name, step_name, None,

	146 str(test_failure_log)).ToDict()
	stgao 2015/05/22 01:30:37 Why do we need str here? Same reason as you told me?
	chanli 2015/05/22 18:43:27 On 2015/05/22 01:30:37, Shuotao wrote: > Why we need str here? Same reason as you told me? No... I forgot this one. Fixed.
109	147

110 return signals	148 return signals

OLD	NEW