appengine/findit/waterfall/extract_signal_pipeline.py - Issue 1149743002: [Findit] Use step level analysis to exclude flaky test failures.

Side by Side Diff: appengine/findit/waterfall/extract_signal_pipeline.py

Issue 1149743002: [Findit] Use step level analysis to exclude flaky test failures. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Add check for 'gtest_results' being 'invalid'. Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« appengine/findit/waterfall/buildbot.py ('K') | « appengine/findit/waterfall/buildbot.py ('k') | appengine/findit/waterfall/test/buildbot_test.py » ('j') | appengine/findit/waterfall/test/buildbot_test.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

	5 import cStringIO

5 import logging	6 import logging

	7 import json

6	8

7 from google.appengine.api.urlfetch import ResponseTooLargeError	9 from google.appengine.api.urlfetch import ResponseTooLargeError

8	10

9 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline	11 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline

10	12

11 from common.http_client_appengine import HttpClientAppengine as HttpClient	13 from common.http_client_appengine import HttpClientAppengine as HttpClient

12 from model.wf_step import WfStep	14 from model.wf_step import WfStep

13 from waterfall import buildbot	15 from waterfall import buildbot

14 from waterfall import extractors	16 from waterfall import extractors

15 from waterfall import lock_util	17 from waterfall import lock_util

(...skipping 22 matching lines...) Expand all Loading...
38	40

39 lines = log_data.split('\n')	41 lines = log_data.split('\n')

40 size = 0	42 size = 0

41 for line_index in reversed(range(len(lines))):	43 for line_index in reversed(range(len(lines))):

42 size += len(lines[line_index]) + 1	44 size += len(lines[line_index]) + 1

43 if size > ExtractSignalPipeline.LOG_DATA_BYTE_LIMIT:	45 if size > ExtractSignalPipeline.LOG_DATA_BYTE_LIMIT:

44 return '\n'.join(lines[line_index + 1:])	46 return '\n'.join(lines[line_index + 1:])

45 else:	47 else:

46 return log_data # pragma: no cover - this won't be reached.	48 return log_data # pragma: no cover - this won't be reached.

47	49

	50 @staticmethod

	51 def _GetReliableTestFailureLog(gtest_result):

	52 """Analyze the archived gtest json results and extract reliable failures.

	53

	54 Args:

	55 gtest_result (str): A JSON file for failed step log.

	56

	57 Returns:

	58 A string contains the names of reliable test failures and related

	59 log content.

	60 """

	61 step_failure_data = json.loads(gtest_result)

	62 sio = cStringIO.StringIO()
	stgao 2015/05/26 18:14:49
	This could be moved to after the check of 'invalid'.
	chanli 2015/05/26 19:07:38
	On 2015/05/26 18:14:49, Shuotao wrote:
	> This could be moved to after the check of 'invalid'.
	Done.
	63

	64 if step_failure_data['gtest_results'] == 'invalid': # pragma: no cover

	65 return None

	66

	67 for iteration in step_failure_data['gtest_results']['per_iteration_data']:

	68 for test_name in iteration.keys():

	69 is_reliable_failure = True

	70

	71 for test_run in iteration[test_name]:

	72 # We will ignore the test if some of the attempts were success.

	73 if test_run['status'] == 'SUCCESS':

	74 is_reliable_failure = False

	75 break

	76

	77 if is_reliable_failure: # all attempts failed

	78 for test_run in iteration[test_name]:

	79 sio.write("'%s': %s\n" % (test_name, test_run['output_snippet']))

	80

	81 failed_test_log = sio.getvalue()

	82 sio.close()

	83

	84 return failed_test_log

48	85

49 # Arguments number differs from overridden method - pylint: disable=W0221	86 # Arguments number differs from overridden method - pylint: disable=W0221

50 def run(self, failure_info):	87 def run(self, failure_info):

51 """	88 """

52 Args:	89 Args:

53 failure_info (dict): Output of pipeline DetectFirstFailurePipeline.run().	90 failure_info (dict): Output of pipeline DetectFirstFailurePipeline.run().

54	91

55 Returns:	92 Returns:

56 A dict like below:	93 A dict like below:

57 {	94 {

58 'step_name1': waterfall.failure_signal.FailureSignal.ToDict(),	95 'step_name1': waterfall.failure_signal.FailureSignal.ToDict(),

59 ...	96 ...

60 }	97 }

61 """	98 """

62 signals = {}	99 signals = {}

63	100

64 master_name = failure_info['master_name']	101 master_name = failure_info['master_name']

65 builder_name = failure_info['builder_name']	102 builder_name = failure_info['builder_name']

66 build_number = failure_info['build_number']	103 build_number = failure_info['build_number']

67 for step_name in failure_info.get('failed_steps', []):	104 for step_name in failure_info.get('failed_steps', []):

68 step = WfStep.Get(master_name, builder_name, build_number, step_name)	105 step = WfStep.Get(master_name, builder_name, build_number, step_name)

69 if step and step.log_data:	106 if step and step.log_data:

70 stdio_log = step.log_data	107 failure_log = step.log_data

71 else:	108 else:

72 if not lock_util.WaitUntilDownloadAllowed(	109 # TODO: do test-level analysis instead of step-level.

73 master_name): # pragma: no cover	110 gtest_result = buildbot.GetGtestResultLog(

74 raise pipeline.Retry('Failed to pull stdio of step %s of master %s'	111 master_name, builder_name, build_number, step_name)

75 % (step_name, master_name))	112 if gtest_result:

	113 failure_log = self._GetReliableTestFailureLog(gtest_result)
	stgao 2015/05/26 18:14:49
	If the gtest_result is 'invalid', we should fallback to stdio instead. However, the code here seems not like that.
	chanli 2015/05/26 19:07:38
	On 2015/05/26 18:14:49, Shuotao wrote:
	> If the gtest_result is 'invalid', we should fallback to stdio instead.
	> However, the code here seems not like that.
	Now we will both get None as return if we get 'invalid' as gtest_result, or if all the failures are flaky.
	I'm thinking maybe I can return different messages and only fallback to stdio if we get 'invalid'?
	stgao 2015/05/26 19:17:11
	On 2015/05/26 19:07:38, chanli wrote:
	> On 2015/05/26 18:14:49, Shuotao wrote:
	> > If the gtest_result is 'invalid', we should fallback to stdio instead.
	> > However, the code here seems not like that.
	>
	> Now we will both get None as return if we get 'invalid' as gtest_result, or if all the failures are flaky.
	>
	> I'm thinking maybe I can return different messages and only fallback to stdio if we get 'invalid'?
	Sounds good to me.
	chanli 2015/05/26 23:33:41
	On 2015/05/26 19:17:11, Shuotao wrote:
	> On 2015/05/26 19:07:38, chanli wrote:
	> > On 2015/05/26 18:14:49, Shuotao wrote:
	> > > If the gtest_result is 'invalid', we should fallback to stdio instead.
	> > > However, the code here seems not like that.
	> >
	> > Now we will both get None as return if we get 'invalid' as gtest_result, or if all the failures are flaky.
	> >
	> > I'm thinking maybe I can return different messages and only fallback to stdio if we get 'invalid'?
	>
	> Sounds good to me.
	Done.
	114 else:

	115 if not lock_util.WaitUntilDownloadAllowed(

	116 master_name): # pragma: no cover

	117 raise pipeline.Retry('Failed to pull log of step %s of master %s'

	118 % (step_name, master_name))

	119 try:

	120 failure_log = buildbot.GetStepStdio(

	121 master_name, builder_name, build_number, step_name,

	122 self.HTTP_CLIENT)

	123 except ResponseTooLargeError: # pragma: no cover.

	124 logging.exception(

	125 'Log of step "%s" is too large for urlfetch.', step_name)

	126 # If the stdio log of a step is too large, we don't want to pull it

	127 # again in next run, because that might lead to DDoS to the master.

	128 # TODO: Use archived stdio logs in Google Storage instead.

	129 failure_log = 'Stdio log is too large for urlfetch.'

76	130

77 # TODO: do test-level analysis instead of step-level.	131 if not failure_log: # pragma: no cover

78 try:	132 raise pipeline.Retry('Failed to pull stdio of step %s of master %s'

79 stdio_log = buildbot.GetStepStdio(	133 % (step_name, master_name))

80 master_name, builder_name, build_number, step_name,

81 self.HTTP_CLIENT)

82 except ResponseTooLargeError: # pragma: no cover.

83 logging.exception(

84 'Log of step "%s" is too large for urlfetch.', step_name)

85 # If the stdio log of a step is too large, we don't want to pull it

86 # again in next run, because that might lead to DDoS to the master.

87 # TODO: Use archived stdio logs in Google Storage instead.

88 stdio_log = 'Stdio log is too large for urlfetch.'

89	134

90 if not stdio_log: # pragma: no cover	135 # Save step log in datastore and avoid downloading again during retry.

91 raise pipeline.Retry('Failed to pull stdio of step %s of master %s'

92 % (step_name, master_name))

93

94 # Save stdio in datastore and avoid downloading again during retry.

95 if not step: # pragma: no cover	136 if not step: # pragma: no cover

96 step = WfStep.Create(	137 step = WfStep.Create(

97 master_name, builder_name, build_number, step_name)	138 master_name, builder_name, build_number, step_name)

98	139

99 step.log_data = self._ExtractStorablePortionOfLog(stdio_log)	140 step.log_data = self._ExtractStorablePortionOfLog(failure_log)

	141

100 try:	142 try:

101 step.put()	143 step.put()

102 except Exception as e: # pragma: no cover	144 except Exception as e: # pragma: no cover

103 # Sometimes, the stdio log is too large to save in datastore.	145 # Sometimes, the step log is too large to save in datastore.

104 logging.exception(e)	146 logging.exception(e)

105	147

106 # TODO: save result in datastore?	148 # TODO: save result in datastore?

107 signals[step_name] = extractors.ExtractSignal(	149 signals[step_name] = extractors.ExtractSignal(

108 master_name, builder_name, step_name, None, stdio_log).ToDict()	150 master_name, builder_name, step_name, None,

	151 failure_log).ToDict()

109	152

110 return signals	153 return signals

OLD	NEW