appengine/findit/waterfall/extract_signal_pipeline.py - Issue 1149743002: [Findit] Use step level analysis to exclude flaky test failures.

Side by Side Diff: appengine/findit/waterfall/extract_signal_pipeline.py

Issue 1149743002: [Findit] Use step level analysis to exclude flaky test failures. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Fixed several small issues based on comments. Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« appengine/findit/waterfall/buildbot.py ('K') | « appengine/findit/waterfall/buildbot.py ('k') | appengine/findit/waterfall/test/buildbot_test.py » ('j') | appengine/findit/waterfall/test/buildbot_test.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

	5 import cStringIO

5 import logging	6 import logging

	7 import json

6	8

7 from google.appengine.api.urlfetch import ResponseTooLargeError	9 from google.appengine.api.urlfetch import ResponseTooLargeError

8	10

9 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline	11 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline

10	12

11 from common.http_client_appengine import HttpClientAppengine as HttpClient	13 from common.http_client_appengine import HttpClientAppengine as HttpClient

12 from model.wf_step import WfStep	14 from model.wf_step import WfStep

13 from waterfall import buildbot	15 from waterfall import buildbot

14 from waterfall import extractors	16 from waterfall import extractors

15 from waterfall import lock_util	17 from waterfall import lock_util

(...skipping 22 matching lines...) Expand all Loading...
38	40

39 lines = log_data.split('\n')	41 lines = log_data.split('\n')

40 size = 0	42 size = 0

41 for line_index in reversed(range(len(lines))):	43 for line_index in reversed(range(len(lines))):

42 size += len(lines[line_index]) + 1	44 size += len(lines[line_index]) + 1

43 if size > ExtractSignalPipeline.LOG_DATA_BYTE_LIMIT:	45 if size > ExtractSignalPipeline.LOG_DATA_BYTE_LIMIT:

44 return '\n'.join(lines[line_index + 1:])	46 return '\n'.join(lines[line_index + 1:])

45 else:	47 else:

46 return log_data # pragma: no cover - this won't be reached.	48 return log_data # pragma: no cover - this won't be reached.

47	49

	50 @staticmethod

	51 def _GetReliableTestFailureLog(gtest_result):

	52 """Analyze the archived step log and extract reliable failures only.
	stgao 2015/05/22 22:49:43 To be accurate, it is 'archived gtest json results'. For 'archived' step log, it is the whole stdio log of a step. Gtest json results is a subset of it and in json format. chanli 2015/05/22 23:26:32 On 2015/05/22 22:49:43, Shuotao wrote: > To be accurate, it is 'archived gtest json results'. > > For 'archived' step log, it is the whole stdio log of a step. Gtest json results > is a subset of it and in json format. Done.
	53

	54 Args:

	55 gtest_result (str): A JSON file for failed step log.

	56

	57 Returns:

	58 A string contains the names of reliable test failures and output_snippets.
	stgao 2015/05/22 22:49:43 'output_snippets' is a detail about the gtest output. So it's better to hide detail from the user of this function. chanli 2015/05/22 23:26:32 On 2015/05/22 22:49:43, Shuotao wrote: > 'output_snippets' is a detail about the gtest output. So it's better to hide > detail from the user of this function. Done.
	59 """

	60 step_failure_data = json.loads(gtest_result)

	61 sio = cStringIO.StringIO()

	62 for iteration in step_failure_data['gtest_results']['per_iteration_data']:

	63 for test_name in iteration.keys():

	64 is_reliable_failure = True

	65

	66 for test in iteration[test_name]:
	stgao 2015/05/22 22:49:42 test->test_run? chanli 2015/05/22 23:26:32 On 2015/05/22 22:49:42, Shuotao wrote: > test->test_run? Done.
	67 # We will ignore the test if some of the attempts were success.

	68 if test['status'] == 'SUCCESS':

	69 is_reliable_failure = False

	70 break

	71

	72 if is_reliable_failure: # all attempts failed, it's a reliable failure
	stgao 2015/05/22 22:49:43 Still not correct, please check the google python style. chanli 2015/05/22 23:26:32 On 2015/05/22 22:49:43, Shuotao wrote: > Still not correct, please check the google python style. Done.
	73 for test in iteration[test_name]:

	74 sio.write("'%s': %s\n" % (test_name, test['output_snippet']))

	75

	76 failed_test_log = sio.getvalue()

	77 sio.close()

	78

	79 return failed_test_log

48	80

49 # Arguments number differs from overridden method - pylint: disable=W0221	81 # Arguments number differs from overridden method - pylint: disable=W0221

50 def run(self, failure_info):	82 def run(self, failure_info):

51 """	83 """

52 Args:	84 Args:

53 failure_info (dict): Output of pipeline DetectFirstFailurePipeline.run().	85 failure_info (dict): Output of pipeline DetectFirstFailurePipeline.run().

54	86

55 Returns:	87 Returns:

56 A dict like below:	88 A dict like below:

57 {	89 {

58 'step_name1': waterfall.failure_signal.FailureSignal.ToDict(),	90 'step_name1': waterfall.failure_signal.FailureSignal.ToDict(),

59 ...	91 ...

60 }	92 }

61 """	93 """

62 signals = {}	94 signals = {}

63	95

64 master_name = failure_info['master_name']	96 master_name = failure_info['master_name']

65 builder_name = failure_info['builder_name']	97 builder_name = failure_info['builder_name']

66 build_number = failure_info['build_number']	98 build_number = failure_info['build_number']

67 for step_name in failure_info.get('failed_steps', []):	99 for step_name in failure_info.get('failed_steps', []):

68 step = WfStep.Get(master_name, builder_name, build_number, step_name)	100 step = WfStep.Get(master_name, builder_name, build_number, step_name)

69 if step and step.log_data:	101 if step and step.log_data:

70 stdio_log = step.log_data	102 failure_log = step.log_data

71 else:	103 else:

72 if not lock_util.WaitUntilDownloadAllowed(	104 # TODO: do test-level analysis instead of step-level.

73 master_name): # pragma: no cover	105 gtest_result = buildbot.GetGtestResultLog(

74 raise pipeline.Retry('Failed to pull stdio of step %s of master %s'	106 master_name, builder_name, build_number, step_name)

75 % (step_name, master_name))	107 if gtest_result:
	stgao 2015/05/22 22:49:43 https://code.google.com/p/chromium/codesearch#chromium/build/scripts/slave/ru... We need to fall back to stdio log if the gtest result is invalid. You may think about where the check should be added. chanli 2015/05/22 23:26:32 On 2015/05/22 22:49:43, Shuotao wrote: > https://code.google.com/p/chromium/codesearch#chromium/build/scripts/slave/ru... > > We need to fall back to stdio log if the gtest result is invalid. > You may think about where the check should be added. Done.
	108 failure_log = self._GetReliableTestFailureLog(gtest_result)

	109 else:

	110 if not lock_util.WaitUntilDownloadAllowed(

	111 master_name): # pragma: no cover

	112 raise pipeline.Retry('Failed to pull log of step %s of master %s'

	113 % (step_name, master_name))

	114 try:

	115 failure_log = buildbot.GetStepStdio(

	116 master_name, builder_name, build_number, step_name,

	117 self.HTTP_CLIENT)

	118 except ResponseTooLargeError: # pragma: no cover.

	119 logging.exception(

	120 'Log of step "%s" is too large for urlfetch.', step_name)

	121 # If the stdio log of a step is too large, we don't want to pull it

	122 # again in next run, because that might lead to DDoS to the master.

	123 failure_log = 'Stdio log is too large for urlfetch.'

76	124

77 # TODO: do test-level analysis instead of step-level.	125 if not failure_log: # pragma: no cover

78 try:	126 raise pipeline.Retry('Failed to pull stdio of step %s of master %s'

79 stdio_log = buildbot.GetStepStdio(	127 % (step_name, master_name))

80 master_name, builder_name, build_number, step_name,

81 self.HTTP_CLIENT)

82 except ResponseTooLargeError: # pragma: no cover.

83 logging.exception(

84 'Log of step "%s" is too large for urlfetch.', step_name)

85 # If the stdio log of a step is too large, we don't want to pull it

86 # again in next run, because that might lead to DDoS to the master.

87 # TODO: Use archived stdio logs in Google Storage instead.

88 stdio_log = 'Stdio log is too large for urlfetch.'

89	128

90 if not stdio_log: # pragma: no cover	129 # Save step log in datastore and avoid downloading again during retry.

91 raise pipeline.Retry('Failed to pull stdio of step %s of master %s'

92 % (step_name, master_name))

93

94 # Save stdio in datastore and avoid downloading again during retry.

95 if not step: # pragma: no cover	130 if not step: # pragma: no cover

96 step = WfStep.Create(	131 step = WfStep.Create(

97 master_name, builder_name, build_number, step_name)	132 master_name, builder_name, build_number, step_name)

98	133

99 step.log_data = self._ExtractStorablePortionOfLog(stdio_log)	134 step.log_data = self._ExtractStorablePortionOfLog(failure_log)

	135

100 try:	136 try:

101 step.put()	137 step.put()

102 except Exception as e: # pragma: no cover	138 except Exception as e: # pragma: no cover

103 # Sometimes, the stdio log is too large to save in datastore.	139 # Sometimes, the step log is too large to save in datastore.

104 logging.exception(e)	140 logging.exception(e)

105	141

106 # TODO: save result in datastore?	142 # TODO: save result in datastore?

107 signals[step_name] = extractors.ExtractSignal(	143 signals[step_name] = extractors.ExtractSignal(

108 master_name, builder_name, step_name, None, stdio_log).ToDict()	144 master_name, builder_name, step_name, None,

	145 failure_log).ToDict()

109	146

110 return signals	147 return signals

OLD	NEW