appengine/findit/waterfall/monitor_try_job_pipeline.py - Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline

Side by Side Diff: appengine/findit/waterfall/monitor_try_job_pipeline.py

Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Addressing comments Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« appengine/findit/model/try_job_error.py ('K') | « appengine/findit/model/wf_try_job_data.py ('k') | appengine/findit/waterfall/test/monitor_try_job_pipeline_test.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 from datetime import datetime	5 from datetime import datetime

	6 import json

	7 import re

6 import time	8 import time

7	9

8 from common.pipeline_wrapper import BasePipeline	10 from common.pipeline_wrapper import BasePipeline

9 from common.pipeline_wrapper import pipeline	11 from common.pipeline_wrapper import pipeline

10 from common.waterfall import buildbucket_client	12 from common.waterfall import buildbucket_client

11 from common.waterfall.buildbucket_client import BuildbucketBuild	13 from common.waterfall.buildbucket_client import BuildbucketBuild

12 from model import analysis_status	14 from model import analysis_status

	15 from model import try_job_error

13 from model.wf_try_job import WfTryJob	16 from model.wf_try_job import WfTryJob

14 from model.wf_try_job_data import WfTryJobData	17 from model.wf_try_job_data import WfTryJobData

15 from waterfall import waterfall_config	18 from waterfall import waterfall_config

16 from waterfall.try_job_type import TryJobType	19 from waterfall.try_job_type import TryJobType

17	20

18	21

19 class MonitorTryJobPipeline(BasePipeline):	22 class MonitorTryJobPipeline(BasePipeline):

20 """A pipeline for monitoring a try job and recording results when it's done.	23 """A pipeline for monitoring a try job and recording results when it's done.

21	24

22 The result will be stored to compile_results or test_results according to	25 The result will be stored to compile_results or test_results according to

23 which type of build failure we are running try job for.	26 which type of build failure we are running try job for.

24 """	27 """

25	28

26 TIMEOUT = 'TIMEOUT'	29 UNKNOWN = 'UNKNOWN'

27	30

28 @staticmethod	31 @staticmethod

29 def _MicrosecondsToDatetime(microseconds):	32 def _MicrosecondsToDatetime(microseconds):

30 """Returns a datetime given the number of microseconds, or None."""	33 """Returns a datetime given the number of microseconds, or None."""

31 if microseconds:	34 if microseconds:

32 return datetime.utcfromtimestamp(float(microseconds) / 1000000)	35 return datetime.utcfromtimestamp(float(microseconds) / 1000000)

33 return None	36 return None

34	37

35 @staticmethod	38 @staticmethod

36 def _GetError(buildbucket_error, timed_out):	39 def _GetError(buildbucket_response, buildbucket_error, timed_out):

37 # TODO(lijeffrey): Currently only timeouts (Findit abandoned monitoring the	40 """Determines whether or not a try job error occurred.

38 # try job after waiting too long for it to complete) and errors reported	41

39 # directly in the buildbucket_client request are captured. Several other	42 Args:

40 # failures can be derrived from the response in the build too which should	43 buildbucket_response: A dict of the json response from buildbucket.

41 # be determined here.	44 buildbucket_error: A BuildBucketError object returned from the call to

	45 buildbucket_client.GetTryJobs()

	46 timed_out: A bool whether or not Findit abandoned monitoring the try job.

	47

	48 Returns:

	49 A tuple containing an error dict and number representing an error code, or

	50 (None, None) if no error was determined to have occurred.

	51 """

	52

42 if buildbucket_error:	53 if buildbucket_error:

43 return {	54 return (

44 'message': buildbucket_error.message,	55 {

45 'reason': buildbucket_error.reason	56 'message': buildbucket_error.message,

46 }	57 'reason': buildbucket_error.reason

	58 },

	59 try_job_error.BUILDBUCKET_ERROR)

47	60

48 if timed_out:	61 if timed_out:

49 return {	62 return (

50 'message': 'Try job monitoring was abandoned.',	63 {

51 'reason': MonitorTryJobPipeline.TIMEOUT	64 'message': 'Try job monitoring was abandoned.',

52 }	65 'reason': 'Timeout after %s hours' % (

	66 waterfall_config.GetTryJobSettings().get('job_timeout_hours'))

	67 },

	68 try_job_error.TIMEOUT)

	69

	70 if buildbucket_response:

	71 # If there is no explicit timeout or reason specified, check the last

	72 # build response for errors.

	73 result_details_json = json.loads(

	74 buildbucket_response.get('result_details_json', '{}')) or {}

	75

	76 # Check result_details_json for any obvious errors.

	77 error = result_details_json.get('error', {})

	78 if error:

	79 root_cause = error.get('message')

	80 if root_cause:

	81 error_dict = {

	82 'message': 'Try job could not be triggered.',

	83 'reason': root_cause

	84 }

	85 trybot_not_found_pattern = re.compile(r'Builder [^\s-]+ not found')

	86

	87 if trybot_not_found_pattern.match(root_cause):

	88 error_code = try_job_error.TRYBOT_NOT_FOUND
	stgao 2016/04/25 23:44:39: Why we want to handle this case specially?
	lijeffrey 2016/04/26 00:50:08: On 2016/04/25 23:44:39, stgao wrote: > Why we want to handle this case specially? This case should be rare and never occur once it's detected and the config updated. I've renamed it BUILDBUCKET_REPORTED_ERROR for cases where the error is reported in the result_details_json, since there may be other types of failures that are reported here.
	89 else:

	90 error_code = try_job_error.UNKNOWN

	91

	92 return error_dict, error_code

	93

	94 return (

	95 {

	96 'message': 'Try job error was detected.',

	97 'reason': MonitorTryJobPipeline.UNKNOWN

	98 },

	99 try_job_error.UNKNOWN)

	100

	101 # Check the report to see if anything went wrong.

	102 report = result_details_json.get('properties', {}).get('report')

	103 if report:

	104 if ('infra_failed' in report.get('result', {}).itervalues() or

	105 report.get('metadata', {}).get('infra_failure')):
	stgao 2016/04/25 23:44:39: It is not added to findit/chromium/test.py yet.
	stgao 2016/04/25 23:44:39: Why we need to check both?
	lijeffrey 2016/04/26 00:50:08: On 2016/04/25 23:44:39, stgao wrote: > Why we need to check both? Done. Good point, I think just metadata should be sufficient which should account for test.py once the change is made to add it to that recipe.
	106 # Check for any infra issues caught by the recipe.

	107 return (

	108 {

	109 'message': ('Try job encountered an infra issue during '

	110 'execution.'),

	111 'reason': MonitorTryJobPipeline.UNKNOWN

	112 },

	113 try_job_error.INFRA_FAILURE)

	114 else:

	115 # A report should always be included as prt of the properties. If it is

	116 # missing something is wrong.

	117 return (

	118 {

	119 'message': 'No result report was found.',

	120 'reason': MonitorTryJobPipeline.UNKNOWN

	121 },

	122 try_job_error.UNKNOWN)

	123

	124 return None, None

53	125

54 @staticmethod	126 @staticmethod

55 def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build,	127 def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build,

56 buildbucket_error, timed_out):	128 buildbucket_error, timed_out):

	129 buildbucket_response = {}

57 if buildbucket_build:	130 if buildbucket_build:

58 try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime(	131 try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime(

59 buildbucket_build.request_time)	132 buildbucket_build.request_time)

60 # If start_time is unavailable, fallback to request_time.	133 # If start_time is unavailable, fallback to request_time.

61 try_job_data.start_time = start_time or try_job_data.request_time	134 try_job_data.start_time = start_time or try_job_data.request_time

62 try_job_data.end_time = MonitorTryJobPipeline._MicrosecondsToDatetime(	135 try_job_data.end_time = MonitorTryJobPipeline._MicrosecondsToDatetime(

63 buildbucket_build.end_time)	136 buildbucket_build.end_time)

64 try_job_data.number_of_commits_analyzed = len(	137 try_job_data.number_of_commits_analyzed = len(

65 buildbucket_build.report.get('result', {}))	138 buildbucket_build.report.get('result', {}))

66 try_job_data.try_job_url = buildbucket_build.url	139 try_job_data.try_job_url = buildbucket_build.url

67 try_job_data.regression_range_size = buildbucket_build.report.get(	140 try_job_data.regression_range_size = buildbucket_build.report.get(

68 'metadata', {}).get('regression_range_size')	141 'metadata', {}).get('regression_range_size')

69 try_job_data.last_buildbucket_response = buildbucket_build.response	142 try_job_data.last_buildbucket_response = buildbucket_build.response

	143 buildbucket_response = buildbucket_build.response

70	144

71 error = MonitorTryJobPipeline._GetError(buildbucket_error, timed_out)	145 error_dict, error_code = MonitorTryJobPipeline._GetError(

	146 buildbucket_response, buildbucket_error, timed_out)

72	147

73 if error:	148 if error_dict:

74 try_job_data.error = error	149 try_job_data.error = error_dict

	150 try_job_data.error_code = error_code

75	151

76 try_job_data.put()	152 try_job_data.put()

77	153

78 def _UpdateTryJobResult(	154 def _UpdateTryJobResult(

79 self, status, master_name, builder_name, build_number, try_job_type,	155 self, status, master_name, builder_name, build_number, try_job_type,

80 try_job_id, try_job_url, result_content=None):	156 try_job_id, try_job_url, result_content=None):

81 """Updates try job result based on responsed try job status and result."""	157 """Updates try job result based on responsed try job status and result."""

82 result = {	158 result = {

83 'report': result_content,	159 'report': result_content,

84 'url': try_job_url,	160 'url': try_job_url,

(...skipping 81 matching lines...)
166 already_set_started = True	242 already_set_started = True

167	243

168 if time.time() > deadline: # pragma: no cover	244 if time.time() > deadline: # pragma: no cover

169 self._UpdateTryJobMetadata(try_job_data, start_time, build, error, True)	245 self._UpdateTryJobMetadata(try_job_data, start_time, build, error, True)

170 # Explicitly abort the whole pipeline.	246 # Explicitly abort the whole pipeline.

171 raise pipeline.Abort(	247 raise pipeline.Abort(

172 'Try job %s timed out after %d hours.' % (	248 'Try job %s timed out after %d hours.' % (

173 try_job_id, timeout_hours))	249 try_job_id, timeout_hours))

174	250

175 time.sleep(pipeline_wait_seconds) # pragma: no cover	251 time.sleep(pipeline_wait_seconds) # pragma: no cover

OLD	NEW