appengine/findit/waterfall/monitor_try_job_pipeline.py - Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline

Side by Side Diff: appengine/findit/waterfall/monitor_try_job_pipeline.py

Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Addressing comments Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 from datetime import datetime	5 from datetime import datetime

	6 import json

6 import time	7 import time

7	8

8 from common.pipeline_wrapper import BasePipeline	9 from common.pipeline_wrapper import BasePipeline

9 from common.pipeline_wrapper import pipeline	10 from common.pipeline_wrapper import pipeline

10 from common.waterfall import buildbucket_client	11 from common.waterfall import buildbucket_client

	12 from common.waterfall import try_job_error

11 from common.waterfall.buildbucket_client import BuildbucketBuild	13 from common.waterfall.buildbucket_client import BuildbucketBuild

12 from model import analysis_status	14 from model import analysis_status

13 from model.wf_try_job import WfTryJob	15 from model.wf_try_job import WfTryJob

14 from model.wf_try_job_data import WfTryJobData	16 from model.wf_try_job_data import WfTryJobData

15 from waterfall import waterfall_config	17 from waterfall import waterfall_config

16 from waterfall.try_job_type import TryJobType	18 from waterfall.try_job_type import TryJobType

17	19

18	20

19 class MonitorTryJobPipeline(BasePipeline):	21 class MonitorTryJobPipeline(BasePipeline):

20 """A pipeline for monitoring a try job and recording results when it's done.	22 """A pipeline for monitoring a try job and recording results when it's done.

21	23

22 The result will be stored to compile_results or test_results according to	24 The result will be stored to compile_results or test_results according to

23 which type of build failure we are running try job for.	25 which type of build failure we are running try job for.

24 """	26 """

25	27

26 TIMEOUT = 'TIMEOUT'	28 UNKNOWN = 'UNKNOWN'

27	29

28 @staticmethod	30 @staticmethod

29 def _MicrosecondsToDatetime(microseconds):	31 def _MicrosecondsToDatetime(microseconds):

30 """Returns a datetime given the number of microseconds, or None."""	32 """Returns a datetime given the number of microseconds, or None."""

31 if microseconds:	33 if microseconds:

32 return datetime.utcfromtimestamp(float(microseconds) / 1000000)	34 return datetime.utcfromtimestamp(float(microseconds) / 1000000)

33 return None	35 return None

34	36

35 @staticmethod	37 @staticmethod

36 def _GetError(buildbucket_error, timed_out):	38 def _GetError(buildbucket_response, buildbucket_error, timed_out):

37 # TODO(lijeffrey): Currently only timeouts (Findit abandoned monitoring the	39 """Determines whether or not a try job error occurred.

38 # try job after waiting too long for it to complete) and errors reported	40

39 # directly in the buildbucket_client request are captured. Several other	41 Args:

40 # failures can be derived from the response in the build too which should	42 buildbucket_response: A dict of the json response from buildbucket.

41 # be determined here.	43 buildbucket_error: A BuildBucketError object returned from the call to

	44 buildbucket_client.GetTryJobs()

	45 timed_out: A bool whether or not Findit abandoned monitoring the try job.

	46

	47 Returns:

	48 A tuple containing an error dict and number representing an error code, or

	49 (None, None) if no error was determined to have occurred.

	50 """

	51

42 if buildbucket_error:	52 if buildbucket_error:

43 return {	53 return (

44 'message': buildbucket_error.message,	54 {

45 'reason': buildbucket_error.reason	55 'message': buildbucket_error.message,

46 }	56 'reason': buildbucket_error.reason

	57 },

	58 try_job_error.BUILDBUCKET_REQUEST_ERROR)

47	59

48 if timed_out:	60 if timed_out:

49 return {	61 return (

50 'message': 'Try job monitoring was abandoned.',	62 {

51 'reason': MonitorTryJobPipeline.TIMEOUT	63 'message': 'Try job monitoring was abandoned.',

52 }	64 'reason': 'Timeout after %s hours' % (

	65 waterfall_config.GetTryJobSettings().get('job_timeout_hours'))

	66 },

	67 try_job_error.TIMEOUT)

	68

	69 if buildbucket_response:

	70 # If there is no explicit timeout or reason specified, check the last

	71 # build response for errors.

	72 result_details_json = json.loads(

	73 buildbucket_response.get('result_details_json', '{}')) or {}

	74

	75 # Check result_details_json for any obvious errors.

	76 error = result_details_json.get('error', {})

	77 if error:

	78 return (

	79 {

	80 'message': 'Buildbucket reported an error.',

	81 'reason': error.get('message', MonitorTryJobPipeline.UNKNOWN)

	82 },

	83 try_job_error.CI_REPORTED_ERROR)

	84

	85 # Check the report to see if anything went wrong.

	86 report = result_details_json.get('properties', {}).get('report')

	87 if report:

	88 if report.get('metadata', {}).get('infra_failure'):

	89 # Check for any infra issues caught by the recipe.

	90 return (

	91 {

	92 'message': ('Try job encountered an infra issue during '

	93 'execution.'),

	94 'reason': MonitorTryJobPipeline.UNKNOWN

	95 },

	96 try_job_error.INFRA_FAILURE)

	97 else:

	98 # A report should always be included as part of 'properties'. If it is

	99 # missing something else is wrong.

	100 return (

	101 {

	102 'message': 'No result report was found.',

	103 'reason': MonitorTryJobPipeline.UNKNOWN

	104 },

	105 try_job_error.UNKNOWN)

	106

	107 return None, None

53	108

54 @staticmethod	109 @staticmethod

55 def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build,	110 def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build,

56 buildbucket_error, timed_out):	111 buildbucket_error, timed_out):

	112 buildbucket_response = {}

57 if buildbucket_build:	113 if buildbucket_build:

58 try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime(	114 try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime(

59 buildbucket_build.request_time)	115 buildbucket_build.request_time)

60 # If start_time is unavailable, fallback to request_time.	116 # If start_time is unavailable, fallback to request_time.

61 try_job_data.start_time = start_time or try_job_data.request_time	117 try_job_data.start_time = start_time or try_job_data.request_time

62 try_job_data.end_time = MonitorTryJobPipeline._MicrosecondsToDatetime(	118 try_job_data.end_time = MonitorTryJobPipeline._MicrosecondsToDatetime(

63 buildbucket_build.end_time)	119 buildbucket_build.end_time)

64 try_job_data.number_of_commits_analyzed = len(	120 try_job_data.number_of_commits_analyzed = len(

65 buildbucket_build.report.get('result', {}))	121 buildbucket_build.report.get('result', {}))

66 try_job_data.try_job_url = buildbucket_build.url	122 try_job_data.try_job_url = buildbucket_build.url

67 try_job_data.regression_range_size = buildbucket_build.report.get(	123 try_job_data.regression_range_size = buildbucket_build.report.get(

68 'metadata', {}).get('regression_range_size')	124 'metadata', {}).get('regression_range_size')

69 try_job_data.last_buildbucket_response = buildbucket_build.response	125 try_job_data.last_buildbucket_response = buildbucket_build.response

	126 buildbucket_response = buildbucket_build.response

70	127

71 error = MonitorTryJobPipeline._GetError(buildbucket_error, timed_out)	128 error_dict, error_code = MonitorTryJobPipeline._GetError(

	129 buildbucket_response, buildbucket_error, timed_out)

72	130

73 if error:	131 if error_dict:

74 try_job_data.error = error	132 try_job_data.error = error_dict

	133 try_job_data.error_code = error_code

75	134

76 try_job_data.put()	135 try_job_data.put()

77	136

78 def _UpdateTryJobResult(	137 def _UpdateTryJobResult(

79 self, status, master_name, builder_name, build_number, try_job_type,	138 self, status, master_name, builder_name, build_number, try_job_type,

80 try_job_id, try_job_url, result_content=None):	139 try_job_id, try_job_url, result_content=None):

81 """Updates try job result based on the returned try job status and result."""	140 """Updates try job result based on the returned try job status and result."""

82 result = {	141 result = {

83 'report': result_content,	142 'report': result_content,

84 'url': try_job_url,	143 'url': try_job_url,

(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
166 already_set_started = True	225 already_set_started = True

167	226

168 if time.time() > deadline: # pragma: no cover	227 if time.time() > deadline: # pragma: no cover

169 self._UpdateTryJobMetadata(try_job_data, start_time, build, error, True)	228 self._UpdateTryJobMetadata(try_job_data, start_time, build, error, True)

170 # Explicitly abort the whole pipeline.	229 # Explicitly abort the whole pipeline.

171 raise pipeline.Abort(	230 raise pipeline.Abort(

172 'Try job %s timed out after %d hours.' % (	231 'Try job %s timed out after %d hours.' % (

173 try_job_id, timeout_hours))	232 try_job_id, timeout_hours))

174	233

175 time.sleep(pipeline_wait_seconds) # pragma: no cover	234 time.sleep(pipeline_wait_seconds) # pragma: no cover

OLD	NEW

« no previous file with comments | « appengine/findit/model/wf_try_job_data.py ('k') | appengine/findit/waterfall/test/monitor_try_job_pipeline_test.py » ('j') | no next file with comments »