Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(18)

Side by Side Diff: appengine/findit/waterfall/monitor_try_job_pipeline.py

Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 from datetime import datetime 5 from datetime import datetime
6 import json
7 import re
6 import time 8 import time
7 9
8 from common.pipeline_wrapper import BasePipeline 10 from common.pipeline_wrapper import BasePipeline
9 from common.pipeline_wrapper import pipeline 11 from common.pipeline_wrapper import pipeline
10 from common.waterfall import buildbucket_client 12 from common.waterfall import buildbucket_client
11 from common.waterfall.buildbucket_client import BuildbucketBuild 13 from common.waterfall.buildbucket_client import BuildbucketBuild
12 from model import analysis_status 14 from model import analysis_status
15 from model import try_job_error
13 from model.wf_try_job import WfTryJob 16 from model.wf_try_job import WfTryJob
14 from model.wf_try_job_data import WfTryJobData 17 from model.wf_try_job_data import WfTryJobData
15 from waterfall import waterfall_config 18 from waterfall import waterfall_config
16 from waterfall.try_job_type import TryJobType 19 from waterfall.try_job_type import TryJobType
17 20
18 21
19 class MonitorTryJobPipeline(BasePipeline): 22 class MonitorTryJobPipeline(BasePipeline):
20 """A pipeline for monitoring a try job and recording results when it's done. 23 """A pipeline for monitoring a try job and recording results when it's done.
21 24
22 The result will be stored to compile_results or test_results according to 25 The result will be stored to compile_results or test_results according to
23 which type of build failure we are running try job for. 26 which type of build failure we are running try job for.
24 """ 27 """
25 28
26 TIMEOUT = 'TIMEOUT' 29 UNKNOWN = 'UNKNOWN'
27 30
28 @staticmethod 31 @staticmethod
29 def _MicrosecondsToDatetime(microseconds): 32 def _MicrosecondsToDatetime(microseconds):
30 """Returns a datetime given the number of microseconds, or None.""" 33 """Returns a datetime given the number of microseconds, or None."""
31 if microseconds: 34 if microseconds:
32 return datetime.utcfromtimestamp(float(microseconds) / 1000000) 35 return datetime.utcfromtimestamp(float(microseconds) / 1000000)
33 return None 36 return None
34 37
35 @staticmethod 38 @staticmethod
36 def _GetError(buildbucket_error, timed_out): 39 def _GetError(buildbucket_response, buildbucket_error, timed_out):
chanli 2016/04/25 20:51:28 Will this handle cases like: https://build.chromiu
lijeffrey 2016/04/25 22:54:16 This try job was triggered outside of Findit, but
37 # TODO(lijeffrey): Currently only timeouts (Findit abandoned monitoring the 40 """Determines whether or not a try job error occurred.
38 # try job after waiting too long for it to complete) and errors reported 41
39 # directly in the buildbucket_client request are captured. Several other 42 Args:
40 # failures can be derived from the response in the build too which should 43 buildbucket_response: A dict of the json response from buildbucket.
41 # be determined here. 44 buildbucket_error: A BuildBucketError object returned from the call to
45 buildbucket_client.GetTryJobs()
46 timed_out: A bool indicating whether or not Findit abandoned monitoring the try job.
47
48 Returns:
49 A tuple containing an error dict and number representing an error code, or
50 (None, None) if no error was determined to have occurred.
51 """
52 error_dict = None
53 error_code = None
54
42 if buildbucket_error: 55 if buildbucket_error:
43 return { 56 error_dict = {
44 'message': buildbucket_error.message, 57 'message': buildbucket_error.message,
45 'reason': buildbucket_error.reason 58 'reason': buildbucket_error.reason
46 } 59 }
60 error_code = try_job_error.BUILDBUCKET_ERROR
61 elif timed_out:
62 error_dict = {
63 'message': 'Try job monitoring was abandoned.',
64 'reason': 'Timeout after %s hours' % (
65 waterfall_config.GetTryJobSettings().get('job_timeout_hours'))
66 }
67 error_code = try_job_error.TIMEOUT
68 elif buildbucket_response:
69 # If there is no explicit timeout or reason specified, check the last
70 # build response for errors.
71 result_details_json = json.loads(
72 buildbucket_response.get('result_details_json', '{}')) or {}
47 73
48 if timed_out: 74 # Check result_details_json for any obvious errors.
49 return { 75 error = result_details_json.get('error', {})
50 'message': 'Try job monitoring was abandoned.', 76 if error:
51 'reason': MonitorTryJobPipeline.TIMEOUT 77 message = error.get('message')
52 } 78 if message:
79 error_dict = {
80 'message': 'Try job could not be triggered.',
81 'reason': message
82 }
chanli 2016/04/25 20:51:28 Will it be better if error_dict = { 'message'
lijeffrey 2016/04/25 22:54:16 Reason should be the root cause, not the observed
83 trybot_not_found_pattern = re.compile(r'Builder [^\s-]+ not found')
84 if trybot_not_found_pattern.match(message):
85 error_code = try_job_error.TRYBOT_NOT_FOUND
86 else:
87 error_code = try_job_error.UNKNOWN
88 else:
89 error_dict = {
90 'message': 'Try job error was detected.',
91 'reason': MonitorTryJobPipeline.UNKNOWN
92 }
93 error_code = try_job_error.UNKNOWN
94
95 # Check the report to see if anything went wrong.
96 report = result_details_json.get('report')
97 if report:
98 if ('infra_failed' in report.get('result', {}).itervalues() or
99 report.get('metadata', {}).get('infra_failure')):
100 # Check for any infra issues caught by the recipe.
101 error_dict = {
102 'message': 'Try job encountered an infra issue during execution.',
103 'reason': MonitorTryJobPipeline.UNKNOWN
104 }
105 error_code = try_job_error.INFRA_FAILURE
106
107 return error_dict, error_code
53 108
54 @staticmethod 109 @staticmethod
55 def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build, 110 def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build,
56 buildbucket_error, timed_out): 111 buildbucket_error, timed_out):
112 buildbucket_response = {}
57 if buildbucket_build: 113 if buildbucket_build:
58 try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime( 114 try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime(
59 buildbucket_build.request_time) 115 buildbucket_build.request_time)
60 # If start_time is unavailable, fallback to request_time. 116 # If start_time is unavailable, fallback to request_time.
61 try_job_data.start_time = start_time or try_job_data.request_time 117 try_job_data.start_time = start_time or try_job_data.request_time
62 try_job_data.end_time = MonitorTryJobPipeline._MicrosecondsToDatetime( 118 try_job_data.end_time = MonitorTryJobPipeline._MicrosecondsToDatetime(
63 buildbucket_build.end_time) 119 buildbucket_build.end_time)
64 try_job_data.number_of_commits_analyzed = len( 120 try_job_data.number_of_commits_analyzed = len(
65 buildbucket_build.report.get('result', {})) 121 buildbucket_build.report.get('result', {}))
66 try_job_data.try_job_url = buildbucket_build.url 122 try_job_data.try_job_url = buildbucket_build.url
67 try_job_data.regression_range_size = buildbucket_build.report.get( 123 try_job_data.regression_range_size = buildbucket_build.report.get(
68 'metadata', {}).get('regression_range_size') 124 'metadata', {}).get('regression_range_size')
69 try_job_data.last_buildbucket_response = buildbucket_build.response 125 try_job_data.last_buildbucket_response = buildbucket_build.response
126 buildbucket_response = buildbucket_build.response
70 127
71 error = MonitorTryJobPipeline._GetError(buildbucket_error, timed_out) 128 error_dict, error_code = MonitorTryJobPipeline._GetError(
129 buildbucket_response, buildbucket_error, timed_out)
72 130
73 if error: 131 if error_dict:
74 try_job_data.error = error 132 try_job_data.error = error_dict
133 try_job_data.error_code = error_code
75 134
76 try_job_data.put() 135 try_job_data.put()
77 136
78 def _UpdateTryJobResult( 137 def _UpdateTryJobResult(
79 self, status, master_name, builder_name, build_number, try_job_type, 138 self, status, master_name, builder_name, build_number, try_job_type,
80 try_job_id, try_job_url, result_content=None): 139 try_job_id, try_job_url, result_content=None):
81 """Updates try job result based on the returned try job status and result.""" 140 """Updates try job result based on the returned try job status and result."""
82 result = { 141 result = {
83 'report': result_content, 142 'report': result_content,
84 'url': try_job_url, 143 'url': try_job_url,
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
166 already_set_started = True 225 already_set_started = True
167 226
168 if time.time() > deadline: # pragma: no cover 227 if time.time() > deadline: # pragma: no cover
169 self._UpdateTryJobMetadata(try_job_data, start_time, build, error, True) 228 self._UpdateTryJobMetadata(try_job_data, start_time, build, error, True)
170 # Explicitly abort the whole pipeline. 229 # Explicitly abort the whole pipeline.
171 raise pipeline.Abort( 230 raise pipeline.Abort(
172 'Try job %s timed out after %d hours.' % ( 231 'Try job %s timed out after %d hours.' % (
173 try_job_id, timeout_hours)) 232 try_job_id, timeout_hours))
174 233
175 time.sleep(pipeline_wait_seconds) # pragma: no cover 234 time.sleep(pipeline_wait_seconds) # pragma: no cover
OLDNEW
« no previous file with comments | « appengine/findit/model/wf_try_job_data.py ('k') | appengine/findit/waterfall/test/monitor_try_job_pipeline_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698