Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(181)

Side by Side Diff: appengine/findit/waterfall/process_base_swarming_task_result_pipeline.py

Issue 2491473002: [Findit] Implementing swarming task error detection (Closed)
Patch Set: Rebase Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | appengine/findit/waterfall/process_flake_swarming_task_result_pipeline.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # Copyright 2016 The Chromium Authors. All rights reserved. 1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 from collections import defaultdict 5 from collections import defaultdict
6 import datetime 6 import datetime
7 import logging 7 import logging
8 import time 8 import time
9 9
10 from common.http_client_appengine import HttpClientAppengine as HttpClient 10 from common.http_client_appengine import HttpClientAppengine as HttpClient
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
64 for fmt in ('%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M:%S'): 64 for fmt in ('%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M:%S'):
65 # When microseconds are 0, the '.123456' suffix is elided. 65 # When microseconds are 0, the '.123456' suffix is elided.
66 try: 66 try:
67 return datetime.datetime.strptime(time_string, fmt) 67 return datetime.datetime.strptime(time_string, fmt)
68 except ValueError: 68 except ValueError:
69 pass 69 pass
70 raise ValueError('Failed to parse %s' % time_string) # pragma: no cover 70 raise ValueError('Failed to parse %s' % time_string) # pragma: no cover
71 71
72 def _MonitorSwarmingTask(self, task_id, *call_args): 72 def _MonitorSwarmingTask(self, task_id, *call_args):
73 """Monitors the swarming task and waits for it to complete.""" 73 """Monitors the swarming task and waits for it to complete."""
74 assert task_id
74 timeout_hours = waterfall_config.GetSwarmingSettings().get( 75 timeout_hours = waterfall_config.GetSwarmingSettings().get(
75 'task_timeout_hours') 76 'task_timeout_hours')
76 deadline = time.time() + timeout_hours * 60 * 60 77 deadline = time.time() + timeout_hours * 60 * 60
77 server_query_interval_seconds = waterfall_config.GetSwarmingSettings().get( 78 server_query_interval_seconds = waterfall_config.GetSwarmingSettings().get(
78 'server_query_interval_seconds') 79 'server_query_interval_seconds')
79
80 task_started = False 80 task_started = False
81 task_completed = False 81 task_completed = False
82 tests_statuses = {} 82 tests_statuses = {}
83 step_name_no_platform = None 83 step_name_no_platform = None
84 task = self._GetSwarmingTask(*call_args) 84 task = self._GetSwarmingTask(*call_args)
85 85
86 while not task_completed: 86 while not task_completed:
87 data = swarming_util.GetSwarmingTaskResultById(task_id, self.HTTP_CLIENT) 87 data, error = swarming_util.GetSwarmingTaskResultById(
88 task_id, self.HTTP_CLIENT)
89
90 if error:
91 # An error occurred when trying to contact the swarming server.
92 task.status = analysis_status.ERROR
93 task.error = error
94 task.put()
95 break
96
88 task_state = data['state'] 97 task_state = data['state']
89 exit_code = (data.get('exit_code') if 98 exit_code = (data.get('exit_code') if
90 task_state == swarming_util.STATE_COMPLETED else None) 99 task_state == swarming_util.STATE_COMPLETED else None)
91 step_name_no_platform = ( 100 step_name_no_platform = (
92 step_name_no_platform or swarming_util.GetTagValue( 101 step_name_no_platform or swarming_util.GetTagValue(
93 data.get('tags', {}), 'ref_name')) 102 data.get('tags', {}), 'ref_name'))
94 103
95 if task_state not in swarming_util.STATES_RUNNING: 104 if task_state not in swarming_util.STATES_RUNNING:
96 task_completed = True 105 task_completed = True
97 106
98 if (task_state == swarming_util.STATE_COMPLETED and 107 if (task_state == swarming_util.STATE_COMPLETED and
99 int(exit_code) != swarming_util.TASK_FAILED): 108 int(exit_code) != swarming_util.TASK_FAILED):
100 outputs_ref = data.get('outputs_ref') 109 outputs_ref = data.get('outputs_ref')
101 output_json = swarming_util.GetSwarmingTaskFailureLog( 110 output_json, error = swarming_util.GetSwarmingTaskFailureLog(
102 outputs_ref, self.HTTP_CLIENT) 111 outputs_ref, self.HTTP_CLIENT)
112
113 if error:
114 task.status = analysis_status.ERROR
115 task.error = error
116 else:
117 task.status = analysis_status.COMPLETED
118
103 tests_statuses = self._CheckTestsRunStatuses(output_json, *call_args) 119 tests_statuses = self._CheckTestsRunStatuses(output_json, *call_args)
104 task.status = analysis_status.COMPLETED
105 task.tests_statuses = tests_statuses 120 task.tests_statuses = tests_statuses
121 task.put()
106 else: 122 else:
123 if exit_code is not None:
124 # Swarming task completed, but the task failed.
125 code = int(exit_code)
126 message = swarming_util.EXIT_CODE_DESCRIPTIONS[code]
127 else:
128 # The swarming task did not complete.
129 code = swarming_util.STATES_NOT_RUNNING_TO_ERROR_CODES[task_state]
130 message = task_state
131
107 task.status = analysis_status.ERROR 132 task.status = analysis_status.ERROR
133 task.error = {
134 'code': code,
135 'message': message
136 }
137 task.put()
138
108 logging_str = 'Swarming task stopped with status: %s' % task_state 139 logging_str = 'Swarming task stopped with status: %s' % task_state
109 if exit_code: # pragma: no cover 140 if exit_code: # pragma: no cover
110 logging_str += ' and exit_code: %s - %s' % ( 141 logging_str += ' and exit_code: %s - %s' % (
111 exit_code, swarming_util.EXIT_CODE_DESCRIPTIONS[int(exit_code)]) 142 exit_code, swarming_util.EXIT_CODE_DESCRIPTIONS[code])
112 logging.error(logging_str) 143 logging.error(logging_str)
113 144
114 tags = data.get('tags', {}) 145 tags = data.get('tags', {})
115 priority_str = swarming_util.GetTagValue(tags, 'priority') 146 priority_str = swarming_util.GetTagValue(tags, 'priority')
116 if priority_str: 147 if priority_str:
117 task.parameters['priority'] = int(priority_str) 148 task.parameters['priority'] = int(priority_str)
118 149
119 task.put() 150 task.put()
120 else: # pragma: no cover 151 else: # pragma: no cover
121 if task_state == 'RUNNING' and not task_started: 152 if task_state == 'RUNNING' and not task_started:
122 # swarming task just starts, update status. 153 # swarming task just starts, update status.
123 task_started = True 154 task_started = True
124 task.status = analysis_status.RUNNING 155 task.status = analysis_status.RUNNING
125 task.put() 156 task.put()
126 time.sleep(server_query_interval_seconds) 157 time.sleep(server_query_interval_seconds)
127 158
128 # Timeout. 159 # Timeout.
129 if time.time() > deadline: 160 if time.time() > deadline:
130 # Updates status as ERROR. 161 # Updates status as ERROR.
131 task.status = analysis_status.ERROR 162 task.status = analysis_status.ERROR
163 task.error = {
164 'code': swarming_util.TIMED_OUT,
165 'message': 'Process swarming task result timed out'
166 }
132 task.put() 167 task.put()
133 logging.error('Swarming task timed out after %d hours.' % timeout_hours) 168 logging.error('Swarming task timed out after %d hours.' % timeout_hours)
134 break # Stops the loop and return. 169 break # Stops the loop and return.
135 170
136 # Update swarming task metadata timestamps. 171 # Update swarming task metadata.
137 task.created_time = self._ConvertDateTime(data.get('created_ts')) 172 task.created_time = (task.created_time or
138 task.started_time = self._ConvertDateTime(data.get('started_ts')) 173 self._ConvertDateTime(data.get('created_ts')))
139 task.completed_time = self._ConvertDateTime(data.get('completed_ts')) 174 task.started_time = (task.started_time or
175 self._ConvertDateTime(data.get('started_ts')))
176 task.completed_time = (task.completed_time or
177 self._ConvertDateTime(data.get('completed_ts')))
140 task.put() 178 task.put()
141 179
142 return step_name_no_platform 180 return step_name_no_platform
143 181
144 # Arguments number differs from overridden method - pylint: disable=W0221 182 # Arguments number differs from overridden method - pylint: disable=W0221
145 def run(self, master_name, builder_name, build_number, step_name, task_id, 183 def run(self, master_name, builder_name, build_number, step_name, task_id,
146 *args): 184 *args):
147 """Monitors a swarming task. 185 """Monitors a swarming task.
148 186
149 Args: 187 Args:
150 master_name (str): The master name. 188 master_name (str): The master name.
151 builder_name (str): The builder name. 189 builder_name (str): The builder name.
152 build_number (str): The build number. 190 build_number (str): The build number.
153 step_name (str): The failed test step name. 191 step_name (str): The failed test step name.
154 task_id (str): The task id to query the swarming server on the progresss 192 task_id (str): The task id to query the swarming server on the progresss
155 of a swarming task. 193 of a swarming task.
156 194
157 Returns: 195 Returns:
158 A dict of lists for reliable/flaky tests. 196 A dict of lists for reliable/flaky tests.
159 """ 197 """
160 call_args = self._GetArgs(master_name, builder_name, build_number, 198 call_args = self._GetArgs(master_name, builder_name, build_number,
161 step_name, *args) 199 step_name, *args)
162 step_name_no_platform = self._MonitorSwarmingTask(task_id, *call_args) 200 step_name_no_platform = self._MonitorSwarmingTask(task_id, *call_args)
163 return step_name, step_name_no_platform 201 return step_name, step_name_no_platform
164
OLDNEW
« no previous file with comments | « no previous file | appengine/findit/waterfall/process_flake_swarming_task_result_pipeline.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698