Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(433)

Side by Side Diff: appengine/findit/waterfall/swarming_util.py

Issue 2526963002: [Findit] Implement retry within swarming_util.py when making server calls (Closed)
Patch Set: Self-review Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import base64 5 import base64
6 from collections import defaultdict 6 from collections import defaultdict
7 import json 7 import json
8 import logging 8 import logging
9 import time
9 import urllib 10 import urllib
10 import zlib 11 import zlib
11 12
12 from google.appengine.api.urlfetch_errors import DeadlineExceededError 13 from google.appengine.api.urlfetch_errors import DeadlineExceededError
13 from google.appengine.api.urlfetch_errors import DownloadError 14 from google.appengine.api.urlfetch_errors import DownloadError
14 from google.appengine.api.urlfetch_errors import ConnectionClosedError 15 from google.appengine.api.urlfetch_errors import ConnectionClosedError
15 from google.appengine.ext import ndb 16 from google.appengine.ext import ndb
16 17
17 from common import auth_util 18 from common import auth_util
18 from model.wf_step import WfStep 19 from model.wf_step import WfStep
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
58 TASK_FAILED = 2 59 TASK_FAILED = 2
59 60
60 # Swarming task exit code descriptions. 61 # Swarming task exit code descriptions.
61 EXIT_CODE_DESCRIPTIONS = { 62 EXIT_CODE_DESCRIPTIONS = {
62 ALL_TESTS_PASSED: 'All tests passed', 63 ALL_TESTS_PASSED: 'All tests passed',
63 SOME_TESTS_FAILED: 'Some tests failed', 64 SOME_TESTS_FAILED: 'Some tests failed',
64 TASK_FAILED: 'Swarming task failed', 65 TASK_FAILED: 'Swarming task failed',
65 } 66 }
66 67
67 68
69 def _GetBackoffSeconds(retry_backoff, tries, maximum_retry_interval):
70 """Returns how many seconds to wait before next retry.
71
72 Params:
73 retry_backoff (int): The base backoff in seconds.
74 tries (int): Indicates how many tries have been done.
75 maximum_retry_interval (int): The upper limit in seconds of how long to wait
76 between retries.
77 """
78 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval)
79
80
def _SendRequestToServer(url, http_client, post_data=None):
  """Sends GET/POST request to arbitrary url and returns response content.

  Because the Swarming and Isolated servers that _SendRequestToServer tries to
  contact are prone to outages, exceptions trying to reach them may occur, thus
  this method retries. These occurrences are monitored and documented even if
  the request eventually succeeds after retrying, with the last error
  encountered being the one that is reported.

  Args:
    url (str): The url to send the request to.
    http_client (HttpClient): The httpclient object with which to make the
      server calls.
    post_data (dict): Data/params to send with the request, if any.

  Returns:
    content (dict), error (dict): The content from the server and the last
    error encountered trying to retrieve it, or None if no error occurred.
  """
  headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()}

  # Serialize the post data once, before the retry loop; serializing inside
  # the loop would re-encode the already-serialized string on each retry.
  if post_data:
    post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':'))
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    headers['Content-Length'] = len(post_data)

  swarming_settings = waterfall_config.GetSwarmingSettings()
  should_retry = swarming_settings.get('should_retry_server')
  timeout_seconds = swarming_settings.get('server_retry_timeout') * 60 * 60
  maximum_retry_interval = swarming_settings.get(
      'maximum_server_contact_retry_interval_seconds')
  deadline = time.time() + timeout_seconds
  retry_backoff = 60
  tries = 1
  # The last error encountered across all attempts; reported even if a later
  # attempt succeeds.
  error = None

  while True:
    # Track this attempt's outcome separately from the last recorded error so
    # that an earlier failure does not mask a later successful attempt.
    attempt_error = None
    status_code = None
    try:
      if post_data:
        status_code, content = http_client.Post(url, post_data, headers=headers)
      else:
        status_code, content = http_client.Get(url, headers=headers)
    except ConnectionClosedError as e:
      attempt_error = {
          'code': URLFETCH_CONNECTION_CLOSED_ERROR,
          'message': e.message
      }
    except DeadlineExceededError as e:
      attempt_error = {
          'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
          'message': e.message
      }
    except DownloadError as e:
      attempt_error = {
          'code': URLFETCH_DOWNLOAD_ERROR,
          'message': e.message
      }
    except Exception as e:  # pragma: no cover
      attempt_error = {
          'code': UNKNOWN,
          'message': e.message
      }

    if attempt_error or status_code != 200:
      # The retry upon 50x (501 excluded) is automatically handled in the
      # underlying http_client.
      # By default, it retries 5 times with exponential backoff.
      error = attempt_error or {
          'code': EXCEEDED_MAX_RETRIES_ERROR,
          'message': 'Max retries exceeded trying to reach %s' % url
      }
      logging.error(error['message'])
    else:
      # Even if the call is successful, still return the last error
      # encountered.
      return content, error

    if should_retry and time.time() < deadline:  # pragma: no cover
      # Wait, then retry if applicable.
      wait_time = _GetBackoffSeconds(
          retry_backoff, tries, maximum_retry_interval)
      logging.info('Retrying connection to %s in %d seconds', url, wait_time)
      time.sleep(wait_time)
      tries += 1
    else:
      if should_retry:
        # Indicate in the error that the retry timeout was reached.
        error['retry_timeout'] = True
      break

  logging.error('Failed to get an adequate response from %s. No data could be '
                'retrieved', url)
  return None, error
117 172
118 173
119 def GetSwarmingTaskRequest(task_id, http_client): 174 def GetSwarmingTaskRequest(task_id, http_client):
120 """Returns an instance of SwarmingTaskRequest representing the given task.""" 175 """Returns an instance of SwarmingTaskRequest representing the given task."""
121 swarming_server_host = waterfall_config.GetSwarmingSettings().get( 176 swarming_server_host = waterfall_config.GetSwarmingSettings().get(
122 'server_host') 177 'server_host')
123 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % ( 178 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % (
124 swarming_server_host, task_id) 179 swarming_server_host, task_id)
125 content, error = _SendRequestToServer(url, http_client) 180 content, error = _SendRequestToServer(url, http_client)
126 181
(...skipping 20 matching lines...) Expand all
147 request.priority = max(100, swarming_settings.get('default_request_priority')) 202 request.priority = max(100, swarming_settings.get('default_request_priority'))
148 request.expiration_secs = request_expiration_hours * 60 * 60 203 request.expiration_secs = request_expiration_hours * 60 * 60
149 204
150 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit']) 205 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit'])
151 206
152 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get( 207 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get(
153 'server_host') 208 'server_host')
154 response_data, error = _SendRequestToServer( 209 response_data, error = _SendRequestToServer(
155 url, http_client, request.Serialize()) 210 url, http_client, request.Serialize())
156 211
157 # TODO(lijeffrey): Handle error in calling functions.
158 if not error: 212 if not error:
159 return json.loads(response_data)['task_id'], None 213 return json.loads(response_data)['task_id'], None
160 214
161 return None, error 215 return None, error
162 216
163 217
164 def ListSwarmingTasksDataByTags( 218 def ListSwarmingTasksDataByTags(
165 master_name, builder_name, build_number, http_client, 219 master_name, builder_name, build_number, http_client,
166 additional_tag_filters=None): 220 additional_tag_filters=None):
167 """Downloads tasks data from swarming server. 221 """Downloads tasks data from swarming server.
(...skipping 303 matching lines...) Expand 10 before | Expand all | Expand 10 after
471 for isolated_data in list_isolated_data: 525 for isolated_data in list_isolated_data:
472 output_json, _ = _DownloadTestResults(isolated_data, http_client) 526 output_json, _ = _DownloadTestResults(isolated_data, http_client)
473 if not output_json: 527 if not output_json:
474 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. 528 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults.
475 return None 529 return None
476 shard_results.append(output_json) 530 shard_results.append(output_json)
477 531
478 if len(list_isolated_data) == 1: 532 if len(list_isolated_data) == 1:
479 return shard_results[0] 533 return shard_results[0]
480 return _MergeSwarmingTestShards(shard_results) 534 return _MergeSwarmingTestShards(shard_results)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698