Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1235)

Side by Side Diff: appengine/findit/waterfall/swarming_util.py

Issue 2526963002: [Findit] Implement retry within swarming_util.py when making server calls (Closed)
Patch Set: Addressing comments Rebase Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import base64 5 import base64
6 from collections import defaultdict 6 from collections import defaultdict
7 import json 7 import json
8 import logging 8 import logging
9 import time
9 import urllib 10 import urllib
10 import zlib 11 import zlib
11 12
12 from google.appengine.api.urlfetch_errors import DeadlineExceededError 13 from google.appengine.api.urlfetch_errors import DeadlineExceededError
13 from google.appengine.api.urlfetch_errors import DownloadError 14 from google.appengine.api.urlfetch_errors import DownloadError
14 from google.appengine.api.urlfetch_errors import ConnectionClosedError 15 from google.appengine.api.urlfetch_errors import ConnectionClosedError
15 from google.appengine.ext import ndb 16 from google.appengine.ext import ndb
16 17
17 from common import auth_util 18 from common import auth_util
18 from model.wf_step import WfStep 19 from model.wf_step import WfStep
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
62 TASK_FAILED = 2 63 TASK_FAILED = 2
63 64
64 # Swarming task exit code descriptions. 65 # Swarming task exit code descriptions.
65 EXIT_CODE_DESCRIPTIONS = { 66 EXIT_CODE_DESCRIPTIONS = {
66 ALL_TESTS_PASSED: 'All tests passed', 67 ALL_TESTS_PASSED: 'All tests passed',
67 SOME_TESTS_FAILED: 'Some tests failed', 68 SOME_TESTS_FAILED: 'Some tests failed',
68 TASK_FAILED: 'Swarming task failed', 69 TASK_FAILED: 'Swarming task failed',
69 } 70 }
70 71
71 72
73 def _GetBackoffSeconds(retry_backoff, tries, maximum_retry_interval):
74 """Returns how many seconds to wait before next retry.
75
76 Params:
77 retry_backoff (int): The base backoff in seconds.
78 tries (int): Indicates how many tries have been done.
79 maximum_retry_interval (int): The upper limit in seconds of how long to wait
80 between retries.
81 """
82 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval)
83
84
def _SendRequestToServer(url, http_client, post_data=None):
  """Sends GET/POST request to arbitrary url and returns response content.

  Because the Swarming and Isolated servers that _SendRequestToServer tries to
  contact are prone to outages, exceptions trying to reach them may occur, thus
  this method retries. We want to monitor and document these occurrences even
  if the request eventually succeeds after retrying, with the last error
  encountered being the one that is reported.

  Args:
    url (str): The url to send the request to.
    http_client (HttpClient): The httpclient object with which to make the
      server calls.
    post_data (dict): Data/params to send with the request, if any. When
      present, the request is a POST with a JSON body; otherwise a GET.

  Returns:
    content (dict), error (dict): The content from the server and the last
      error encountered trying to retrieve it (None if no error occurred).
  """
  headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()}
  swarming_settings = waterfall_config.GetSwarmingSettings()
  should_retry = swarming_settings.get('should_retry_server')
  timeout_seconds = (
      swarming_settings.get('server_retry_timeout_hours') * 60 * 60)
  maximum_retry_interval = swarming_settings.get(
      'maximum_server_contact_retry_interval_seconds')
  deadline = time.time() + timeout_seconds
  retry_backoff = 60
  tries = 1
  # Last error seen across all attempts; reported even on eventual success.
  error = None

  if post_data:
    # Serialize deterministically (sorted keys, compact separators) so
    # identical payloads always produce byte-identical request bodies.
    post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':'))
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    headers['Content-Length'] = len(post_data)

  while True:
    # Reset per-attempt state. Without this, an error from an earlier attempt
    # would make a later successful attempt fall into the failure branch below
    # and the success path would be unreachable after any failure. Also
    # initialize status_code so it is never referenced while unbound after an
    # exception.
    attempt_error = None
    status_code = None
    content = None

    try:
      if post_data:
        status_code, content = http_client.Post(url, post_data,
                                                headers=headers)
      else:
        status_code, content = http_client.Get(url, headers=headers)
    except ConnectionClosedError as e:
      attempt_error = {
          'code': URLFETCH_CONNECTION_CLOSED_ERROR,
          'message': e.message
      }
    except DeadlineExceededError as e:
      attempt_error = {
          'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
          'message': e.message
      }
    except DownloadError as e:
      attempt_error = {
          'code': URLFETCH_DOWNLOAD_ERROR,
          'message': e.message
      }
    except Exception as e:  # pragma: no cover
      attempt_error = {
          'code': UNKNOWN,
          'message': e.message
      }

    if attempt_error or status_code != 200:
      # The retry upon 50x (501 excluded) is automatically handled in the
      # underlying http_client.
      # By default, it retries 5 times with exponential backoff.
      error = attempt_error or {
          'code': EXCEEDED_MAX_RETRIES_ERROR,
          'message': 'Max retries exceeded trying to reach %s' % url
      }
      logging.error(error['message'])
    else:
      # Even if the call is successful, still return the last error
      # encountered on an earlier attempt, if any.
      return content, error

    if should_retry and time.time() < deadline:  # pragma: no cover
      # Wait, then retry if applicable.
      wait_time = _GetBackoffSeconds(
          retry_backoff, tries, maximum_retry_interval)
      logging.info('Retrying connection to %s in %d seconds', url, wait_time)
      time.sleep(wait_time)
      tries += 1
    else:
      if should_retry:
        # Indicate in the error that the retry timeout was reached.
        error['retry_timeout'] = True
      break

  logging.error('Failed to get an adequate response from %s. No data could be '
                'retrieved', url)
  return None, error
121 179
122 180
123 def GetSwarmingTaskRequest(task_id, http_client): 181 def GetSwarmingTaskRequest(task_id, http_client):
124 """Returns an instance of SwarmingTaskRequest representing the given task.""" 182 """Returns an instance of SwarmingTaskRequest representing the given task."""
125 swarming_server_host = waterfall_config.GetSwarmingSettings().get( 183 swarming_server_host = waterfall_config.GetSwarmingSettings().get(
126 'server_host') 184 'server_host')
127 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % ( 185 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % (
128 swarming_server_host, task_id) 186 swarming_server_host, task_id)
129 content, error = _SendRequestToServer(url, http_client) 187 content, error = _SendRequestToServer(url, http_client)
130 188
(...skipping 20 matching lines...) Expand all
151 request.priority = max(100, swarming_settings.get('default_request_priority')) 209 request.priority = max(100, swarming_settings.get('default_request_priority'))
152 request.expiration_secs = request_expiration_hours * 60 * 60 210 request.expiration_secs = request_expiration_hours * 60 * 60
153 211
154 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit']) 212 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit'])
155 213
156 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get( 214 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get(
157 'server_host') 215 'server_host')
158 response_data, error = _SendRequestToServer( 216 response_data, error = _SendRequestToServer(
159 url, http_client, request.Serialize()) 217 url, http_client, request.Serialize())
160 218
161 # TODO(lijeffrey): Handle error in calling functions.
162 if not error: 219 if not error:
163 return json.loads(response_data)['task_id'], None 220 return json.loads(response_data)['task_id'], None
164 221
165 return None, error 222 return None, error
166 223
167 224
168 def ListSwarmingTasksDataByTags( 225 def ListSwarmingTasksDataByTags(
169 master_name, builder_name, build_number, http_client, 226 master_name, builder_name, build_number, http_client,
170 additional_tag_filters=None): 227 additional_tag_filters=None):
171 """Downloads tasks data from swarming server. 228 """Downloads tasks data from swarming server.
(...skipping 303 matching lines...) Expand 10 before | Expand all | Expand 10 after
475 for isolated_data in list_isolated_data: 532 for isolated_data in list_isolated_data:
476 output_json, _ = _DownloadTestResults(isolated_data, http_client) 533 output_json, _ = _DownloadTestResults(isolated_data, http_client)
477 if not output_json: 534 if not output_json:
478 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. 535 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults.
479 return None 536 return None
480 shard_results.append(output_json) 537 shard_results.append(output_json)
481 538
482 if len(list_isolated_data) == 1: 539 if len(list_isolated_data) == 1:
483 return shard_results[0] 540 return shard_results[0]
484 return _MergeSwarmingTestShards(shard_results) 541 return _MergeSwarmingTestShards(shard_results)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698