Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1235)

Side by Side Diff: appengine/findit/waterfall/swarming_util.py

Issue 2526963002: [Findit] Implement retry within swarming_util.py when making server calls (Closed)
Patch Set: Addressing comments Rebase Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import base64 5 import base64
6 from collections import defaultdict 6 from collections import defaultdict
7 import json 7 import json
8 import logging 8 import logging
9 import time
9 import urllib 10 import urllib
10 import zlib 11 import zlib
11 12
12 from google.appengine.api.urlfetch_errors import DeadlineExceededError 13 from google.appengine.api.urlfetch_errors import DeadlineExceededError
13 from google.appengine.api.urlfetch_errors import DownloadError 14 from google.appengine.api.urlfetch_errors import DownloadError
14 from google.appengine.api.urlfetch_errors import ConnectionClosedError 15 from google.appengine.api.urlfetch_errors import ConnectionClosedError
15 from google.appengine.ext import ndb 16 from google.appengine.ext import ndb
16 17
17 from common import auth_util 18 from common import auth_util
18 from model.wf_step import WfStep 19 from model.wf_step import WfStep
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
62 TASK_FAILED = 2 63 TASK_FAILED = 2
63 64
64 # Swarming task exit code descriptions. 65 # Swarming task exit code descriptions.
65 EXIT_CODE_DESCRIPTIONS = { 66 EXIT_CODE_DESCRIPTIONS = {
66 ALL_TESTS_PASSED: 'All tests passed', 67 ALL_TESTS_PASSED: 'All tests passed',
67 SOME_TESTS_FAILED: 'Some tests failed', 68 SOME_TESTS_FAILED: 'Some tests failed',
68 TASK_FAILED: 'Swarming task failed', 69 TASK_FAILED: 'Swarming task failed',
69 } 70 }
70 71
71 72
73 def _GetBackoffSeconds(retry_backoff, tries, maximum_retry_interval):
74 """Returns how many seconds to wait before next retry.
75
76 Params:
77 retry_backoff (int): The base backoff in seconds.
78 tries (int): Indicates how many tries have been done.
79 maximum_retry_interval (int): The upper limit in seconds of how long to wait
80 between retries.
81 """
82 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval)
83
84
def _SendRequestToServer(url, http_client, post_data=None):
  """Sends GET/POST request to arbitrary url and returns response content.

  Because the Swarming and Isolated servers that _SendRequestToServer tries to
  contact are prone to outages, exceptions trying to reach them may occur, thus
  this method retries. We want to monitor and document these occurrences even
  if the request eventually succeeds after retrying, with the last error
  encountered being the one that is reported.

  Args:
    url (str): The url to send the request to.
    http_client (HttpClient): The httpclient object with which to make the
      server calls.
    post_data (dict): Data/params to send with the request, if any. When
      present, the request is a POST with a JSON body; otherwise a GET.

  Returns:
    content (dict), error (dict): The content from the server and the last
      error encountered trying to retrieve it (None if no error occurred).
  """
  headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()}
  swarming_settings = waterfall_config.GetSwarmingSettings()
  should_retry = swarming_settings.get('should_retry_server')
  timeout_seconds = (
      swarming_settings.get('server_retry_timeout_hours') * 60 * 60)
  maximum_retry_interval = swarming_settings.get(
      'maximum_server_contact_retry_interval_seconds')
  deadline = time.time() + timeout_seconds
  retry_backoff = 60
  tries = 1
  # Last error seen across all attempts; reported even on eventual success.
  error = None

  if post_data:
    # Serialize deterministically (sorted keys, compact separators) so
    # identical payloads always produce byte-identical request bodies.
    post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':'))
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    headers['Content-Length'] = len(post_data)

  while True:
    # Reset per-attempt state. Without this, an error from an earlier attempt
    # would make a later successful attempt fall into the failure branch below
    # and the success path would be unreachable after any failure. Also
    # initialize status_code so it is never referenced while unbound after an
    # exception.
    attempt_error = None
    status_code = None
    content = None

    try:
      if post_data:
        status_code, content = http_client.Post(url, post_data,
                                                headers=headers)
      else:
        status_code, content = http_client.Get(url, headers=headers)
    except ConnectionClosedError as e:
      attempt_error = {
          'code': URLFETCH_CONNECTION_CLOSED_ERROR,
          'message': e.message
      }
    except DeadlineExceededError as e:
      attempt_error = {
          'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
          'message': e.message
      }
    except DownloadError as e:
      attempt_error = {
          'code': URLFETCH_DOWNLOAD_ERROR,
          'message': e.message
      }
    except Exception as e:  # pragma: no cover
      attempt_error = {
          'code': UNKNOWN,
          'message': e.message
      }

    if attempt_error or status_code != 200:
      # The retry upon 50x (501 excluded) is automatically handled in the
      # underlying http_client.
      # By default, it retries 5 times with exponential backoff.
      error = attempt_error or {
          'code': EXCEEDED_MAX_RETRIES_ERROR,
          'message': 'Max retries exceeded trying to reach %s' % url
      }
      logging.error(error['message'])
    else:
      # Even if the call is successful, still return the last error
      # encountered on an earlier attempt, if any.
      return content, error

    if should_retry and time.time() < deadline:  # pragma: no cover
      # Wait, then retry if applicable.
      wait_time = _GetBackoffSeconds(
          retry_backoff, tries, maximum_retry_interval)
      logging.info('Retrying connection to %s in %d seconds', url, wait_time)
      time.sleep(wait_time)
      tries += 1
    else:
      if should_retry:
        # Indicate in the error that the retry timeout was reached.
        error['retry_timeout'] = True
      break

  logging.error('Failed to get an adequate response from %s. No data could be '
                'retrieved', url)
  return None, error
121 179
122 180
123 def GetSwarmingTaskRequest(task_id, http_client): 181 def GetSwarmingTaskRequest(task_id, http_client):
124 """Returns an instance of SwarmingTaskRequest representing the given task.""" 182 """Returns an instance of SwarmingTaskRequest representing the given task."""
125 swarming_server_host = waterfall_config.GetSwarmingSettings().get( 183 swarming_server_host = waterfall_config.GetSwarmingSettings().get(
126 'server_host') 184 'server_host')
127 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % ( 185 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % (
128 swarming_server_host, task_id) 186 swarming_server_host, task_id)
129 content, error = _SendRequestToServer(url, http_client) 187 content, error = _SendRequestToServer(url, http_client)
130 188
(...skipping 20 matching lines...) Expand all
151 request.priority = max(100, swarming_settings.get('default_request_priority')) 209 request.priority = max(100, swarming_settings.get('default_request_priority'))
152 request.expiration_secs = request_expiration_hours * 60 * 60 210 request.expiration_secs = request_expiration_hours * 60 * 60
153 211
154 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit']) 212 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit'])
155 213
156 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get( 214 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get(
157 'server_host') 215 'server_host')
158 response_data, error = _SendRequestToServer( 216 response_data, error = _SendRequestToServer(
159 url, http_client, request.Serialize()) 217 url, http_client, request.Serialize())
160 218
161 # TODO(lijeffrey): Handle error in calling functions.
162 if not error: 219 if not error:
163 return json.loads(response_data)['task_id'], None 220 return json.loads(response_data)['task_id'], None
164 221
165 return None, error 222 return None, error
166 223
167 224
168 def ListSwarmingTasksDataByTags( 225 def ListSwarmingTasksDataByTags(
169 master_name, builder_name, build_number, http_client, 226 master_name, builder_name, build_number, http_client,
170 additional_tag_filters=None): 227 additional_tag_filters=None):
171 """Downloads tasks data from swarming server. 228 """Downloads tasks data from swarming server.
(...skipping 303 matching lines...) Expand 10 before | Expand all | Expand 10 after
475 for isolated_data in list_isolated_data: 532 for isolated_data in list_isolated_data:
476 output_json, _ = _DownloadTestResults(isolated_data, http_client) 533 output_json, _ = _DownloadTestResults(isolated_data, http_client)
477 if not output_json: 534 if not output_json:
478 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. 535 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults.
479 return None 536 return None
480 shard_results.append(output_json) 537 shard_results.append(output_json)
481 538
482 if len(list_isolated_data) == 1: 539 if len(list_isolated_data) == 1:
483 return shard_results[0] 540 return shard_results[0]
484 return _MergeSwarmingTestShards(shard_results) 541 return _MergeSwarmingTestShards(shard_results)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698