Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(433)

Side by Side Diff: appengine/findit/waterfall/swarming_util.py

Issue 2526963002: [Findit] Implement retry within swarming_util.py when making server calls (Closed)
Patch Set: Self-review Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import base64 5 import base64
6 from collections import defaultdict 6 from collections import defaultdict
7 import json 7 import json
8 import logging 8 import logging
9 import time
9 import urllib 10 import urllib
10 import zlib 11 import zlib
11 12
12 from google.appengine.api.urlfetch_errors import DeadlineExceededError 13 from google.appengine.api.urlfetch_errors import DeadlineExceededError
13 from google.appengine.api.urlfetch_errors import DownloadError 14 from google.appengine.api.urlfetch_errors import DownloadError
14 from google.appengine.api.urlfetch_errors import ConnectionClosedError 15 from google.appengine.api.urlfetch_errors import ConnectionClosedError
15 from google.appengine.ext import ndb 16 from google.appengine.ext import ndb
16 17
17 from common import auth_util 18 from common import auth_util
18 from model.wf_step import WfStep 19 from model.wf_step import WfStep
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
58 TASK_FAILED = 2 59 TASK_FAILED = 2
59 60
60 # Swarming task exit code descriptions. 61 # Swarming task exit code descriptions.
61 EXIT_CODE_DESCRIPTIONS = { 62 EXIT_CODE_DESCRIPTIONS = {
62 ALL_TESTS_PASSED: 'All tests passed', 63 ALL_TESTS_PASSED: 'All tests passed',
63 SOME_TESTS_FAILED: 'Some tests failed', 64 SOME_TESTS_FAILED: 'Some tests failed',
64 TASK_FAILED: 'Swarming task failed', 65 TASK_FAILED: 'Swarming task failed',
65 } 66 }
66 67
67 68
69 def _GetBackoffSeconds(retry_backoff, tries, maximum_retry_interval):
70 """Returns how many seconds to wait before next retry.
71
72 Params:
73 retry_backoff (int): The base backoff in seconds.
74 tries (int): Indicates how many tries have been done.
75 maximum_retry_interval (int): The upper limit in seconds of how long to wait
76 between retries.
77 """
78 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval)
79
80
def _SendRequestToServer(url, http_client, post_data=None):
  """Sends GET/POST request to arbitrary url and returns response content.

  Because the Swarming and Isolated servers that _SendRequestToServer tries to
  contact are prone to outages, exceptions trying to reach them may occur, thus
  this method retries. These occurrences are monitored and documented even if
  the request eventually succeeds after retrying, with the last error
  encountered being the one that is reported.

  Args:
    url (str): The url to send the request to.
    http_client (HttpClient): The httpclient object with which to make the
      server calls.
    post_data (dict): Data/params to send with the request, if any.

  Returns:
    content (dict), error (dict): The content from the server and the last
    error encountered trying to retrieve it, or None if no error occurred.
  """
  headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()}

  # Serialize the post data once, before the retry loop; serializing inside
  # the loop would re-encode the already-serialized string on each retry.
  if post_data:
    post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':'))
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    headers['Content-Length'] = len(post_data)

  swarming_settings = waterfall_config.GetSwarmingSettings()
  should_retry = swarming_settings.get('should_retry_server')
  timeout_seconds = swarming_settings.get('server_retry_timeout') * 60 * 60
  maximum_retry_interval = swarming_settings.get(
      'maximum_server_contact_retry_interval_seconds')
  deadline = time.time() + timeout_seconds
  retry_backoff = 60
  tries = 1
  # The last error encountered across all attempts; reported even if a later
  # attempt succeeds.
  error = None

  while True:
    # Track this attempt's outcome separately from the last recorded error so
    # that an earlier failure does not mask a later successful attempt.
    attempt_error = None
    status_code = None
    try:
      if post_data:
        status_code, content = http_client.Post(url, post_data, headers=headers)
      else:
        status_code, content = http_client.Get(url, headers=headers)
    except ConnectionClosedError as e:
      attempt_error = {
          'code': URLFETCH_CONNECTION_CLOSED_ERROR,
          'message': e.message
      }
    except DeadlineExceededError as e:
      attempt_error = {
          'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
          'message': e.message
      }
    except DownloadError as e:
      attempt_error = {
          'code': URLFETCH_DOWNLOAD_ERROR,
          'message': e.message
      }
    except Exception as e:  # pragma: no cover
      attempt_error = {
          'code': UNKNOWN,
          'message': e.message
      }

    if attempt_error or status_code != 200:
      # The retry upon 50x (501 excluded) is automatically handled in the
      # underlying http_client.
      # By default, it retries 5 times with exponential backoff.
      error = attempt_error or {
          'code': EXCEEDED_MAX_RETRIES_ERROR,
          'message': 'Max retries exceeded trying to reach %s' % url
      }
      logging.error(error['message'])
    else:
      # Even if the call is successful, still return the last error
      # encountered.
      return content, error

    if should_retry and time.time() < deadline:  # pragma: no cover
      # Wait, then retry if applicable.
      wait_time = _GetBackoffSeconds(
          retry_backoff, tries, maximum_retry_interval)
      logging.info('Retrying connection to %s in %d seconds', url, wait_time)
      time.sleep(wait_time)
      tries += 1
    else:
      if should_retry:
        # Indicate in the error that the retry timeout was reached.
        error['retry_timeout'] = True
      break

  logging.error('Failed to get an adequate response from %s. No data could be '
                'retrieved', url)
  return None, error
117 172
118 173
119 def GetSwarmingTaskRequest(task_id, http_client): 174 def GetSwarmingTaskRequest(task_id, http_client):
120 """Returns an instance of SwarmingTaskRequest representing the given task.""" 175 """Returns an instance of SwarmingTaskRequest representing the given task."""
121 swarming_server_host = waterfall_config.GetSwarmingSettings().get( 176 swarming_server_host = waterfall_config.GetSwarmingSettings().get(
122 'server_host') 177 'server_host')
123 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % ( 178 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % (
124 swarming_server_host, task_id) 179 swarming_server_host, task_id)
125 content, error = _SendRequestToServer(url, http_client) 180 content, error = _SendRequestToServer(url, http_client)
126 181
(...skipping 20 matching lines...) Expand all
147 request.priority = max(100, swarming_settings.get('default_request_priority')) 202 request.priority = max(100, swarming_settings.get('default_request_priority'))
148 request.expiration_secs = request_expiration_hours * 60 * 60 203 request.expiration_secs = request_expiration_hours * 60 * 60
149 204
150 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit']) 205 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit'])
151 206
152 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get( 207 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get(
153 'server_host') 208 'server_host')
154 response_data, error = _SendRequestToServer( 209 response_data, error = _SendRequestToServer(
155 url, http_client, request.Serialize()) 210 url, http_client, request.Serialize())
156 211
157 # TODO(lijeffrey): Handle error in calling functions.
158 if not error: 212 if not error:
159 return json.loads(response_data)['task_id'], None 213 return json.loads(response_data)['task_id'], None
160 214
161 return None, error 215 return None, error
162 216
163 217
164 def ListSwarmingTasksDataByTags( 218 def ListSwarmingTasksDataByTags(
165 master_name, builder_name, build_number, http_client, 219 master_name, builder_name, build_number, http_client,
166 additional_tag_filters=None): 220 additional_tag_filters=None):
167 """Downloads tasks data from swarming server. 221 """Downloads tasks data from swarming server.
(...skipping 303 matching lines...) Expand 10 before | Expand all | Expand 10 after
471 for isolated_data in list_isolated_data: 525 for isolated_data in list_isolated_data:
472 output_json, _ = _DownloadTestResults(isolated_data, http_client) 526 output_json, _ = _DownloadTestResults(isolated_data, http_client)
473 if not output_json: 527 if not output_json:
474 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. 528 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults.
475 return None 529 return None
476 shard_results.append(output_json) 530 shard_results.append(output_json)
477 531
478 if len(list_isolated_data) == 1: 532 if len(list_isolated_data) == 1:
479 return shard_results[0] 533 return shard_results[0]
480 return _MergeSwarmingTestShards(shard_results) 534 return _MergeSwarmingTestShards(shard_results)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698