Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2154)

Unified Diff: appengine/findit/waterfall/swarming_util.py

Issue 2526963002: [Findit] Implement retry within swarming_util.py when making server calls (Closed)
Patch Set: Addressing comments Rebase Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/findit/waterfall/swarming_util.py
diff --git a/appengine/findit/waterfall/swarming_util.py b/appengine/findit/waterfall/swarming_util.py
index 20a1918741c793fb4665a2bae6a65a4a3a792289..44e2e8fe0068d013ce02e52a771a642447c65480 100644
--- a/appengine/findit/waterfall/swarming_util.py
+++ b/appengine/findit/waterfall/swarming_util.py
@@ -6,6 +6,7 @@ import base64
from collections import defaultdict
import json
import logging
+import time
import urllib
import zlib
@@ -69,55 +70,112 @@ EXIT_CODE_DESCRIPTIONS = {
}
+def _GetBackoffSeconds(retry_backoff, tries, maximum_retry_interval):
+  """Returns how many seconds to wait before the next retry.
+
+  Args:
+    retry_backoff (int): The base backoff in seconds.
+    tries (int): Indicates how many tries have been done.
+    maximum_retry_interval (int): The upper limit in seconds of how long to wait
+      between retries.
+  """
+ return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval)
+
+
def _SendRequestToServer(url, http_client, post_data=None):
- """Sends GET/POST request to arbitrary url and returns response content."""
+ """Sends GET/POST request to arbitrary url and returns response content.
+
+  Because the Swarming and Isolated servers that _SendRequestToServer tries to
+  contact are prone to outages, exceptions may occur when trying to reach
+  them, so this method should retry. We want to monitor and document these
+  occurrences even if the request eventually succeeds after retrying, with the
+  last error encountered being the one that is reported.
+
+ Args:
+ url (str): The url to send the request to.
+ http_client (HttpClient): The httpclient object with which to make the
+ server calls.
+ post_data (dict): Data/params to send with the request, if any.
+ swarming_task (WfSwarmingTask, FlakeSwarmingTask): An optional swarming
stgao 2016/12/01 20:33:46 This seems invalid now.
lijeffrey 2016/12/01 20:51:11 Good catch, originally I had designed this to cons
+ task with which to capture errors.
+
+ Returns:
+ content (dict), error (dict): The content from the server and the last error
+ encountered trying to retrieve it.
+ """
headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()}
+ swarming_settings = waterfall_config.GetSwarmingSettings()
+ should_retry = swarming_settings.get('should_retry_server')
+ timeout_seconds = (
+ swarming_settings.get('server_retry_timeout_hours') * 60 * 60)
+ maximum_retry_interval = swarming_settings.get(
+ 'maximum_server_contact_retry_interval_seconds')
+ deadline = time.time() + timeout_seconds
+ retry_backoff = 60
+ tries = 1
error = None
- try:
- if post_data:
- post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':'))
- headers['Content-Type'] = 'application/json; charset=UTF-8'
- headers['Content-Length'] = len(post_data)
- status_code, content = http_client.Post(url, post_data, headers=headers)
+ if post_data:
+ post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':'))
stgao 2016/12/01 20:33:46 Why we need to sort?
lijeffrey 2016/12/01 20:51:12 Chan may be more familiar with how post_data needs
+ headers['Content-Type'] = 'application/json; charset=UTF-8'
+ headers['Content-Length'] = len(post_data)
+
+ while True:
+ try:
+ if post_data:
+ status_code, content = http_client.Post(url, post_data, headers=headers)
+ else:
+ status_code, content = http_client.Get(url, headers=headers)
+ except ConnectionClosedError as e:
+ error = {
+ 'code': URLFETCH_CONNECTION_CLOSED_ERROR,
+ 'message': e.message
+ }
+ except DeadlineExceededError as e:
+ error = {
+ 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
+ 'message': e.message
+ }
+ except DownloadError as e:
+ error = {
+ 'code': URLFETCH_DOWNLOAD_ERROR,
+ 'message': e.message
+ }
+ except Exception as e: # pragma: no cover
+ error = {
+ 'code': UNKNOWN,
+ 'message': e.message
+ }
+
+ if error or status_code != 200:
+ # The retry upon 50x (501 excluded) is automatically handled in the
+ # underlying http_client.
+ # By default, it retries 5 times with exponential backoff.
+ error = error or {
+ 'code': EXCEEDED_MAX_RETRIES_ERROR,
+ 'message': 'Max retries exceeded trying to reach %s' % url
+ }
+ logging.error(error['message'])
else:
- status_code, content = http_client.Get(url, headers=headers)
- except ConnectionClosedError as e:
- error = {
- 'code': URLFETCH_CONNECTION_CLOSED_ERROR,
- 'message': e.message
- }
- except DeadlineExceededError as e:
- error = {
- 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
- 'message': e.message
- }
- except DownloadError as e:
- error = {
- 'code': URLFETCH_DOWNLOAD_ERROR,
- 'message': e.message
- }
- except Exception as e: # pragma: no cover
- error = {
- 'code': UNKNOWN,
- 'message': e.message
- }
- # Still raise an exception here for cases not encountered before in order
- # to see what went wrong in the logs.
- raise e
-
- if error or status_code != 200:
- # The retry upon 50x (501 excluded) is automatically handled in the
- # underlying http_client.
- # By default, it retries 5 times with exponential backoff.
- error = error or {
- 'code': EXCEEDED_MAX_RETRIES_ERROR,
- 'message': 'Max retries exceeded trying to reach %s' % url
- }
- logging.error(error['message'])
- return None, error
+ # Even if the call is successful, still return the last error encountered.
+ return content, error
+
+ if should_retry and time.time() < deadline: # pragma: no cover
+ # Wait, then retry if applicable.
+ wait_time = _GetBackoffSeconds(
+ retry_backoff, tries, maximum_retry_interval)
+ logging.info('Retrying connection to %s in %d seconds', url, wait_time)
+ time.sleep(wait_time)
+ tries += 1
+ else:
+ if should_retry:
+ # Indicate in the error that the retry timeout was reached.
+ error['retry_timeout'] = True
+ break
- return content, None
+ logging.error('Failed to get an adequate response from %s. No data could be '
+ 'retrieved', url)
+ return None, error
def GetSwarmingTaskRequest(task_id, http_client):
@@ -158,7 +216,6 @@ def TriggerSwarmingTask(request, http_client):
response_data, error = _SendRequestToServer(
url, http_client, request.Serialize())
- # TODO(lijeffrey): Handle error in calling functions.
if not error:
return json.loads(response_data)['task_id'], None

Powered by Google App Engine
This is Rietveld 408576698