Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2575)

Unified Diff: appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py

Issue 2592683002: Retry non-streaming gRPC calls (Closed)
Patch Set: Disable incorrect pylint warning Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | appengine/swarming/swarming_bot/bot_code/remote_client_grpc_test.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py
diff --git a/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py b/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py
index 7393c415d9ade7b74117a7e3a9054f27eb1715fe..e68e067fc90434a6699dee44d9f5fa3c8553ab40 100644
--- a/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py
+++ b/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py
@@ -7,12 +7,16 @@
import json
import logging
+import math
+import random
+import time
import grpc
import google.protobuf.json_format
from proto_bot import swarming_bot_pb2
from remote_client_errors import InternalError
from remote_client_errors import PollError
+from utils import net
# How long to wait for a response from the server. Keeping the same as
@@ -20,6 +24,14 @@ from remote_client_errors import PollError
NET_CONNECTION_TIMEOUT_SEC = 5*60
+# Maximum number of attempts for a gRPC call (the first try plus retries)
+MAX_GRPC_ATTEMPTS = 30
+
+
+# Longest time to sleep between gRPC calls
+MAX_GRPC_SLEEP = 10.
+
+
class RemoteClientGrpc(object):
"""RemoteClientGrpc knows how to make calls via gRPC.
"""
@@ -68,8 +80,7 @@ class RemoteClientGrpc(object):
google.protobuf.json_format.ParseDict(params, request)
# Perform update
- response = self._stub.TaskUpdate(request,
- timeout=NET_CONNECTION_TIMEOUT_SEC)
+ response = call_grpc(self._stub.TaskUpdate, request)
logging.debug('post_task_update() = %s', request)
if response.error:
raise InternalError(response.error)
@@ -82,7 +93,7 @@ class RemoteClientGrpc(object):
request.msg = message
logging.error('post_task_error() = %s', request)
- response = self._stub.TaskError(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
+ response = call_grpc(self._stub.TaskError, request)
return response.ok
def _attributes_json_to_proto(self, json_attr, msg):
@@ -96,7 +107,7 @@ class RemoteClientGrpc(object):
def do_handshake(self, attributes):
request = swarming_bot_pb2.HandshakeRequest()
self._attributes_json_to_proto(attributes, request.attributes)
- response = self._stub.Handshake(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
+ response = call_grpc(self._stub.Handshake, request)
resp = {
'server_version': response.server_version,
'bot_version': response.bot_version,
@@ -114,7 +125,7 @@ class RemoteClientGrpc(object):
request = swarming_bot_pb2.PollRequest()
self._attributes_json_to_proto(attributes, request.attributes)
# TODO(aludwin): gRPC-specific exception handling (raise PollError).
- response = self._stub.Poll(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
+ response = call_grpc(self._stub.Poll, request)
if response.cmd == swarming_bot_pb2.PollResponse.UPDATE:
return 'update', response.version
@@ -174,7 +185,7 @@ class RemoteClientGrpc(object):
logging.info('Updating to version: %s', bot_version)
request = swarming_bot_pb2.BotUpdateRequest()
request.bot_version = bot_version
- response = self._stub.BotUpdate(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
+ response = call_grpc(self._stub.BotUpdate, request)
with open(new_zip_fn, 'wb') as f:
f.write(response.bot_code)
@@ -222,3 +233,20 @@ def insert_dict_as_submessage(message, keyname, value):
"""
sub_msg = getattr(message, keyname)
google.protobuf.json_format.Parse(json.dumps(value), sub_msg)
+
+
+def call_grpc(method, request):
+  """Calls a gRPC method, retrying while it fails with UNAVAILABLE.
+
+  Args:
+    method: callable invoked as method(request, timeout=...); the callers in
+        this file pass methods of the gRPC stub.
+    request: request message, passed unchanged to every attempt.
+
+  Returns:
+    Whatever `method` returns on the first attempt that does not raise.
+
+  Raises:
+    grpc.RpcError: immediately when the status code is anything other than
+        UNAVAILABLE, or the last UNAVAILABLE error once MAX_GRPC_ATTEMPTS
+        attempts have all failed.
+  """
+  for attempt in range(1, MAX_GRPC_ATTEMPTS+1):
+    try:
+      return method(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
+    except grpc.RpcError as g:
+      # Only UNAVAILABLE is retried; every other status goes to the caller.
+      if g.code() is not grpc.StatusCode.UNAVAILABLE:
+        raise
+      logging.warning('call_grpc - proxy is unavailable (attempt %d/%d)',
+                      attempt, MAX_GRPC_ATTEMPTS)
+      if attempt >= MAX_GRPC_ATTEMPTS:
+        # Out of attempts: re-raise right away rather than sleeping first, so
+        # the caller is not delayed by a back-off that no retry will follow.
+        raise
+    # Back off before the next attempt; MAX_GRPC_SLEEP is passed to the helper
+    # as the cap on the delay.
+    time.sleep(net.calculate_sleep_before_retry(attempt, MAX_GRPC_SLEEP))
« no previous file with comments | « no previous file | appengine/swarming/swarming_bot/bot_code/remote_client_grpc_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698