| Index: appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py
|
| diff --git a/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py b/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py
|
| index 7393c415d9ade7b74117a7e3a9054f27eb1715fe..e68e067fc90434a6699dee44d9f5fa3c8553ab40 100644
|
| --- a/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py
|
| +++ b/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py
|
| @@ -7,12 +7,16 @@
|
|
|
| import json
|
| import logging
|
| +import math
|
| +import random
|
| +import time
|
|
|
| import grpc
|
| import google.protobuf.json_format
|
| from proto_bot import swarming_bot_pb2
|
| from remote_client_errors import InternalError
|
| from remote_client_errors import PollError
|
| +from utils import net
|
|
|
|
|
| # How long to wait for a response from the server. Keeping the same as
|
| @@ -20,6 +24,14 @@ from remote_client_errors import PollError
|
| NET_CONNECTION_TIMEOUT_SEC = 5*60
|
|
|
|
|
| +# Maximum number of attempts for a gRPC call (the first try plus retries)
|
| +MAX_GRPC_ATTEMPTS = 30
|
| +
|
| +
|
| +# Longest time to sleep between successive attempts of a gRPC call
|
| +MAX_GRPC_SLEEP = 10.
|
| +
|
| +
|
| class RemoteClientGrpc(object):
|
| """RemoteClientGrpc knows how to make calls via gRPC.
|
| """
|
| @@ -68,8 +80,7 @@ class RemoteClientGrpc(object):
|
| google.protobuf.json_format.ParseDict(params, request)
|
|
|
| # Perform update
|
| - response = self._stub.TaskUpdate(request,
|
| - timeout=NET_CONNECTION_TIMEOUT_SEC)
|
| + response = call_grpc(self._stub.TaskUpdate, request)
|
| logging.debug('post_task_update() = %s', request)
|
| if response.error:
|
| raise InternalError(response.error)
|
| @@ -82,7 +93,7 @@ class RemoteClientGrpc(object):
|
| request.msg = message
|
| logging.error('post_task_error() = %s', request)
|
|
|
| - response = self._stub.TaskError(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
|
| + response = call_grpc(self._stub.TaskError, request)
|
| return response.ok
|
|
|
| def _attributes_json_to_proto(self, json_attr, msg):
|
| @@ -96,7 +107,7 @@ class RemoteClientGrpc(object):
|
| def do_handshake(self, attributes):
|
| request = swarming_bot_pb2.HandshakeRequest()
|
| self._attributes_json_to_proto(attributes, request.attributes)
|
| - response = self._stub.Handshake(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
|
| + response = call_grpc(self._stub.Handshake, request)
|
| resp = {
|
| 'server_version': response.server_version,
|
| 'bot_version': response.bot_version,
|
| @@ -114,7 +125,7 @@ class RemoteClientGrpc(object):
|
| request = swarming_bot_pb2.PollRequest()
|
| self._attributes_json_to_proto(attributes, request.attributes)
|
| # TODO(aludwin): gRPC-specific exception handling (raise PollError).
|
| - response = self._stub.Poll(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
|
| + response = call_grpc(self._stub.Poll, request)
|
|
|
| if response.cmd == swarming_bot_pb2.PollResponse.UPDATE:
|
| return 'update', response.version
|
| @@ -174,7 +185,7 @@ class RemoteClientGrpc(object):
|
| logging.info('Updating to version: %s', bot_version)
|
| request = swarming_bot_pb2.BotUpdateRequest()
|
| request.bot_version = bot_version
|
| - response = self._stub.BotUpdate(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
|
| + response = call_grpc(self._stub.BotUpdate, request)
|
| with open(new_zip_fn, 'wb') as f:
|
| f.write(response.bot_code)
|
|
|
| @@ -222,3 +233,20 @@ def insert_dict_as_submessage(message, keyname, value):
|
| """
|
| sub_msg = getattr(message, keyname)
|
| google.protobuf.json_format.Parse(json.dumps(value), sub_msg)
|
| +
|
| +
|
| +def call_grpc(method, request):
|
| +  """Calls a gRPC method, retrying while it fails with UNAVAILABLE.
|
| +
|
| +  Makes at most MAX_GRPC_ATTEMPTS attempts (always at least one), sleeping
|
| +  between attempts for the time given by net.calculate_sleep_before_retry().
|
| +
|
| +  Args:
|
| +    method: callable invoked as method(request, timeout=...).
|
| +    request: request message passed through to method unchanged.
|
| +
|
| +  Returns:
|
| +    Whatever method returns on the first successful attempt.
|
| +
|
| +  Raises:
|
| +    grpc.RpcError: immediately if the status code is not UNAVAILABLE, or
|
| +        after the final attempt if every attempt failed with UNAVAILABLE.
|
| +  """
|
| +  attempt = 0
|
| +  while True:
|
| +    attempt += 1
|
| +    try:
|
| +      return method(request, timeout=NET_CONNECTION_TIMEOUT_SEC)
|
| +    except grpc.RpcError as g:
|
| +      if g.code() is not grpc.StatusCode.UNAVAILABLE:
|
| +        raise
|
| +      logging.warning('call_grpc - proxy is unavailable (attempt %d/%d)',
|
| +                      attempt, MAX_GRPC_ATTEMPTS)
|
| +      if attempt >= MAX_GRPC_ATTEMPTS:
|
| +        # Out of attempts: re-raise right away, with the original traceback,
|
| +        # instead of sleeping once more before reporting the failure.
|
| +        raise
|
| +    time.sleep(net.calculate_sleep_before_retry(attempt, MAX_GRPC_SLEEP))
|
|
|