| Index: appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py | 
| diff --git a/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py b/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py | 
| index 7393c415d9ade7b74117a7e3a9054f27eb1715fe..e68e067fc90434a6699dee44d9f5fa3c8553ab40 100644 | 
| --- a/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py | 
| +++ b/appengine/swarming/swarming_bot/bot_code/remote_client_grpc.py | 
| @@ -7,12 +7,16 @@ | 
|  | 
| import json | 
| import logging | 
| +import math | 
| +import random | 
| +import time | 
|  | 
| import grpc | 
| import google.protobuf.json_format | 
| from proto_bot import swarming_bot_pb2 | 
| from remote_client_errors import InternalError | 
| from remote_client_errors import PollError | 
| +from utils import net | 
|  | 
|  | 
| # How long to wait for a response from the server. Keeping the same as | 
| @@ -20,6 +24,14 @@ from remote_client_errors import PollError | 
| NET_CONNECTION_TIMEOUT_SEC = 5*60 | 
|  | 
|  | 
| +# How many times to retry a gRPC call | 
| +MAX_GRPC_ATTEMPTS = 30 | 
| + | 
| + | 
| +# Longest time to sleep between gRPC calls | 
| +MAX_GRPC_SLEEP = 10. | 
| + | 
| + | 
| class RemoteClientGrpc(object): | 
| """RemoteClientGrpc knows how to make calls via gRPC. | 
| """ | 
| @@ -68,8 +80,7 @@ class RemoteClientGrpc(object): | 
| google.protobuf.json_format.ParseDict(params, request) | 
|  | 
| # Perform update | 
| -    response = self._stub.TaskUpdate(request, | 
| -                                     timeout=NET_CONNECTION_TIMEOUT_SEC) | 
| +    response = call_grpc(self._stub.TaskUpdate, request) | 
| logging.debug('post_task_update() = %s', request) | 
| if response.error: | 
| raise InternalError(response.error) | 
| @@ -82,7 +93,7 @@ class RemoteClientGrpc(object): | 
| request.msg = message | 
| logging.error('post_task_error() = %s', request) | 
|  | 
| -    response = self._stub.TaskError(request, timeout=NET_CONNECTION_TIMEOUT_SEC) | 
| +    response = call_grpc(self._stub.TaskError, request) | 
| return response.ok | 
|  | 
| def _attributes_json_to_proto(self, json_attr, msg): | 
| @@ -96,7 +107,7 @@ class RemoteClientGrpc(object): | 
| def do_handshake(self, attributes): | 
| request = swarming_bot_pb2.HandshakeRequest() | 
| self._attributes_json_to_proto(attributes, request.attributes) | 
| -    response = self._stub.Handshake(request, timeout=NET_CONNECTION_TIMEOUT_SEC) | 
| +    response = call_grpc(self._stub.Handshake, request) | 
| resp = { | 
| 'server_version': response.server_version, | 
| 'bot_version': response.bot_version, | 
| @@ -114,7 +125,7 @@ class RemoteClientGrpc(object): | 
| request = swarming_bot_pb2.PollRequest() | 
| self._attributes_json_to_proto(attributes, request.attributes) | 
| # TODO(aludwin): gRPC-specific exception handling (raise PollError). | 
| -    response = self._stub.Poll(request, timeout=NET_CONNECTION_TIMEOUT_SEC) | 
| +    response = call_grpc(self._stub.Poll, request) | 
|  | 
| if response.cmd == swarming_bot_pb2.PollResponse.UPDATE: | 
| return 'update', response.version | 
| @@ -174,7 +185,7 @@ class RemoteClientGrpc(object): | 
| logging.info('Updating to version: %s', bot_version) | 
| request = swarming_bot_pb2.BotUpdateRequest() | 
| request.bot_version = bot_version | 
| -    response = self._stub.BotUpdate(request, timeout=NET_CONNECTION_TIMEOUT_SEC) | 
| +    response = call_grpc(self._stub.BotUpdate, request) | 
| with open(new_zip_fn, 'wb') as f: | 
| f.write(response.bot_code) | 
|  | 
| @@ -222,3 +233,20 @@ def insert_dict_as_submessage(message, keyname, value): | 
| """ | 
| sub_msg = getattr(message, keyname) | 
| google.protobuf.json_format.Parse(json.dumps(value), sub_msg) | 
| + | 
| + | 
| +def call_grpc(method, request): | 
| +  """Retries a command a set number of times""" | 
| +  for attempt in range(1, MAX_GRPC_ATTEMPTS+1): | 
| +    try: | 
| +      return method(request, timeout=NET_CONNECTION_TIMEOUT_SEC) | 
| +    except grpc.RpcError as g: | 
| +      if g.code() is not grpc.StatusCode.UNAVAILABLE: | 
| +        raise | 
| +      logging.warning('call_grpc - proxy is unavailable (attempt %d/%d)', | 
| +                      attempt, MAX_GRPC_ATTEMPTS) | 
| +      grpc_error = g | 
| +      time.sleep(net.calculate_sleep_before_retry(attempt, MAX_GRPC_SLEEP)) | 
| +  # If we get here, it must be because we got (and saved) an error | 
| +  assert grpc_error is not None | 
| +  raise grpc_error | 
|  |