Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(160)

Unified Diff: appengine/swarming/swarming_bot/bot_code/bot_main.py

Issue 2593863002: Fix evil retry loop on poll errors. (Closed)
Patch Set: Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/swarming/swarming_bot/bot_code/bot_main.py
diff --git a/appengine/swarming/swarming_bot/bot_code/bot_main.py b/appengine/swarming/swarming_bot/bot_code/bot_main.py
index 2cdad9663d7cef5153d6a3f624d61c548e9170f0..0ff2793ea0c63aab218d9d09b2c90c034170909a 100644
--- a/appengine/swarming/swarming_bot/bot_code/bot_main.py
+++ b/appengine/swarming/swarming_bot/bot_code/bot_main.py
@@ -43,7 +43,6 @@ from api import bot
from api import os_utilities
from api import platforms
from utils import file_path
-from utils import net
from utils import on_error
from utils import subprocess42
from utils import zip_package
@@ -551,10 +550,12 @@ def run_bot(arg_error):
else:
consecutive_sleeps += 1
except Exception as e:
- logging.exception('poll_server failed')
+ logging.exception('poll_server failed in a completely unexpected way')
Vadim Sh. 2016/12/21 00:27:16 :) in theory, this should never happen.
msg = '%s\n%s' % (e, traceback.format_exc()[-2048:])
botobj.post_error(msg)
consecutive_sleeps = 0
+ # Sleep a bit as a precaution to avoid hammering the server.
+ quit_bit.wait(10)
logging.info('Quitting')
# Tell the server we are going away.
@@ -567,12 +568,14 @@ def poll_server(botobj, quit_bit, last_action):
Returns True if executed some action, False if server asked the bot to sleep.
"""
- # Access to a protected member _XXX of a client class - pylint: disable=W0212
start = time.time()
- cmd, value = botobj.remote.poll(botobj._attributes)
- if cmd == '':
Vadim Sh. 2016/12/21 00:27:16 on errors, cmd was None here. None != '' => no sleep
+ try:
+ cmd, value = botobj.remote.poll(botobj._attributes)
+ except remote_client_errors.PollError as e:
# Back off on failure.
- time.sleep(max(1, min(60, botobj.state.get('sleep_streak', 10) * 2)))
+ delay = max(1, min(60, botobj.state.get('sleep_streak', 10) * 2))
+ logging.warning('Poll failed (%s), sleeping %.1f sec', e, delay)
+ quit_bit.wait(delay)
return False
logging.debug('Server response:\n%s: %s', cmd, value)
« no previous file with comments | « appengine/swarming/swarming_bot/api/bot.py ('k') | appengine/swarming/swarming_bot/bot_code/remote_client.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698