Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(308)

Side by Side Diff: appengine/swarming/swarming_bot/bot_code/bot_main.py

Issue 2593863002: Fix evil retry loop on poll errors. (Closed)
Patch Set: Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2013 The LUCI Authors. All rights reserved. 1 # Copyright 2013 The LUCI Authors. All rights reserved.
2 # Use of this source code is governed under the Apache License, Version 2.0 2 # Use of this source code is governed under the Apache License, Version 2.0
3 # that can be found in the LICENSE file. 3 # that can be found in the LICENSE file.
4 4
5 """Swarming bot main process. 5 """Swarming bot main process.
6 6
7 This is the program that communicates with the Swarming server, ensures the code 7 This is the program that communicates with the Swarming server, ensures the code
8 is always up to date and executes a child process to run tasks and upload 8 is always up to date and executes a child process to run tasks and upload
9 results back. 9 results back.
10 10
(...skipping 25 matching lines...) Expand all
36 import bot_auth 36 import bot_auth
37 import common 37 import common
38 import file_refresher 38 import file_refresher
39 import remote_client 39 import remote_client
40 import remote_client_errors 40 import remote_client_errors
41 import singleton 41 import singleton
42 from api import bot 42 from api import bot
43 from api import os_utilities 43 from api import os_utilities
44 from api import platforms 44 from api import platforms
45 from utils import file_path 45 from utils import file_path
46 from utils import net
47 from utils import on_error 46 from utils import on_error
48 from utils import subprocess42 47 from utils import subprocess42
49 from utils import zip_package 48 from utils import zip_package
50 49
51 50
52 # Used to opportunistically set the error handler to notify the server when the 51 # Used to opportunistically set the error handler to notify the server when the
53 # process exits due to an exception. 52 # process exits due to an exception.
54 _ERROR_HANDLER_WAS_REGISTERED = False 53 _ERROR_HANDLER_WAS_REGISTERED = False
55 54
56 55
(...skipping 487 matching lines...) Expand 10 before | Expand all | Expand 10 after
544 try: 543 try:
545 botobj._update_dimensions(get_dimensions(botobj)) 544 botobj._update_dimensions(get_dimensions(botobj))
546 botobj._update_state(get_state(botobj, consecutive_sleeps)) 545 botobj._update_state(get_state(botobj, consecutive_sleeps))
547 did_something = poll_server(botobj, quit_bit, last_action) 546 did_something = poll_server(botobj, quit_bit, last_action)
548 if did_something: 547 if did_something:
549 last_action = time.time() 548 last_action = time.time()
550 consecutive_sleeps = 0 549 consecutive_sleeps = 0
551 else: 550 else:
552 consecutive_sleeps += 1 551 consecutive_sleeps += 1
553 except Exception as e: 552 except Exception as e:
554 logging.exception('poll_server failed') 553 logging.exception('poll_server failed in a completely unexpected way')
Vadim Sh. 2016/12/21 00:27:16 :) in theory, this should never happen.
555 msg = '%s\n%s' % (e, traceback.format_exc()[-2048:]) 554 msg = '%s\n%s' % (e, traceback.format_exc()[-2048:])
556 botobj.post_error(msg) 555 botobj.post_error(msg)
557 consecutive_sleeps = 0 556 consecutive_sleeps = 0
557 # Sleep a bit as a precaution to avoid hammering the server.
558 quit_bit.wait(10)
558 logging.info('Quitting') 559 logging.info('Quitting')
559 560
560 # Tell the server we are going away. 561 # Tell the server we are going away.
561 botobj.post_event('bot_shutdown', 'Signal was received') 562 botobj.post_event('bot_shutdown', 'Signal was received')
562 return 0 563 return 0
563 564
564 565
565 def poll_server(botobj, quit_bit, last_action): 566 def poll_server(botobj, quit_bit, last_action):
566 """Polls the server to run one loop. 567 """Polls the server to run one loop.
567 568
568 Returns True if executed some action, False if server asked the bot to sleep. 569 Returns True if executed some action, False if server asked the bot to sleep.
569 """ 570 """
570 # Access to a protected member _XXX of a client class - pylint: disable=W0212
571 start = time.time() 571 start = time.time()
572 cmd, value = botobj.remote.poll(botobj._attributes) 572 try:
573 if cmd == '': 573 cmd, value = botobj.remote.poll(botobj._attributes)
Vadim Sh. 2016/12/21 00:27:16 on errors, cmd was None here. None != '' => no sleep…
574 except remote_client_errors.PollError as e:
574 # Back off on failure. 575 # Back off on failure.
575 time.sleep(max(1, min(60, botobj.state.get('sleep_streak', 10) * 2))) 576 delay = max(1, min(60, botobj.state.get('sleep_streak', 10) * 2))
577 logging.warning('Poll failed (%s), sleeping %.1f sec', e, delay)
578 quit_bit.wait(delay)
576 return False 579 return False
577 logging.debug('Server response:\n%s: %s', cmd, value) 580 logging.debug('Server response:\n%s: %s', cmd, value)
578 581
579 if cmd == 'sleep': 582 if cmd == 'sleep':
580 # Value is duration 583 # Value is duration
581 call_hook(botobj, 'on_bot_idle', max(0, time.time() - last_action)) 584 call_hook(botobj, 'on_bot_idle', max(0, time.time() - last_action))
582 quit_bit.wait(value) 585 quit_bit.wait(value)
583 return False 586 return False
584 587
585 if cmd == 'terminate': 588 if cmd == 'terminate':
(...skipping 363 matching lines...) Expand 10 before | Expand all | Expand 10 after
949 952
950 error = None 953 error = None
951 if len(args) != 0: 954 if len(args) != 0:
952 error = 'Unexpected arguments: %s' % args 955 error = 'Unexpected arguments: %s' % args
953 try: 956 try:
954 return run_bot(error) 957 return run_bot(error)
955 finally: 958 finally:
956 call_hook( 959 call_hook(
957 bot.Bot(None, None, None, None, base_dir, None), 'on_bot_shutdown') 960 bot.Bot(None, None, None, None, base_dir, None), 'on_bot_shutdown')
958 logging.info('main() returning') 961 logging.info('main() returning')
OLDNEW
« no previous file with comments | « appengine/swarming/swarming_bot/api/bot.py ('k') | appengine/swarming/swarming_bot/bot_code/remote_client.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698