Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(80)

Side by Side Diff: appengine/swarming/swarming_bot/bot_code/task_runner.py

Issue 1373133004: Fixes and add smoke test: hard timeout on isolated task. (Closed) Base URL: git@github.com:luci/luci-py.git@master
Patch Set: . Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « appengine/swarming/swarming_bot/__main__.py ('k') | appengine/swarming/tools/start_bot.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # Copyright 2013 The Swarming Authors. All rights reserved. 1 # Copyright 2013 The Swarming Authors. All rights reserved.
2 # Use of this source code is governed by the Apache v2.0 license that can be 2 # Use of this source code is governed by the Apache v2.0 license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 """Runs a Swarming task. 5 """Runs a Swarming task.
6 6
7 Downloads all the necessary files to run the task, executes the command and 7 Downloads all the necessary files to run the task, executes the command and
8 streams results back to the Swarming server. 8 streams results back to the Swarming server.
9 9
10 The process exit code is 0 when the task was executed, even if the task itself 10 The process exit code is 0 when the task was executed, even if the task itself
(...skipping 327 matching lines...) Expand 10 before | Expand all | Expand 10 after
338 u'must_signal_internal_failure': None, 338 u'must_signal_internal_failure': None,
339 u'version': OUT_VERSION, 339 u'version': OUT_VERSION,
340 } 340 }
341 341
342 output_chunk_start = 0 342 output_chunk_start = 0
343 stdout = '' 343 stdout = ''
344 exit_code = None 344 exit_code = None
345 had_hard_timeout = False 345 had_hard_timeout = False
346 had_io_timeout = False 346 had_io_timeout = False
347 must_signal_internal_failure = None 347 must_signal_internal_failure = None
348 kill_sent = False
348 timed_out = None 349 timed_out = None
349 try: 350 try:
350 calc = lambda: calc_yield_wait( 351 calc = lambda: calc_yield_wait(
351 task_details, start, last_io, timed_out, stdout) 352 task_details, start, last_io, timed_out, stdout)
352 maxsize = lambda: MAX_CHUNK_SIZE - len(stdout) 353 maxsize = lambda: MAX_CHUNK_SIZE - len(stdout)
353 last_io = monotonic_time() 354 last_io = monotonic_time()
354 for _, new_data in proc.yield_any(maxsize=maxsize, soft_timeout=calc): 355 for _, new_data in proc.yield_any(maxsize=maxsize, soft_timeout=calc):
355 now = monotonic_time() 356 now = monotonic_time()
356 if new_data: 357 if new_data:
357 stdout += new_data 358 stdout += new_data
358 last_io = now 359 last_io = now
359 360
360 # Post update if necessary. 361 # Post update if necessary.
361 if should_post_update(stdout, now, last_packet): 362 if should_post_update(stdout, now, last_packet):
362 last_packet = monotonic_time() 363 last_packet = monotonic_time()
363 params['cost_usd'] = ( 364 params['cost_usd'] = (
364 cost_usd_hour * (last_packet - task_start) / 60. / 60.) 365 cost_usd_hour * (last_packet - task_start) / 60. / 60.)
365 post_update(swarming_server, params, None, stdout, output_chunk_start) 366 post_update(swarming_server, params, None, stdout, output_chunk_start)
366 output_chunk_start += len(stdout) 367 output_chunk_start += len(stdout)
367 stdout = '' 368 stdout = ''
368 369
369 # Send signal on timeout if necessary. Both are failures, not 370 # Send signal on timeout if necessary. Both are failures, not
370 # internal_failures. 371 # internal_failures.
371 # Eventually kill but return 0 so bot_main.py doesn't cancel the task. 372 # Eventually kill but return 0 so bot_main.py doesn't cancel the task.
372 if not timed_out: 373 if not timed_out:
373 if now - last_io > task_details.io_timeout: 374 if now - last_io > task_details.io_timeout:
374 had_io_timeout = True 375 had_io_timeout = True
375 logging.warning('I/O timeout') 376 logging.warning('I/O timeout; sending SIGTERM')
376 try: 377 proc.terminate()
377 proc.terminate()
378 except OSError:
379 pass
380 timed_out = monotonic_time() 378 timed_out = monotonic_time()
381 elif now - start > task_details.hard_timeout: 379 elif now - start > task_details.hard_timeout:
382 had_hard_timeout = True 380 had_hard_timeout = True
383 logging.warning('Hard timeout') 381 logging.warning('Hard timeout; sending SIGTERM')
384 try: 382 proc.terminate()
385 proc.terminate()
386 except OSError:
387 pass
388 timed_out = monotonic_time() 383 timed_out = monotonic_time()
389 else: 384 else:
390 # During grace period. 385 # During grace period.
391 if now >= timed_out + task_details.grace_period: 386 if not kill_sent and now >= timed_out + task_details.grace_period:
392 # Now kill for real. The user can distinguish between the following 387 # Now kill for real. The user can distinguish between the following
393 # states: 388 # states:
394 # - signal but process exited within grace period, 389 # - signal but process exited within grace period,
395 # (hard_|io_)_timed_out will be set but the process exit code will 390 # (hard_|io_)_timed_out will be set but the process exit code will
396 # be script provided. 391 # be script provided.
397 # - process exited late, exit code will be -9 on posix. 392 # - process exited late, exit code will be -9 on posix.
398 try: 393 logging.warning('Grace exhausted; sending SIGKILL')
399 logging.warning('proc.kill() after grace') 394 proc.kill()
400 proc.kill() 395 kill_sent = True
401 except OSError:
402 pass
403 logging.info('Waiting for proces exit') 396 logging.info('Waiting for proces exit')
404 exit_code = proc.wait() 397 exit_code = proc.wait()
405 except MustExit as e: 398 except MustExit as e:
406 # TODO(maruel): Send SIGTERM to the child process and give it 399 # TODO(maruel): Send SIGTERM to the child process and give it
407 # task_details.grace_period to terminate. 400 # task_details.grace_period to terminate.
408 must_signal_internal_failure = ( 401 must_signal_internal_failure = (
409 u'task_runner received signal %s' % e.signal) 402 u'task_runner received signal %s' % e.signal)
410 exit_code = kill_and_wait(proc, 'signal %d' % e.signal) 403 exit_code = kill_and_wait(proc, 'signal %d' % e.signal)
411 except (IOError, OSError): 404 except (IOError, OSError):
412 # Something wrong happened, try to kill the child process. 405 # Something wrong happened, try to kill the child process.
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
484 if options.start > now: 477 if options.start > now:
485 options.start = now 478 options.start = now
486 479
487 try: 480 try:
488 load_and_run( 481 load_and_run(
489 options.in_file, remote, options.cost_usd_hour, options.start, 482 options.in_file, remote, options.cost_usd_hour, options.start,
490 options.out_file) 483 options.out_file)
491 return 0 484 return 0
492 finally: 485 finally:
493 logging.info('quitting') 486 logging.info('quitting')
OLDNEW
« no previous file with comments | « appengine/swarming/swarming_bot/__main__.py ('k') | appengine/swarming/tools/start_bot.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698