Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1922)

Unified Diff: appengine/swarming/swarming_bot/bot_code/task_runner.py

Issue 1949613002: task_runner: always use run_isolated (Closed) Base URL: https://chromium.googlesource.com/external/github.com/luci/luci-py@master
Patch Set: Fix task_runner.py Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/swarming/swarming_bot/bot_code/task_runner.py
diff --git a/appengine/swarming/swarming_bot/bot_code/task_runner.py b/appengine/swarming/swarming_bot/bot_code/task_runner.py
index 05e6b0f3cac7162ff9020e73e57cea5c32e01fc0..fd2abd64537c52bdd57d9cc8b04b907be6e6d50b 100644
--- a/appengine/swarming/swarming_bot/bot_code/task_runner.py
+++ b/appengine/swarming/swarming_bot/bot_code/task_runner.py
@@ -84,11 +84,16 @@ def get_isolated_cmd(
if os.path.isfile(isolated_result):
os.remove(isolated_result)
cmd = get_run_isolated()
+ if task_details.inputs_ref:
+ cmd.extend(
+ [
+ '--isolated', task_details.inputs_ref['isolated'].encode('utf-8'),
+ '--namespace', task_details.inputs_ref['namespace'].encode('utf-8'),
+ '-I', task_details.inputs_ref['isolatedserver'].encode('utf-8'),
+ ])
+
cmd.extend(
[
- '--isolated', task_details.inputs_ref['isolated'].encode('utf-8'),
- '--namespace', task_details.inputs_ref['namespace'].encode('utf-8'),
- '-I', task_details.inputs_ref['isolatedserver'].encode('utf-8'),
'--json', isolated_result,
'--log-file', os.path.join(bot_dir, 'logs', 'run_isolated.log'),
'--cache', os.path.join(bot_dir, 'cache'),
@@ -101,8 +106,11 @@ def get_isolated_cmd(
cmd.extend(('--hard-timeout', str(task_details.hard_timeout)))
if task_details.grace_period:
cmd.extend(('--grace-period', str(task_details.grace_period)))
- if task_details.extra_args:
- cmd.append('--')
+
+ cmd.append('--')
+ if task_details.command:
+ cmd.extend(task_details.command)
+ elif task_details.extra_args:
cmd.extend(task_details.extra_args)
return cmd
@@ -296,21 +304,15 @@ def run_command(
}
post_update(swarming_server, params, None, '', 0)
- if task_details.command:
- # Raw command.
- cmd = task_details.command
- isolated_result = None
- else:
- # Isolated task.
- isolated_result = os.path.join(work_dir, 'isolated_result.json')
- cmd = get_isolated_cmd(
- work_dir, task_details, isolated_result, min_free_space)
- # Hard timeout enforcement is deferred to run_isolated. Grace is doubled to
- # give one 'grace_period' slot to the child process and one slot to upload
- # the results back.
- task_details.hard_timeout = 0
- if task_details.grace_period:
- task_details.grace_period *= 2
+ isolated_result = os.path.join(work_dir, 'isolated_result.json')
+ cmd = get_isolated_cmd(
+ work_dir, task_details, isolated_result, min_free_space)
+ # Hard timeout enforcement is deferred to run_isolated. Grace is doubled to
+ # give one 'grace_period' slot to the child process and one slot to upload
+ # the results back.
+ task_details.hard_timeout = 0
+ if task_details.grace_period:
+ task_details.grace_period *= 2
try:
# TODO(maruel): Support both channels independently and display stderr in
@@ -353,7 +355,6 @@ def run_command(
output_chunk_start = 0
stdout = ''
exit_code = None
- had_hard_timeout = False
had_io_timeout = False
must_signal_internal_failure = None
kill_sent = False
@@ -388,12 +389,6 @@ def run_command(
logging.warning('I/O timeout; sending SIGTERM')
proc.terminate()
timed_out = monotonic_time()
- elif (task_details.hard_timeout and
- now - start > task_details.hard_timeout):
- had_hard_timeout = True
- logging.warning('Hard timeout; sending SIGTERM')
- proc.terminate()
- timed_out = monotonic_time()
else:
# During grace period.
if not kill_sent and now >= timed_out + task_details.grace_period:
@@ -417,7 +412,6 @@ def run_command(
proc, task_details.grace_period, 'signal %d' % e.signal)
except (IOError, OSError):
# Something wrong happened, try to kill the child process.
- had_hard_timeout = True
exit_code = kill_and_wait(
proc, task_details.grace_period, 'exception %s' % e)
@@ -427,64 +421,66 @@ def run_command(
params['cost_usd'] = cost_usd_hour * (now - task_start) / 60. / 60.
params['duration'] = now - start
params['io_timeout'] = had_io_timeout
- params['hard_timeout'] = had_hard_timeout
- if isolated_result:
- try:
- if ((had_io_timeout or had_hard_timeout) and
- not os.path.isfile(isolated_result)):
- # It's possible that run_isolated failed to quit quickly enough; it
- # could be because there was too much data to upload back or something
- # else. Do not create an internal error, just send back the (partial)
- # view as task_runner saw it, for example the real exit_code is
- # unknown.
- logging.warning('TIMED_OUT and there\'s no result file')
+ had_hard_timeout = False
+ try:
+ if not os.path.isfile(isolated_result):
+ # It's possible if
+ # - run_isolated.py did not start
+ # - run_isolated.py started, but arguments were invalid
+ # - host in a situation unable to fork
+ # - grand child process outliving the child process deleting everything
+ # it can
+ # Do not create an internal error, just send back the (partial)
+ # view as task_runner saw it, for example the real exit_code is
+ # unknown.
+ logging.warning('there\'s no result file')
+ if exit_code is None:
exit_code = -1
- else:
- # See run_isolated.py for the format.
- with open(isolated_result, 'rb') as f:
- run_isolated_result = json.load(f)
- logging.debug('run_isolated:\n%s', run_isolated_result)
- # TODO(maruel): Grab statistics (cache hit rate, data downloaded,
- # mapping time, etc) from run_isolated and push them to the server.
- if run_isolated_result['outputs_ref']:
- params['outputs_ref'] = run_isolated_result['outputs_ref']
- had_hard_timeout = (
- had_hard_timeout or run_isolated_result['had_hard_timeout'])
- params['hard_timeout'] = had_hard_timeout
- if not had_io_timeout and not had_hard_timeout:
- if run_isolated_result['internal_failure']:
- must_signal_internal_failure = (
- run_isolated_result['internal_failure'])
- logging.error('%s', must_signal_internal_failure)
- elif exit_code:
- # TODO(maruel): Grab stdout from run_isolated.
- must_signal_internal_failure = (
- 'run_isolated internal failure %d' % exit_code)
- logging.error('%s', must_signal_internal_failure)
- exit_code = run_isolated_result['exit_code']
- if run_isolated_result.get('duration') is not None:
- # Calculate the real task duration as measured by run_isolated and
- # calculate the remaining overhead.
- params['bot_overhead'] = params['duration']
- params['duration'] = run_isolated_result['duration']
- params['bot_overhead'] -= params['duration']
- params['bot_overhead'] -= run_isolated_result.get(
- 'download', {}).get('duration', 0)
- params['bot_overhead'] -= run_isolated_result.get(
- 'upload', {}).get('duration', 0)
- if params['bot_overhead'] < 0:
- params['bot_overhead'] = 0
- stats = run_isolated_result.get('stats')
- if stats:
- params['isolated_stats'] = stats
- except (IOError, OSError, ValueError) as e:
- logging.error('Swallowing error: %s', e)
- if not must_signal_internal_failure:
- must_signal_internal_failure = str(e)
+ else:
+ # See run_isolated.py for the format.
+ with open(isolated_result, 'rb') as f:
+ run_isolated_result = json.load(f)
+ logging.debug('run_isolated:\n%s', run_isolated_result)
+ # TODO(maruel): Grab statistics (cache hit rate, data downloaded,
+ # mapping time, etc) from run_isolated and push them to the server.
+ if run_isolated_result['outputs_ref']:
+ params['outputs_ref'] = run_isolated_result['outputs_ref']
+ had_hard_timeout = run_isolated_result['had_hard_timeout']
+ if not had_io_timeout and not had_hard_timeout:
+ if run_isolated_result['internal_failure']:
+ must_signal_internal_failure = (
+ run_isolated_result['internal_failure'])
+ logging.error('%s', must_signal_internal_failure)
+ elif exit_code:
+ # TODO(maruel): Grab stdout from run_isolated.
+ must_signal_internal_failure = (
+ 'run_isolated internal failure %d' % exit_code)
+ logging.error('%s', must_signal_internal_failure)
+ exit_code = run_isolated_result['exit_code']
+ if run_isolated_result.get('duration') is not None:
+ # Calculate the real task duration as measured by run_isolated and
+ # calculate the remaining overhead.
+ params['bot_overhead'] = params['duration']
+ params['duration'] = run_isolated_result['duration']
+ params['bot_overhead'] -= params['duration']
+ params['bot_overhead'] -= run_isolated_result.get(
+ 'download', {}).get('duration', 0)
+ params['bot_overhead'] -= run_isolated_result.get(
+ 'upload', {}).get('duration', 0)
+ if params['bot_overhead'] < 0:
+ params['bot_overhead'] = 0
+ stats = run_isolated_result.get('stats')
+ if stats:
+ params['isolated_stats'] = stats
+ except (IOError, OSError, ValueError) as e:
+ logging.error('Swallowing error: %s', e)
+ if not must_signal_internal_failure:
+ must_signal_internal_failure = str(e)
# TODO(maruel): Send the internal failure here instead of sending it through
# bot_main, this causes a race condition.
if exit_code is None:
exit_code = -1
+ params['hard_timeout'] = had_hard_timeout
post_update(swarming_server, params, exit_code, stdout, output_chunk_start)
return {
u'exit_code': exit_code,
@@ -494,11 +490,10 @@ def run_command(
u'version': OUT_VERSION,
}
finally:
- if isolated_result:
- try:
- os.remove(isolated_result)
- except OSError:
- pass
+ try:
+ os.remove(isolated_result)
+ except OSError:
+ pass
def main(args):

Powered by Google App Engine
This is Rietveld 408576698