appengine/swarming/swarming_bot/bot_code/task_runner.py - Issue 2443663002: Pass args in file from task_runner to run_isolated

Side by Side Diff: appengine/swarming/swarming_bot/bot_code/task_runner.py

Issue 2443663002: Pass args in file from task_runner to run_isolated (Closed)

Patch Set: Respond to code review comments other than changes to options processing Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 # Copyright 2013 The LUCI Authors. All rights reserved.	1 # Copyright 2013 The LUCI Authors. All rights reserved.

2 # Use of this source code is governed under the Apache License, Version 2.0	2 # Use of this source code is governed under the Apache License, Version 2.0

3 # that can be found in the LICENSE file.	3 # that can be found in the LICENSE file.

4	4

5 """Runs a Swarming task.	5 """Runs a Swarming task.

6	6

7 Downloads all the necessary files to run the task, executes the command and	7 Downloads all the necessary files to run the task, executes the command and

8 streams results back to the Swarming server.	8 streams results back to the Swarming server.

9	9

10 The process exit code is 0 when the task was executed, even if the task itself	10 The process exit code is 0 when the task was executed, even if the task itself

(...skipping 65 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
76	76

77	77

78 def get_run_isolated():	78 def get_run_isolated():

79 """Returns the path to itself to run run_isolated.	79 """Returns the path to itself to run run_isolated.

80	80

81 Mocked in test to point to the real run_isolated.py script.	81 Mocked in test to point to the real run_isolated.py script.

82 """	82 """

83 return [sys.executable, THIS_FILE, 'run_isolated']	83 return [sys.executable, THIS_FILE, 'run_isolated']

84	84

85	85

86 def get_isolated_cmd(	86 def get_isolated_args(

87 work_dir, task_details, isolated_result, bot_file, min_free_space):	87 work_dir, task_details, isolated_result, bot_file, min_free_space):

88 """Returns the command to call run_isolated. Mocked in tests."""	88 """Returns the command to call run_isolated. Mocked in tests."""

89 assert (bool(task_details.command) !=	89 assert (bool(task_details.command) !=

90 bool(task_details.isolated and task_details.isolated.get('input')))	90 bool(task_details.isolated and task_details.isolated.get('input')))

91 bot_dir = os.path.dirname(work_dir)	91 bot_dir = os.path.dirname(work_dir)

92 if os.path.isfile(isolated_result):	92 if os.path.isfile(isolated_result):

93 os.remove(isolated_result)	93 os.remove(isolated_result)

94 cmd = get_run_isolated()	94 cmd = []

95	95

96 if task_details.isolated:	96 if task_details.isolated:

97 cmd.extend(	97 cmd.extend(

98 [	98 [

99 '-I', task_details.isolated['server'].encode('utf-8'),	99 '-I', task_details.isolated['server'].encode('utf-8'),

100 '--namespace', task_details.isolated['namespace'].encode('utf-8'),	100 '--namespace', task_details.isolated['namespace'].encode('utf-8'),

101 ])	101 ])

102 isolated_input = task_details.isolated.get('input')	102 isolated_input = task_details.isolated.get('input')

103 if isolated_input:	103 if isolated_input:

104 cmd.extend(	104 cmd.extend(

(...skipping 231 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
336 try:	336 try:

337 proc.wait(grace_period)	337 proc.wait(grace_period)

338 except subprocess42.TimeoutError:	338 except subprocess42.TimeoutError:

339 logging.warning('SIGKILL finally due to %s', reason)	339 logging.warning('SIGKILL finally due to %s', reason)

340 proc.kill()	340 proc.kill()

341 exit_code = proc.wait()	341 exit_code = proc.wait()

342 logging.info('Waiting for process exit in finally - done')	342 logging.info('Waiting for process exit in finally - done')

343 return exit_code	343 return exit_code

344	344

345	345

	346 def fail_without_command(remote, bot_id, task_id, params, cost_usd_hour,

	347 task_start, exit_code, stdout):

	348 now = monotonic_time()

	349 params['cost_usd'] = cost_usd_hour * (now - task_start) / 60. / 60.

	350 params['duration'] = now - task_start

	351 params['io_timeout'] = False

	352 params['hard_timeout'] = False

	353 # Ignore server reply to stop.

	354 remote.post_task_update(task_id, bot_id, params, (stdout, 0), 1)

	355 return {

	356 u'exit_code': exit_code,

	357 u'hard_timeout': False,

	358 u'io_timeout': False,

	359 u'must_signal_internal_failure': None,

	360 u'version': OUT_VERSION,

	361 }

	362

	363

346 def run_command(remote, task_details, work_dir, cost_usd_hour,	364 def run_command(remote, task_details, work_dir, cost_usd_hour,

347 task_start, min_free_space, bot_file, extra_env):	365 task_start, min_free_space, bot_file, extra_env):

348 """Runs a command and sends packets to the server to stream results back.	366 """Runs a command and sends packets to the server to stream results back.

349	367

350 Implements both I/O and hard timeouts. Sends the packets numbered, so the	368 Implements both I/O and hard timeouts. Sends the packets numbered, so the

351 server can ensure they are processed in order.	369 server can ensure they are processed in order.

352	370

353 Returns:	371 Returns:

354 Metadata dict with the execution result.	372 Metadata dict with the execution result.

355	373

(...skipping 14 matching lines...) Expand all Loading...
370 # Don't even bother, the task was already canceled.	388 # Don't even bother, the task was already canceled.

371 return {	389 return {

372 u'exit_code': -1,	390 u'exit_code': -1,

373 u'hard_timeout': False,	391 u'hard_timeout': False,

374 u'io_timeout': False,	392 u'io_timeout': False,

375 u'must_signal_internal_failure': None,	393 u'must_signal_internal_failure': None,

376 u'version': OUT_VERSION,	394 u'version': OUT_VERSION,

377 }	395 }

378	396

379 isolated_result = os.path.join(work_dir, 'isolated_result.json')	397 isolated_result = os.path.join(work_dir, 'isolated_result.json')

380 cmd = get_isolated_cmd(	398 args_path = os.path.join(work_dir, 'run_isolated_args.json')

	399 cmd = get_run_isolated()

	400 cmd.extend(['-f', args_path])

	401 args = get_isolated_args(

381 work_dir, task_details, isolated_result, bot_file, min_free_space)	402 work_dir, task_details, isolated_result, bot_file, min_free_space)

382 # Hard timeout enforcement is deferred to run_isolated. Grace is doubled to	403 # Hard timeout enforcement is deferred to run_isolated. Grace is doubled to

383 # give one 'grace_period' slot to the child process and one slot to upload	404 # give one 'grace_period' slot to the child process and one slot to upload

384 # the results back.	405 # the results back.

385 task_details.hard_timeout = 0	406 task_details.hard_timeout = 0

386 if task_details.grace_period:	407 if task_details.grace_period:

387 task_details.grace_period *= 2	408 task_details.grace_period *= 2

388	409

389 try:	410 try:

390 # TODO(maruel): Support both channels independently and display stderr in	411 # TODO(maruel): Support both channels independently and display stderr in

391 # red.	412 # red.

392 env = os.environ.copy()	413 env = os.environ.copy()

393 for env_to_add in (task_details.env, extra_env):	414 for env_to_add in (task_details.env, extra_env):

394 for key, value in (env_to_add or {}).iteritems():	415 for key, value in (env_to_add or {}).iteritems():

395 if not value:	416 if not value:

396 env.pop(key, None)	417 env.pop(key, None)

397 else:	418 else:

398 env[key] = value	419 env[key] = value

399 logging.info('cmd=%s', cmd)	420 logging.info('cmd=%s', cmd)

400 logging.info('cwd=%s', work_dir)	421 logging.info('cwd=%s', work_dir)

401 logging.info('env=%s', env)	422 logging.info('env=%s', env)

	423 fail_on_start = lambda exit_code, stdout: fail_without_command(

	424 remote, bot_id, task_id, params, cost_usd_hour, task_start,

	425 exit_code, stdout)

	426

	427 # We write args to a file since there may be more of them than the OS

	428 # can handle.

	429 try:

	430 args_file = open(args_path, "w")
	M-A Ruel 2016/10/24 16:26:52 with open(... with open(... aludwin 2016/10/24 17:05:48 Do I put that inside a try-catch? Show quoted text On 2016/10/24 16:26:52, M-A Ruel wrote: > with open(... Do I put that inside a try-catch? M-A Ruel 2016/10/24 17:11:53 yes, so it looks like: try: with open(args_path Show quoted text On 2016/10/24 17:05:48, aludwin wrote: > On 2016/10/24 16:26:52, M-A Ruel wrote: > > with open(... > > Do I put that inside a try-catch? yes, so it looks like: try: with open(args_path, 'w') as f: json.dump(args, f) except (IOError, OSError) as e: ... Notes: - single quotes - use f instead of args_file - order the exception class name alphabetically
	431 json.dump(args, args_file)

	432 args_file.close()

	433 except (OSError, IOError) as e:

	434 return fail_on_start(

	435 -1,

	436 'Could not write args to %s: %s' % (args_path, e))

	437

	438 # Start the command

402 try:	439 try:

403 assert cmd and all(isinstance(a, basestring) for a in cmd)	440 assert cmd and all(isinstance(a, basestring) for a in cmd)

404 proc = subprocess42.Popen(	441 proc = subprocess42.Popen(

405 cmd,	442 cmd,

406 env=env,	443 env=env,

407 cwd=work_dir,	444 cwd=work_dir,

408 detached=True,	445 detached=True,

409 stdout=subprocess42.PIPE,	446 stdout=subprocess42.PIPE,

410 stderr=subprocess42.STDOUT,	447 stderr=subprocess42.STDOUT,

411 stdin=subprocess42.PIPE)	448 stdin=subprocess42.PIPE)

412 except OSError as e:	449 except OSError as e:

413 stdout = 'Command "%s" failed to start.\nError: %s' % (' '.join(cmd), e)	450 return fail_on_start(

414 now = monotonic_time()	451 1,

415 params['cost_usd'] = cost_usd_hour * (now - task_start) / 60. / 60.	452 'Command "%s" failed to start.\nError: %s' % (' '.join(cmd), e))

416 params['duration'] = now - start

417 params['io_timeout'] = False

418 params['hard_timeout'] = False

419 # Ignore server reply to stop.

420 remote.post_task_update(task_id, bot_id, params, (stdout, 0), 1)

421 return {

422 u'exit_code': 1,

423 u'hard_timeout': False,

424 u'io_timeout': False,

425 u'must_signal_internal_failure': None,

426 u'version': OUT_VERSION,

427 }

428	453

	454 # Monitor the task

429 output_chunk_start = 0	455 output_chunk_start = 0

430 stdout = ''	456 stdout = ''

431 exit_code = None	457 exit_code = None

432 had_io_timeout = False	458 had_io_timeout = False

433 must_signal_internal_failure = None	459 must_signal_internal_failure = None

434 kill_sent = False	460 kill_sent = False

435 timed_out = None	461 timed_out = None

436 try:	462 try:

437 calc = lambda: calc_yield_wait(	463 calc = lambda: calc_yield_wait(

438 task_details, start, last_io, timed_out, stdout)	464 task_details, start, last_io, timed_out, stdout)

(...skipping 186 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
625 options.start = now	651 options.start = now

626	652

627 try:	653 try:

628 load_and_run(	654 load_and_run(

629 options.in_file, options.swarming_server, options.cost_usd_hour,	655 options.in_file, options.swarming_server, options.cost_usd_hour,

630 options.start, options.out_file, options.min_free_space,	656 options.start, options.out_file, options.min_free_space,

631 options.bot_file, options.auth_params_file)	657 options.bot_file, options.auth_params_file)

632 return 0	658 return 0

633 finally:	659 finally:

634 logging.info('quitting')	660 logging.info('quitting')

OLD	NEW

« no previous file with comments | « appengine/swarming/swarming_bot/__main__.py ('k') | appengine/swarming/swarming_bot/bot_code/task_runner_test.py » ('j') | client/run_isolated.py » ('J')