scripts/slave/swarming/swarming_run_shim.py - Issue 139343011: Add swarming_run_shim.py to run swarming tasks as annotated tasks.

Side by Side Diff: scripts/slave/swarming/swarming_run_shim.py

Issue 139343011: Add swarming_run_shim.py to run swarming tasks as annotated tasks. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/build

Patch Set: tested Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright 2014 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Drives tests on Swarming. Both trigger and collect results.

	7

	8 This is the shim that is called through buildbot.

	9 """

	10

	11 import json

	12 import logging

	13 import optparse

	14 import os

	15 import subprocess

	16 import sys

	17 import threading

	18 import Queue

	19

	20 from common import chromium_utils

	21 from common import find_depot_tools # pylint: disable=W0611

	22

	23 from common import annotator

	24 from slave.swarming import swarming_utils

	25

	26 # From depot tools/

	27 import fix_encoding

	28

	29

	30 def v0_3(

	31 client, swarming_server, isolate_server, priority, dimensions,

	32 task_name, isolated_hash, env, shards):

	33 """Handles swarm_client/swarming.py starting 7c543276f08.

	34

	35 It was rolled in src on r237619 on 2013-11-27.

	36 """

	37 cmd = [

	38 sys.executable,

	39 os.path.join(client, 'swarming.py'),

	40 'run',

	41 '--swarming', swarming_server,

	42 '--isolate-server', isolate_server,

	43 '--priority', str(priority),

	44 '--shards', str(shards),

	45 '--task-name', task_name,

	46 isolated_hash,

	47 ]

	48 for name, value in dimensions.iteritems():

	49 if name != 'os':

	50 cmd.extend(('--dimension', name, value))

	51 else:

	52 # Sadly, older version of swarming.py need special handling of os.

	53 old_value = [

	54 k for k, v in swarming_utils.OS_MAPPING.iteritems() if v == value

	55 ]

	56 assert len(old_value) == 1

	57 cmd.extend(('--os', old_value[0]))

	58

	59 # Enable profiling on the -dev server.

	60 if '-dev' in swarming_server:

	61 cmd.append('--profile')

	62 for name, value in env.iteritems():

	63 cmd.extend(('--env', name, value))

	64 return cmd

	65

	66

	67 def v0_4(

	68 client, swarming_server, isolate_server, priority, dimensions,

	69 task_name, isolated_hash, env, shards):

	70 """Handles swarm_client/swarming.py starting b39e8cf08c.

	71

	72 It was rolled in src on r246113 on 2014-01-21.

	73 """

	74 cmd = [

	75 sys.executable,

	76 os.path.join(client, 'swarming.py'),

	77 'run',

	78 '--swarming', swarming_server,

	79 '--isolate-server', isolate_server,

	80 '--priority', str(priority),

	81 '--shards', str(shards),

	82 '--task-name', task_name,

	83 isolated_hash,

	84 ]

	85 for name, value in dimensions.iteritems():

	86 cmd.extend(('--dimension', name, value))

	87 # Enable profiling on the -dev server.

	88 if '-dev' in swarming_server:

	89 cmd.append('--profile')

	90 for name, value in env.iteritems():

	91 cmd.extend(('--env', name, value))

	92 return cmd

	93

	94

	95 def stream_process(cmd):

	96 """Calls process cmd and yields its output.

	97

	98 This is not the most efficient nor safe way to do it but it is only meant to

	99 be run on linux so it should be fine. Fix if necessary.

	100 """

	101 p = subprocess.Popen(

	102 cmd, bufsize=1, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

	103 try:

	104 while True:

	105 try:

	106 i = p.stdout.readline()

	107 if i == '':
	Vadim Sh. 2014/01/24 20:22:11: nit: if not i ? -- M-A Ruel 2014/01/24 20:41:41: Done.
	108 if p.poll() is None:

	109 continue

	110 break

	111 yield i

	112 except OSError:

	113 if p.poll() is None:

	114 continue

	115 break

	116 yield p.returncode
	Vadim Sh. 2014/01/24 20:22:11: nit: add assert p.returncode is not None just to be sure -- M-A Ruel 2014/01/24 20:41:41: It can't be. But I refactored to make this clearer.
	117 finally:

	118 if p.poll() is None:

	119 p.kill()

	120

	121

	122 def drive_one(

	123 client, version, swarming_server, isolate_server, priority, dimensions,

	124 task_name, isolated_hash, env, shards, out):

	125 """Executes the proper handler based on the code layout and --version support.

	126 """

	127 def send_back(l):

	128 out.put((task_name, l))

	129 if version < (0, 4):

	130 cmd = v0_3(

	131 client, swarming_server, isolate_server, priority, dimensions,

	132 task_name, isolated_hash, env, shards)

	133 else:

	134 cmd = v0_4(

	135 client, swarming_server, isolate_server, priority, dimensions,

	136 task_name, isolated_hash, env, shards)

	137 try:

	138 for i in stream_process(cmd):

	139 send_back(i)

	140 except Exception as e:

	141 send_back(e)

	142

	143

	144 def drive_many(

	145 client, version, swarming_server, isolate_server, priority, dimensions,

	146 steps):

	147 logging.info(

	148 'drive_many(%s, %s, %s, %s, %s, %s, %s)',

	149 client, version, swarming_server, isolate_server, priority, dimensions,

	150 steps)

	151 return _drive_many(

	152 client, version, swarming_server, isolate_server, priority, dimensions,

	153 steps, Queue.Queue())

	154

	155

	156 def step_name_to_cursor(x):

	157 """The cursor is buildbot's step name. It is only the base test name for

	158 simplicity.

	159

	160 But the swarming task name is longer, it is

	161 "<name>/<dimensions>/<isolated hash>".

	162 """

	163 return x.split('/', 1)[0]

	164

	165

	166 def _drive_many(

	167 client, version, swarming_server, isolate_server, priority, dimensions,

	168 steps, out):

	169 """Internal version, exposed so it can be hooked in test."""

	170 stream = annotator.AdvancedAnnotationStream(sys.stdout, False)

	171 for step_name in sorted(steps):

	172 # Seeds the step first before doing the cursors otherwise it is interleaved

	173 # in the logs of other steps.

	174 stream.seed_step(step_name)

	175

	176 threads = []

	177 # Create the boxes in buildbot in order for consistency.

	178 steps_annotations = {}

	179 for step_name, isolated_hash in sorted(steps.iteritems()):

	180 env = {}

	181 # TODO(maruel): Propagate GTEST_FILTER.

	182 #if gtest_filter not in (None, '', '.', '*'):

	183 # env['GTEST_FILTER'] = gtest_filter

	184 shards = swarming_utils.TESTS_SHARDS.get(step_name, 1)

	185 # This will be the key in steps_annotations.

	186 task_name = '%s/%s/%s' % (step_name, dimensions['os'], isolated_hash)

	187 t = threading.Thread(

	188 target=drive_one,

	189 args=(client, version, swarming_server, isolate_server, priority,

	190 dimensions, task_name, isolated_hash, env, shards, out))

	191 t.daemon = True

	192 t.start()

	193 threads.append(t)

	194 steps_annotations[task_name] = annotator.AdvancedAnnotationStep(

	195 sys.stdout, False)

	196 items = task_name.split('/', 2)

	197 assert step_name == items[0]

	198 assert step_name == step_name_to_cursor(task_name)

	199 # It is important data to surface through buildbot.

	200 stream.step_cursor(step_name)

	201 steps_annotations[task_name].step_text(items[1])

	202 steps_annotations[task_name].step_text(items[2])

	203 collect(threads, stream, steps_annotations, out)

	204 return 0
	Vadim Sh. 2014/01/24 20:22:11: Hm... This entire script always finishes with zero status, even if some steps fail. It doesn't look right to me. Can we do something like: return collect(...) and in collect return != 0 if any test failed? -- M-A Ruel 2014/01/24 20:41:41: Each step will have its own result code.
	205

	206

	207 def collect(threads, stream, steps_annotations, out):

	208 last_cursor = None

	209 while True:

	210 threads = [t for t in threads if t.is_alive()]
	Vadim Sh. 2014/01/24 20:22:11: I don't trust 't.is_alive()'... What do you think about:
	    def collect(stream, steps_annotations, out):
	      while steps_annotations:
	        try:
	          package = out.get(timeout=10)  # no need to poll each 100 ms...
	        except Queue.Empty:
	          continue
	        ...
	You delete from |step_annotations| now anyway whenever task finishes. -- M-A Ruel 2014/01/24 20:41:41: Done.
	211 try:

	212 # Polling FTW.

	213 packet = out.get(timeout=0.1)

	214 except Queue.Empty:

	215 if not threads:

	216 break

	217 continue

	218 task_name, item = packet

	219 if isinstance(item, int):

	220 # Signals it's completed.

	221 if last_cursor != task_name:

	222 stream.step_cursor(step_name_to_cursor(task_name))

	223 last_cursor = task_name

	224 if item:

	225 steps_annotations[task_name].step_failure()

	226 steps_annotations[task_name].step_closed()

	227 del steps_annotations[task_name]

	228 last_cursor = None

	229 elif isinstance(item, Exception):

	230 if last_cursor != task_name:

	231 stream.step_cursor(step_name_to_cursor(task_name))

	232 last_cursor = task_name

	233 steps_annotations[task_name].step_failure()

	234 del steps_annotations[task_name]

	235 last_cursor = None

	236 else:

	237 assert isinstance(item, str), item

	238 if last_cursor != task_name:

	239 stream.step_cursor(step_name_to_cursor(task_name))

	240 last_cursor = task_name

	241 sys.stdout.write(item)

	242 out.task_done()
	Vadim Sh. 2014/01/24 20:22:11: As I proposed above: return non 0 from 'collect' if at least one step fails. -- M-A Ruel 2014/01/24 20:41:41: That's not how annotated steps work. And this would be very confusing for the users.
	243

	244

	245 def determine_steps_to_run(isolated_hashes, default_swarming_tests, testfilter):

	246 """Returns a dict of test:hash for the test that should be run thru Swarming.
	Vadim Sh. 2014/01/24 20:22:11: nit: typo 'thru' -- M-A Ruel 2014/01/24 20:41:41: Done.
	247

	248 This is done by looking at the build properties to figure out what should be

	249 run.

	250 """

	251 logging.info(

	252 'determine_steps_to_run(%s, %s, %s)',

	253 isolated_hashes, default_swarming_tests, testfilter)

	254 # TODO(maruel): Support gtest filter.

	255 def should_run(name):

	256 return (

	257 ((name in default_swarming_tests or not default_swarming_tests) and

	258 'defaulttests' in testfilter) or

	259 (name + '_swarm' in testfilter))

	260

	261 return dict(

	262 (name, isolated_hash)

	263 for name, isolated_hash in isolated_hashes.iteritems()

	264 if should_run(name))

	265

	266

	267 def process_build_properties(options):

	268 """Converts build properties and factory properties into expected flags."""

	269 # target_os is not defined when using a normal builder, contrary to a

	270 # xx_swarm_triggered buildbot<->swarming builder, and it's not needed since

	271 # the OS match, it's defined in builder/tester configurations.

	272 slave_os = options.build_properties.get('target_os', sys.platform)

	273 priority = swarming_utils.build_to_priority(options.build_properties)

	274 steps = determine_steps_to_run(

	275 options.build_properties.get('swarm_hashes', {}),

	276 options.build_properties.get('run_default_swarm_tests', []),

	277 options.build_properties.get('testfilter', ['defaulttests']))

	278 return slave_os, priority, steps

	279

	280

	281 def main(args):

	282 """Note: this is solely to run the current master's code and can totally

	283 differ from the underlying script flags.

	284

	285 To update these flags:

	286 - Update the following code to support both the previous flag and the new

	287 flag.

	288 - Change scripts/master/factory/swarm_commands.py to pass the new flag.

	289 - Restart all the masters using swarming.

	290 - Remove the old flag from this code.

	291 """

	292 client = swarming_utils.find_client(os.getcwd())

	293 if not client:

	294 print >> sys.stderr, 'Failed to find swarm(ing)_client'

	295 return 1

	296 if os.path.isfile(os.path.join(client, 'swarm_get_results.py')):
	Vadim Sh. 2014/01/24 20:22:11: Is this check necessary? Doesn't version check below covers this case as well? -- M-A Ruel 2014/01/24 20:41:41: Confirmed that (None < (0, 1)) == True so removed.
	297 print >> sys.stderr, '%s is too old. Please run the test locally' % client

	298 return 1

	299 version = swarming_utils.get_version(client)

	300 if version < (0, 3):

	301 print >> sys.stderr, (

	302 '%s is version %s which is too old. Please run the test locally' %

	303 (client, '.'.join(version)))

	304 return 1

	305

	306 parser = optparse.OptionParser(description=sys.modules[__name__].__doc__)

	307 parser.add_option('--verbose', action='store_true', default=True)
	Vadim Sh. 2014/01/24 20:22:11: err... default=True and action='store_true'. IIUC there's no way to make it non-verbose. -- M-A Ruel 2014/01/24 20:41:41: Oops, removed. I had added it during testing.
	308 parser.add_option('--swarming')

	309 parser.add_option('--isolate-server')

	310 chromium_utils.AddPropertiesOptions(parser)

	311 options, args = parser.parse_args(args)

	312 if args:

	313 parser.error('Unsupported args: %s' % args)

	314 if not options.swarming or not options.isolate_server:

	315 parser.error('Require both --swarming and --isolate-server')

	316

	317 # Quick hack to always get details about jobs on the swarming master. Can be

	318 # removed if not necessary anymore.

	319 if (options.build_properties.get('buildbotURL') ==
	Vadim Sh. 2014/01/24 20:22:11: maybe .endswith('/chromium.swarm/') (the same way priority checking function does). Or even move this into swarming_utils: if swarming_utils.is_on_canary_master(options.build_properties): ...
	320 'http://build.chromium.org/p/chromium.swarm/'):

	321 options.verbose = True
	M-A Ruel 2014/01/24 03:19:00: Actually, this doesn't seem to work in my tests. -- M-A Ruel 2014/01/24 20:41:41: No it doesn't, I removed the code.
	322

	323 logging.basicConfig(level=logging.DEBUG if options.verbose else logging.ERROR)

	324 logging.debug(

	325 'Build properties:\n%s',

	326 json.dumps(options.build_properties, indent=2, sort_keys=True))

	327 # Loads the other flags implicitly.

	328 slave_os, priority, steps = process_build_properties(options)

	329 logging.info('To run: %s, %s, %s', slave_os, priority, steps)

	330 if not steps:

	331 # TODO(maruel): Returns a warning so it's clear that something is not

	332 # normal. Not sure how to do this.

	333 print('Nothing to trigger')

	334 return 0
	Vadim Sh. 2014/01/24 20:22:11: why not fail? -- M-A Ruel 2014/01/24 20:41:41: I wanted to put a warning. In practice it could be fine to fail. But for some reason 88 is not recognized as a warning. Mike, do you know why?
	335 print('Selected tests:')

	336 print('\n'.join(' %s' % s for s in sorted(steps)))

	337 selected_os = swarming_utils.OS_MAPPING[slave_os]

	338 print('Selected OS: %s' % selected_os)

	339 return drive_many(

	340 client,

	341 version,

	342 options.swarming,

	343 options.isolate_server,

	344 priority,

	345 {'os': selected_os},

	346 steps)

	347

	348

	349 if __name__ == '__main__':

	350 fix_encoding.fix_encoding()

	351 sys.exit(main(sys.argv[1:]))

OLD	NEW