Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4)

Side by Side Diff: swarming.py

Issue 22980008: Merge all swarm_*.py scripts into swarming.py. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/swarm_client
Patch Set: Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 import StringIO
7 import datetime
8 import getpass
9 import hashlib
10 import json
11 import logging
12 import optparse
13 import os
14 import shutil
15 import subprocess
16 import sys
17 import tempfile
18 import threading
19 import time
20 import urllib
21 import zipfile
22
23 import trace_inputs
24 from third_party.depot_tools import fix_encoding
Vadim Sh. 2013/08/16 18:58:58 nit: I'd rearrange these imports into two groups:
M-A Ruel 2013/08/18 00:18:18 Done.
25 from third_party.depot_tools import subcommand
26 import run_isolated
27
28
# Directory holding this script; sibling scripts are resolved relative to it.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
TOOLS_PATH = os.path.join(ROOT_DIR, 'tools')


# Default servers.
# TODO(maruel): Chromium-specific. These defaults should eventually go away.
ISOLATE_SERVER = 'https://isolateserver-dev.appspot.com/'
SWARM_SERVER = 'https://chromium-swarm-dev.appspot.com'


# The default time to wait for a shard to finish running, in seconds.
DEFAULT_SHARD_WAIT_TIME = 40 * 60.


# Maps an OS image name (sys.platform style) to the platform name that is put
# in the Swarm manifest.
PLATFORM_MAPPING = {
  'cygwin': 'Windows',
  'darwin': 'Mac',
  'linux2': 'Linux',
  'win32': 'Windows',
}
49
50
class Failure(Exception):
  """Generic failure.

  main() catches it and prints its message on stderr instead of a traceback.
  """
54
55
class Manifest(object):
  """Represents a Swarming task manifest.

  Besides describing the request, it knows how to zip the registered files and
  upload that archive to the data (isolate) server.
  """

  def __init__(
      self, manifest_hash, test_name, shards, test_filter, os_image,
      working_dir, data_server, verbose, profile, priority):
    """Populates a manifest object.

    Args:
      manifest_hash - The manifest's sha-1 that the slave is going to fetch.
      test_name - The name to give the test request.
      shards - The number of swarm shards to request.
      test_filter - The gtest filter to apply when running the test.
      os_image - OS to run on; looked up in PLATFORM_MAPPING.
      working_dir - Relative working directory to start the script.
      data_server - isolate server url.
      verbose - if True, have the slave print more details.
      profile - if True, have the slave print more timing data.
      priority - int between 0 and 1000, lower the higher priority
    """
    # Description of the request itself.
    self.manifest_hash = manifest_hash
    self._test_name = test_name
    self._shards = shards
    self._test_filter = test_filter
    self._target_platform = PLATFORM_MAPPING[os_image]
    self._working_dir = working_dir

    # Endpoints on the data server, all derived from the same root.
    server_root = data_server.rstrip('/')
    self.data_server_retrieval = server_root + '/content/retrieve/default/'
    self._data_server_storage = server_root + '/content/store/default/'
    self._data_server_has = server_root + '/content/contains/default'
    self._data_server_get_token = server_root + '/content/get_token'

    self.verbose = bool(verbose)
    self.profile = bool(profile)
    self.priority = priority

    # State filled in later by add_task(), add_file() and zip_and_upload().
    self._zip_file_hash = ''
    self._tasks = []
    self._files = {}
    self._token_cache = None

  def _token(self):
    """Returns the url-quoted server token, fetching it on first use."""
    if self._token_cache:
      return self._token_cache
    response = run_isolated.url_open(self._data_server_get_token)
    if not response:
      # TODO(maruel): Implement authentication.
      raise Failure('Failed to get token, need authentication')
    # Quote it right away, so creating the urls is simpler.
    self._token_cache = urllib.quote(response.read())
    return self._token_cache

  def add_task(self, task_name, actions, time_out=600):
    """Appends a new task to the swarm manifest file."""
    # See swarming/src/common/test_request_message.py TestObject constructor
    # for the valid flags.
    task = {
      'action': actions,
      'decorate_output': self.verbose,
      'test_name': task_name,
      'time_out': time_out,
    }
    self._tasks.append(task)

  def add_file(self, source_path, rel_path):
    """Registers source_path to be stored as rel_path in the zip archive."""
    self._files[source_path] = rel_path

  def zip_and_upload(self):
    """Zips up all the files necessary to run a shard and uploads to Swarming
    master.

    Returns True when the archive is on the server (already present or just
    uploaded), False when the server could not be queried or the upload failed.
    """
    assert not self._zip_file_hash
    started = time.time()

    # The archive is built entirely in memory.
    buf = StringIO.StringIO()
    archive = zipfile.ZipFile(buf, 'w')
    for source, relpath in self._files.items():
      archive.write(source, relpath)
    archive.close()
    print('Zipping completed, time elapsed: %f' % (time.time() - started))

    buf.flush()
    zip_contents = buf.getvalue()
    buf.close()

    self._zip_file_hash = hashlib.sha1(zip_contents).hexdigest()

    # Ask the server whether it already holds an item with this hash.
    contains_url = self._data_server_has + '?token=%s' % self._token()
    response = run_isolated.url_open(
        contains_url,
        data=self._zip_file_hash,
        content_type='application/octet-stream')
    if response is None:
      sys.stderr.write(
          'Unable to query server for zip file presence, aborting.\n')
      return False

    if response.read(1) == chr(1):
      print('Zip file already on server, no need to reupload.')
      return True

    print('Zip file not on server, starting uploading.')

    url = '%s%s?priority=0&token=%s' % (
        self._data_server_storage, self._zip_file_hash, self._token())
    response = run_isolated.url_open(
        url, data=zip_contents, content_type='application/octet-stream')
    if response is None:
      sys.stderr.write('Failed to upload the zip file: %s\n' % url)
      return False

    return True

  def to_json(self):
    """Exports the current configuration into a swarm-readable manifest file.

    This function doesn't mutate the object.
    """
    zip_url = self.data_server_retrieval + urllib.quote(self._zip_file_hash)
    configuration = {
      'min_instances': self._shards,
      'config_name': self._target_platform,
      'dimensions': {
        'os': self._target_platform,
      },
    }

    # These flags are googletest specific.
    env_vars = {}
    has_filter = bool(self._test_filter) and self._test_filter != '*'
    if has_filter:
      env_vars['GTEST_FILTER'] = self._test_filter
    if self._shards > 1:
      env_vars['GTEST_SHARD_INDEX'] = '%(instance_index)s'
      env_vars['GTEST_TOTAL_SHARDS'] = '%(num_instances)s'

    test_case = {
      'test_case_name': self._test_name,
      'data': [[zip_url, 'swarm_data.zip']],
      'tests': self._tasks,
      'env_vars': env_vars,
      'configurations': [configuration],
      'working_dir': self._working_dir,
      'restart_on_failure': True,
      'cleanup': 'root',
      'priority': self.priority,
    }
    return json.dumps(test_case, separators=(',',':'))
207
208
class Bit(object):
  """Thread safe settable bit.

  Starts cleared. set() turns it on; nothing in this class turns it back off.
  """

  def __init__(self):
    # Per-instance lock and value. As class attributes, a single Lock object
    # would be shared (and contended on) by every Bit ever created.
    self._lock = threading.Lock()
    self._value = False

  def get(self):
    """Returns True once set() has been called on this instance."""
    with self._lock:
      return self._value

  def set(self):
    """Turns the bit on."""
    with self._lock:
      self._value = True
221
222
def now():
  """Returns time.time().

  Exists so it can be mocked easily.
  """
  return time.time()
226
227
def get_test_keys(swarm_base_url, test_name, _=None):
  """Returns the Swarm test key for each shards of test_name.

  Queries <swarm_base_url>/get_matching_test_cases and retries, up to
  run_isolated.URL_OPEN_MAX_ATTEMPTS times, while the server answers that no
  test matches.

  Args:
    swarm_base_url - Url of the Swarm server; '/get_matching_test_cases' is
        appended to it as is.
    test_name - Name of the test request to look up.
    _ - Ignored.

  Returns:
    The JSON-decoded body of the server response.

  Raises:
    Failure if the server can't be reached or never reports a matching test.
  """
  # TODO(maruel): Remove the parameter '_' once the
  # build/scripts/slave/get_swarm_results.py stops passing it.
  key_data = urllib.urlencode([('name', test_name)])
  url = '%s/get_matching_test_cases?%s' % (swarm_base_url, key_data)
  not_found = (
      'Error: Unable to find any tests with the name, %s, on swarm server'
      % test_name)

  max_attempts = run_isolated.URL_OPEN_MAX_ATTEMPTS
  for attempt in range(max_attempts):
    response = run_isolated.url_open(url, retry_404=True)
    if response is None:
      raise Failure(not_found)

    result = response.read()
    # TODO(maruel): Compare exact string.
    if 'No matching' in result:
      logging.warning(
          'Unable to find any tests with the name, %s, on swarm server',
          test_name)
      # Don't sleep after the last attempt: the loop ends right after it, so
      # the wait would only delay the Failure below.
      if attempt != max_attempts - 1:
        run_isolated.HttpService.sleep_before_retry(attempt, None)
      continue
    return json.loads(result)

  raise Failure(not_found)
255
256
def retrieve_results(base_url, test_key, timeout, should_stop):
  """Retrieves results for a single test_key.

  Polls <base_url>/get_result until the response carries a non-empty 'output',
  the timeout expires or should_stop is set.

  Args:
    base_url - Url of the Swarm server; '/get_result' is appended to it as is.
    test_key - Key of the test run to fetch.
    timeout - float, number of seconds to keep polling. 0. disables the
        timeout.
    should_stop - object with a get() method returning True once polling must
        be abandoned (yield_results passes a Bit).

  Returns:
    The decoded result dict, or {} on timeout or when asked to stop.
  """
  assert isinstance(timeout, float)
  params = [('r', test_key)]
  result_url = '%s/get_result?%s' % (base_url, urllib.urlencode(params))
  start = now()
  while True:
    if timeout and (now() - start) >= timeout:
      logging.error('retrieve_results(%s) timed out', base_url)
      return {}
    # Do retries ourselves.
    response = run_isolated.url_open(
        result_url, retry_404=False, retry_50x=False)
    if response is None:
      # Aggressively poll for results. Do not use retry_404 so
      # should_stop is polled more often.
      remaining = min(5, timeout - (now() - start)) if timeout else 5
      if remaining > 0:
        run_isolated.HttpService.sleep_before_retry(1, remaining)
    else:
      try:
        data = json.load(response) or {}
      except (ValueError, TypeError):
        logging.warning(
            'Received corrupted data for test_key %s. Retrying.', test_key)
      else:
        # .get() rather than ['output']: an empty response is turned into {}
        # just above, and indexing it would raise KeyError instead of polling
        # again.
        if data.get('output'):
          return data
    if should_stop.get():
      return {}
287
288
def yield_results(swarm_base_url, test_keys, timeout, max_threads):
  """Yields swarm test results from the swarm server as (index, result).

  Duplicate shards are ignored, the first one to complete is returned.

  max_threads is optional and is used to limit the number of parallel fetches
  done. Since in general the number of test_keys is in the range <=10, it's not
  worth normally to limit the number threads. Mostly used for testing purposes.
  """
  key_count = len(test_keys)
  # Shard indexes not yielded yet, and number of fetches still outstanding.
  shards_remaining = list(range(key_count))
  results_remaining = key_count

  if max_threads:
    number_threads = min(max_threads, key_count)
  else:
    number_threads = key_count

  # Shared with every fetch so they can all be told to give up at once.
  should_stop = Bit()
  with run_isolated.ThreadPool(number_threads, number_threads, 0) as pool:
    try:
      for test_key in test_keys:
        pool.add_task(
            0, retrieve_results, swarm_base_url, test_key, timeout, should_stop)

      while shards_remaining and results_remaining:
        result = pool.get_one_result()
        results_remaining -= 1
        if not result:
          # Failed to retrieve one key.
          logging.error('Failed to retrieve the results for a swarm key')
          continue

        shard_index = result['config_instance_index']
        if shard_index not in shards_remaining:
          logging.warning('Ignoring duplicate shard index %d', shard_index)
          # Pop the last entry, there's no such shard.
          shards_remaining.pop()
          continue

        shards_remaining.remove(shard_index)
        yield shard_index, result
    finally:
      # Done, kill the remaining threads.
      should_stop.set()
326
327
def chromium_setup(manifest):
  """Sets up the commands to run.

  Registers run_isolated.py and swarm_cleanup.py in the manifest, then adds a
  'Run Test' task followed by a 'Clean Up' task.

  Highly chromium specific.
  """
  # TODO(maruel): Make this less hardcoded.
  run_test_name = 'run_isolated.py'
  cleanup_script_name = 'swarm_cleanup.py'

  manifest.add_file(os.path.join(ROOT_DIR, run_test_name), run_test_name)
  manifest.add_file(
      os.path.join(TOOLS_PATH, cleanup_script_name), cleanup_script_name)

  # The gzip flavor of the retrieval url is what gets passed to --remote.
  remote = manifest.data_server_retrieval.rstrip('/') + '-gzip/'
  run_cmd = ['python', run_test_name]
  run_cmd.extend(['--hash', manifest.manifest_hash])
  run_cmd.extend(['--remote', remote])
  if manifest.verbose or manifest.profile:
    # Have it print the profiling section.
    run_cmd.append('--verbose')
  manifest.add_task('Run Test', run_cmd)

  # Clean up
  manifest.add_task('Clean Up', ['python', cleanup_script_name])
352
353
def process_manifest(
    file_sha1, test_name, shards, test_filter, os_image, working_dir,
    data_server, swarm_url, verbose, profile, priority):
  """Process the manifest file and send off the swarm test request.

  Builds a Manifest, zips and uploads the support files to data_server, then
  posts the manifest to <swarm_url>/test.

  Returns:
    0 on success, 1 on any failure. Failures are reported on stderr.
  """
  try:
    manifest = Manifest(
        file_sha1, test_name, shards, test_filter, os_image, working_dir,
        data_server, verbose, profile, priority)
  except KeyError as e:
    # Manifest looks os_image up in PLATFORM_MAPPING; an unknown value raises
    # KeyError, which would otherwise escape as a traceback.
    sys.stderr.write(
        'Unable to process %s: unsupported os_image %s\n' % (test_name, e))
    return 1
  except ValueError as e:
    sys.stderr.write('Unable to process %s: %s\n' % (test_name, e))
    return 1

  chromium_setup(manifest)

  # Zip up relevant files
  print('Zipping up files...')
  if not manifest.zip_and_upload():
    return 1

  # Send test requests off to swarm.
  print('Sending test requests to swarm.')
  print('Server: %s' % swarm_url)
  print('Job name: %s' % test_name)
  test_url = swarm_url.rstrip('/') + '/test'
  manifest_text = manifest.to_json()
  result = run_isolated.url_open(test_url, data={'request': manifest_text})
  if not result:
    sys.stderr.write(
        'Failed to send test for %s\n%s\n' % (test_name, test_url))
    return 1
  try:
    # The decoded value is not used; the request only counts as accepted when
    # the server answers with valid JSON.
    json.load(result)
  except (ValueError, TypeError) as e:
    sys.stderr.write('Failed to send test for %s\n' % test_name)
    sys.stderr.write('Manifest: %s\n' % manifest_text)
    sys.stderr.write('%s\n' % str(e))
    return 1
  return 0
392
393
def run(cmd, verbose):
  """Runs the script cmd[0], located in ROOT_DIR, with the current python.

  The remaining items of cmd are passed as arguments. When verbose, the command
  is echoed first and, except on win32, run under 'time -p'. Raises
  subprocess.CalledProcessError if the script exits with a non-zero code.
  """
  if verbose:
    print('Running: %s' % ' '.join(cmd))
  script_path = os.path.join(ROOT_DIR, cmd[0])
  full_cmd = [sys.executable, script_path]
  full_cmd.extend(cmd[1:])
  if verbose and sys.platform != 'win32':
    full_cmd = ['time', '-p'] + full_cmd
  subprocess.check_call(full_cmd)
401
402
def trigger_and_return(
    isolate, isolated, swarm_server, cad_server, slave_os, verbose):
  """Does the archive, trigger and get results dance.

  Args:
    isolate - Path to the .isolate file, or None.
    isolated - Path to the .isolated file. When empty, a temporary one is
        created and deleted before returning.
    swarm_server - Url of the Swarm server.
    cad_server - Url of the server the isolated files are archived to.
    slave_os - OS to run on, or None.
    verbose - if True, echo the commands being run.

  Returns:
    0. A failing step raises subprocess.CalledProcessError instead.
  """
  # NOTE(review): this function is not covered by any test.
  prefix = getpass.getuser() + '-' + datetime.datetime.now().isoformat() + '-'
  shards = 1

  if verbose:
    os.environ.setdefault('ISOLATE_DEBUG', '2')

  tempdir = None
  try:
    if not isolated:
      # A directory is used because isolated + '.state' will also be created.
      tempdir = tempfile.mkdtemp(prefix='swarm_trigger_and_get_results')
      isolated = os.path.join(tempdir, 'swarm_trigger.isolated')
      step_name = os.path.basename(isolate).split('.', 1)[0]
    else:
      step_name = os.path.basename(isolated).split('.', 1)[0]

    print('Archiving')
    cmd = [
      'isolate.py',
      'hashtable',
      '--outdir', cad_server,
      '--isolated', isolated,
    ]
    if isolate:
      cmd.extend(('--isolate', isolate))
    if slave_os:
      cmd.extend(('-V', 'OS', run_isolated.FLAVOR_MAPPING[slave_os]))
    run(cmd, verbose)

    print('\nRunning')
    # Use a context manager so the file handle is closed right away instead of
    # whenever the garbage collector gets to it.
    with open(isolated, 'rb') as isolated_file:
      hashval = hashlib.sha1(isolated_file.read()).hexdigest()
    # TODO(maruel): swarm_trigger_step.py was merged into this file; do the
    # trigger in-process instead of shelling out.
    cmd = [
      'swarm_trigger_step.py',
      '--swarm-url', swarm_server,
      '--test-name-prefix', prefix,
      '--data-server', cad_server,
      '--run_from_hash',
      hashval,
      step_name,
      str(shards),
      '',
    ]
    if slave_os:
      cmd.extend(('--os_image', slave_os))
    run(cmd, verbose)

    print('\nGetting results')
    # TODO(maruel): Same as above for swarm_get_results.py.
    run(
        [
          'swarm_get_results.py',
          '--url', swarm_server,
          prefix + step_name,
        ],
        verbose)
    return 0
  finally:
    if tempdir:
      shutil.rmtree(tempdir)
465
@subcommand.usage('test_name')
def CMDresults(parser, args):
  """Retrieves results of a Swarming job.

  The result can be in multiple part if the execution was sharded. It can
  potentially have retries.
  """
  # Returns the highest exit code reported by any shard, or None when no shard
  # result could be retrieved.
  # TODO(maruel): Converge on one flag name; CMDrun and CMDtrigger call this
  # --swarm-url.
  parser.add_option(
      '-u', '--url', default=SWARM_SERVER,
      help='Specify the url of the Swarm server, defaults: %default')
  parser.add_option(
      '-t', '--timeout',
      type='float',
      default=DEFAULT_SHARD_WAIT_TIME,
      help='Timeout to wait for result, set to 0 for no timeout; default: '
           '%default s')
  # TODO(maruel): Remove once the masters have been updated.
  # The value is accepted but ignored: the number of shards is derived from the
  # test keys returned by the server.
  parser.add_option(
      '-s', '--shards',
      help=optparse.SUPPRESS_HELP)

  (options, args) = parser.parse_args(args)
  if not args:
    parser.error('Must specify one test name.')
  elif len(args) > 1:
    parser.error('Must specify only one test name.')

  options.url = options.url.rstrip('/')
  test_name = args[0]

  try:
    test_keys = get_test_keys(options.url, test_name)
  except Failure as e:
    parser.error(e.args[0])
  if not test_keys:
    parser.error('No test keys to get results with.')

  exit_code = None
  for _index, output in yield_results(
      options.url, test_keys, options.timeout, None):
    print(
        '%s/%s: %s' % (
          output['machine_id'], output['machine_tag'], output['exit_codes']))
    print(''.join(' %s\n' % l for l in output['output'].splitlines()))
    shard_exit_code = max(int(c) for c in output['exit_codes'].split(','))
    # Compare explicitly against None instead of max(None, int), which only
    # works through Python 2's arbitrary ordering of unrelated types.
    if exit_code is None or shard_exit_code > exit_code:
      exit_code = shard_exit_code

  # NOTE(review): stays None, i.e. a process exit status of 0, when no result
  # was retrieved at all. Confirm this is intended.
  return exit_code
514
515
def CMDrun(parser, args):
  """Archives a .isolated file, triggers it on Swarm and get the results.

  Basically, everything to run a command remotely.
  """
  # Input selection: at least one of these two is required.
  parser.add_option('-i', '--isolate', help='.isolate file to use')
  parser.add_option(
      '-s', '--isolated',
      help='.isolated file to use. One of -i or -s must be used.')
  # Where and on what to run.
  parser.add_option(
      '-o', '--os_image',
      metavar='OS',
      help='Swarm OS image to request. Should be one of the valid sys.platform '
           'values like darwin, linux2 or win32.')
  parser.add_option(
      '-u', '--swarm-url',
      metavar='URL',
      default=SWARM_SERVER,
      help='Specify the url of the Swarm server. Defaults to %default')
  parser.add_option(
      '-d', '--data-server',
      default=ISOLATE_SERVER,
      metavar='URL',
      help='The server where all the test data is stored. Defaults to %default')
  options, extra_args = parser.parse_args(args)

  # parser.error() does not return, so these act as guard clauses.
  if extra_args:
    parser.error(
        'Use one of -i or -s but no unsupported arguments: %s' % extra_args)
  if not (options.isolate or options.isolated):
    parser.error('Use one of -i or -s')

  return trigger_and_return(
      options.isolate,
      options.isolated,
      options.swarm_url,
      options.data_server,
      options.os_image,
      options.verbose)
554
555
def CMDtrigger(parser, args):
  """Triggers a Swarm request based off of a .isolated file.

  This script takes a .isolated file, packages it, and sends a Swarm manifest
  file to the Swarm server. This is expected to be called as a build step with
  the cwd as the parent of the src/ directory.
  """
  # Returns the highest exit code returned by process_manifest() over all the
  # --run_from_hash requests.
  parser.add_option('-w', '--working_dir', default='swarm_tests',
                    help='Desired working directory on the swarm slave side. '
                    'Defaults to %default.')
  parser.add_option('-o', '--os_image',
                    help='Swarm OS image to request.')
  parser.add_option('-u', '--swarm-url', default=SWARM_SERVER,
                    help='Specify the url of the Swarm server. '
                    'Defaults to %default')
  # TODO(maruel): Converge on a single term for this server.
  parser.add_option('-d', '--data-server',
                    help='The server where all the test data is stored.')
  parser.add_option('-t', '--test-name-prefix', default='',
                    help='Specify the prefix to give the swarm test request. '
                    'Defaults to %default')
  parser.add_option('--run_from_hash', nargs=4, action='append', default=[],
                    help='Specify a hash to run on swarm. The format is '
                    '(hash, hash_test_name, shards, test_filter). This may be '
                    'used multiple times to send multiple hashes.')
  parser.add_option('--profile', action='store_true',
                    default=bool(os.environ.get('ISOLATE_DEBUG')),
                    help='Have run_isolated.py print profiling info')
  parser.add_option('--priority', type='int', default=100,
                    help='The lower value, the more important the task is')
  (options, args) = parser.parse_args(args)

  if args:
    parser.error('Unknown args: %s' % args)
  if not options.data_server:
    parser.error('Must specify the data directory')
  if not options.run_from_hash:
    parser.error('At least one --run_from_hash is required.')

  if not options.os_image or options.os_image == 'None':
    # This means the Try Server/user wants to use the current OS.
    options.os_image = sys.platform

  # Validate every shard count before sending anything, so a typo is reported
  # as a usage error instead of a ValueError traceback halfway through.
  requests = []
  for (file_sha1, test_name, shards, testfilter) in options.run_from_hash:
    try:
      shard_count = int(shards)
    except ValueError:
      parser.error(
          'Invalid number of shards %r for %s' % (shards, test_name))
    requests.append((file_sha1, test_name, shard_count, testfilter))

  highest_exit_code = 0
  # Send off the hash swarm test requests.
  for (file_sha1, test_name, shard_count, testfilter) in requests:
    exit_code = process_manifest(
        file_sha1,
        options.test_name_prefix + test_name,
        shard_count,
        testfilter,
        options.os_image,
        options.working_dir,
        options.data_server,
        options.swarm_url,
        options.verbose,
        options.profile,
        options.priority)
    highest_exit_code = max(highest_exit_code, exit_code)
  return highest_exit_code
615
616
def main(args):
  """Dispatches args to the matching CMD* function of this module.

  Returns the value returned by the command, or 1 after printing the message
  on stderr when the command raised one of the errors handled here.
  """
  dispatcher = subcommand.CommandDispatcher(__name__)
  try:
    parser = trace_inputs.OptionParserWithLogging(prog='swarming.py')
    return dispatcher.execute(parser, args)
  except (
      Failure,
      run_isolated.MappingError,
      run_isolated.ConfigError) as e:
    sys.stderr.write('\nError: ' + str(e) + '\n')
    return 1
630
631
if __name__ == '__main__':
  # One-time process setup, done before any command runs.
  fix_encoding.fix_encoding()
  run_isolated.disable_buffering()
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698