Chromium Code Reviews

(The old side of the diff is empty; this is a new file.)
```python
#!/usr/bin/env python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Triggers a ton of fake jobs to test the server's handling under high load.

Generates a histogram with the latencies to process the tasks and the number
of retries.
"""

import hashlib
import json
import logging
import optparse
import os
import Queue
import socket
import StringIO
import sys
import threading
import time
import zipfile

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

sys.path.insert(0, ROOT_DIR)

from third_party import colorama
from third_party.requests.packages import urllib3

from utils import graph
from utils import net
from utils import threading_utils

# Line too long (NN/80)
# pylint: disable=C0301


def print_results(results, columns, buckets):
  delays = [i for i in results if isinstance(i, float)]
  failures = [i for i in results if not isinstance(i, float)]

  print('%sDELAYS%s:' % (colorama.Fore.RED, colorama.Fore.RESET))
  graph.print_histogram(
      graph.generate_histogram(delays, buckets), columns, ' %.3f')
  print('')
  print('Total items : %d' % len(results))
  average = 0
  if delays:
    average = sum(delays) / len(delays)
  print('Average delay: %s' % graph.to_units(average))
  #print('Average overhead: %s' % graph.to_units(total_size / len(sizes)))
```
csharp (2013/10/02 21:56:24): Remove or uncomment
M-A Ruel (2013/10/02 23:44:25): Done.
```python
  print('')
  if failures:
```
csharp (2013/10/02 21:56:24): Nit: new line above
M-A Ruel (2013/10/02 23:44:25): Done.
```python
    print('%sEVENTS%s:' % (colorama.Fore.RED, colorama.Fore.RESET))
    values = {}
    for f in failures:
      values.setdefault(f, 0)
      values[f] += 1
    graph.print_histogram(values, columns, ' %s')
    print('')
```
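The setdefault/increment tally above is the stock counting pattern; a minimal equivalent sketch using the stdlib, for illustration only (count_events is a hypothetical name, not part of this change):

```python
import collections

def count_events(failures):
  # Equivalent to the setdefault/increment loop in print_results().
  return dict(collections.Counter(failures))
```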
```python
def calculate_version(url):
```
csharp (2013/10/02 21:56:24): Neat way of tricking the server
| 66 """Retrieves the swarm_bot code and returns the SHA-1 for it.""" | |
| 67 # Cannot use url_open() since zipfile requires .seek(). | |
| 68 archive = zipfile.ZipFile(StringIO.StringIO(net.url_read(url))) | |
| 69 # See | |
| 70 # https://code.google.com/p/swarming/source/browse/src/common/version.py?repo= swarming-server | |
| 71 files = ( | |
| 72 'slave_machine.py', | |
| 73 'swarm_bot/local_test_runner.py', | |
| 74 'common/__init__.py', | |
| 75 'common/swarm_constants.py', | |
| 76 'common/version.py', | |
| 77 'common/test_request_message.py', | |
| 78 'common/url_helper.py', | |
| 79 ) | |
| 80 d = hashlib.sha1() | |
| 81 for f in files: | |
| 82 d.update(archive.read(f)) | |
| 83 return d.hexdigest() | |
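A review comment below suggests computing this version once, before creating all the bots, instead of once per bot through an UpdateSlave round trip. A minimal sketch of that shape, assuming the bot zip is served at '/get_slave_code' (an assumption, not confirmed by this change):

```python
# Sketch: compute the bot version once up front and seed each fake bot's
# attributes with it, avoiding one UpdateSlave round trip per bot.
# The '/get_slave_code' endpoint name is an assumption.
def precompute_version(swarming_url, attributes_list):
  version = calculate_version(swarming_url + '/get_slave_code')
  for attributes in attributes_list:
    attributes['version'] = version
  return version
```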
```python
class FakeSwarmBot(object):
  """A fake swarm_bot implementation that simulates running on AIX.

  If someone fires up a real AIX slave, well, sorry.
```
csharp (2013/10/02 21:56:24): But as the greatest OS ever, shouldn't this be a b…
M-A Ruel (2013/10/02 23:44:25): We can fix that once it becomes a problem. I could…
```python
  It polls for jobs, acts as if it were processing them, and returns fake
  results.
  """
  def __init__(
      self, swarming_url, index, progress, duration, ping, events,
      kill_event):
    self._lock = threading.Lock()
    self._swarming = swarming_url
    self._index = index
    self._progress = progress
    self._duration = duration
    self._ping = ping
    self._events = events
    self._kill_event = kill_event

    # See
    # https://code.google.com/p/swarming/source/browse/src/swarm_bot/slave_machine.py?repo=swarming-server
    # and
    # https://chromium.googlesource.com/chromium/tools/build.git/+/master/scripts/tools/swarm_bootstrap/swarm_bootstrap.py
    # for more details.
    self._attributes = {
      'dimensions': {
        # Use improbable values to reduce the chance of interfering with real
        # slaves.
        'bits': '36',
        'machine': os.uname()[4] + '-experimental',
        'os': ['AIX'],
      },
      # Use an impossible hostname.
      'id': '%s-%d' % (socket.getfqdn().lower(), index),
      'try_count': 0,
      'tag': '%s-%d' % (socket.getfqdn().lower(), index),
      # Wait for the UpdateSlave RPC to be able to calculate the proper SHA-1.
      'version': '0' * 40,
    }

    self._thread = threading.Thread(target=self._run, name='bot%d' % index)
    self._thread.daemon = True
    self._thread.start()

  def join(self):
    self._thread.join()

  def is_alive(self):
    return self._thread.is_alive()

  def _run(self):
    try:
      self._progress.update_item('%d alive' % self._index, bots=1)
      while True:
        if self._kill_event.get():
          return
        # Insert real code to fetch task from Swarming here.
```
csharp (2013/10/02 21:56:24): I'm a bit confused by this comment, it kind of rea…
M-A Ruel (2013/10/02 23:44:25): It was a todo that I forgot, removed.
```python
        data = {'attributes': json.dumps(self._attributes)}
        try:
          request = net.url_open(self._swarming + '/poll_for_test', data=data)
        except urllib3.exceptions.ClosedPoolError:
          # Work around an internal bug in urllib3.
```
csharp (2013/10/02 21:56:24): If this is an internal urllib3 bug why are we stor…
M-A Ruel (2013/10/02 23:44:25): The upgrade to requests v2 should have fixed it, I…
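The workaround under discussion can be factored into a small retry helper; a minimal sketch reusing this script's net and urllib3 imports (poll_with_retry is a hypothetical helper, not part of this change):

```python
# Sketch: retry the poll a few times when urllib3's connection pool raises
# ClosedPoolError, then give up and return None so the caller can record it.
def poll_with_retry(swarming_url, data, retries=3):
  for _ in xrange(retries):
    try:
      return net.url_open(swarming_url + '/poll_for_test', data=data)
    except urllib3.exceptions.ClosedPoolError:
      continue
  return None
```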
```python
          self._events.put('poll_for_test_urllib3')
          continue
        if request is None:
          self._events.put('poll_for_test_empty')
          continue
        start = time.time()
        try:
          manifest = json.load(request)
        except ValueError:
          self._progress.update_item('Failed to poll')
          self._events.put('poll_for_test_invalid')
          continue

        commands = [c['function'] for c in manifest.get('commands', [])]
        if not commands:
          # Nothing to run.
          self._events.put('sleep')
          time.sleep(manifest['come_back'])
```
csharp (2013/10/02 21:56:24): Why do we follow this parameter when we ignore…
M-A Ruel (2013/10/02 23:44:25): No specific reason. What do you think is best?
csharp (2013/10/03 16:08:04): After thinking about this overnight, I actually t…
Marc-Antoine Ruel (Google) (2013/10/03 16:47:11): Done.
```python
          continue

        if commands == ['UpdateSlave']:
```
csharp (2013/10/02 21:56:24): Why not just call this once before creating all th…
M-A Ruel (2013/10/02 23:44:25): Done.
```python
          # Calculate the proper SHA-1 and loop again.
          self._attributes['version'] = calculate_version(
              manifest['commands'][0]['args'])
          self._events.put('update_slave')
          continue

        if commands != ['StoreFiles', 'RunCommands']:
          self._progress.update_item(
              'Unexpected RPC call %s\n%s' % (commands, manifest))
          self._events.put('unknown_rpc')
          break

        # The normal way Swarming works is that it 'stores' a test_run.swarm
        # file and then defers control to swarm_bot/local_test_runner.py.
        store_cmd = manifest['commands'][0]
        assert len(store_cmd['args']) == 1, store_cmd['args']
        filepath, filename, test_run_content = store_cmd['args'][0]
        assert filepath == ''
        assert filename == 'test_run.swarm'
        assert manifest['commands'][1] == {
```
csharp (2013/10/02 21:56:24): I'm not sure we should be checking that the exact…
csharp (2013/10/03 16:08:04): ping?
Marc-Antoine Ruel (Google) (2013/10/03 16:47:11): Done.
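As resolved above, a looser validation would check only the RPC structure rather than the exact argument payload; a minimal sketch (is_expected_run is a hypothetical helper, not necessarily the code that landed):

```python
# Sketch: verify only the command names, not the exact argument list.
def is_expected_run(manifest):
  commands = manifest.get('commands', [])
  return (len(commands) == 2 and
          commands[0]['function'] == 'StoreFiles' and
          commands[1]['function'] == 'RunCommands')
```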
```python
          u'function': u'RunCommands',
          u'args': [
              u'swarm_bot/local_test_runner.py', u'-f',
              u'test_run.swarm', u'--restart_on_failure',
          ],
        }, manifest['commands'][1]
        result_url = manifest['result_url']
        test_run = json.loads(test_run_content)
        assert result_url == test_run['result_url']
        ping_url = test_run['ping_url']
        self._progress.update_item('%d processing' % self._index, processing=1)

        # Fake activity and send a ping request every 0.5 seconds.
        while True:
```
csharp (2013/10/02 21:56:24): Since we know how many times we should execute thi…
M-A Ruel (2013/10/02 23:44:25): Because the call net.url_read() takes an indetermin…
```python
          remaining = max(0, start + self._duration - time.time())
          if remaining > self._ping:
            # In theory, we should use test_run['ping_delay'] but this is a
            # load test. Make sure the server melts down.
            result = net.url_read(ping_url)
            assert result == 'OK'
            remaining = max(0, start + self._duration - time.time())
          if not remaining:
            break
          # Sleep in slices no longer than the ping interval so that pings
          # keep flowing while the fake task "runs".
          time.sleep(min(remaining, self._ping))

        data = {
          'c': test_run['configuration']['config_name'],
          'n': test_run['test_run_name'],
          'o': False,
          'result_output': 'This task ran with great success',
```
csharp (2013/10/02 21:56:24): Excellent output :)
```python
          's': True,
          'x': '0',
        }
        result = net.url_read(manifest['result_url'], data=data)
        self._progress.update_item(
            '%d processed' % self._index, processing=-1, processed=1)
        if not result:
          self._events.put('result_url_fail')
        else:
          assert result == 'Successfully update the runner results.', result
          self._events.put(time.time() - start)
    finally:
      self._progress.update_item('%d quit' % self._index, bots=-1)
```
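The ping loop in _run() is deadline-driven: because net.url_read() takes an indeterminate amount of time, iterations are bounded by the wall clock rather than a fixed count, as discussed in the review above. A standalone sketch of the pattern (run_until_deadline and do_ping are illustrative names):

```python
import time

def run_until_deadline(duration, ping_interval, do_ping):
  # Ping at most every ping_interval seconds until duration seconds have
  # elapsed, no matter how long each do_ping() call takes.
  deadline = time.time() + duration
  while True:
    remaining = max(0, deadline - time.time())
    if not remaining:
      break
    if remaining > ping_interval:
      do_ping()
      remaining = max(0, deadline - time.time())
    time.sleep(min(remaining, ping_interval))
```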
```python
def main():
  colorama.init()
  parser = optparse.OptionParser(description=sys.modules[__name__].__doc__)
  parser.add_option(
      '-S', '--swarming',
      metavar='URL', default='',
      help='Swarming server to use')

  group = optparse.OptionGroup(parser, 'Load generated')
  group.add_option(
      '--slaves', type='int', default=300, metavar='N',
      help='Number of swarm bot slaves, default: %default')
  group.add_option(
      '-c', '--consume', type='float', default=60., metavar='N',
      help='Duration (s) for consuming a request, default: %default')
  group.add_option(
      '-p', '--ping', type='float', default=0.5, metavar='N',
      help='Ping delay (s) while consuming a request; normally it would be in '
           'the range of 30s but this is a load test, default: %default')
  parser.add_option_group(group)

  group = optparse.OptionGroup(parser, 'Display options')
  group.add_option(
      '--columns', type='int', default=graph.get_console_width(), metavar='N',
      help='For histogram display, default: %default')
  group.add_option(
      '--buckets', type='int', default=20, metavar='N',
      help='Number of buckets for histogram display, default: %default')
  parser.add_option_group(group)

  parser.add_option(
      '--dump', metavar='FOO.JSON', help='Dumps to json file')
  parser.add_option(
      '-v', '--verbose', action='store_true', help='Enables logging')

  options, args = parser.parse_args()
  logging.basicConfig(level=logging.INFO if options.verbose else logging.FATAL)
  if args:
    parser.error('Unsupported args: %s' % args)
  options.swarming = options.swarming.rstrip('/')
  if not options.swarming:
    parser.error('--swarming is required.')
  if options.consume <= 0:
    parser.error('Needs --consume > 0. 0.01 is a valid value.')

  print(
      'Running %d slaves, each task lasting %.1fs' % (
          options.slaves, options.consume))

  print('Ctrl-C to exit.')
  print('[processing/processed/bots]')
  columns = [('processing', 0), ('processed', 0), ('bots', 0)]
  progress = threading_utils.Progress(columns)
  events = Queue.Queue()
  start = time.time()
  kill_event = threading_utils.Bit()
  slaves = [
      FakeSwarmBot(
          options.swarming, i, progress, options.consume, options.ping, events,
          kill_event)
      for i in range(options.slaves)
  ]
  try:
    # Wait for all the slaves to come alive.
    while not all(s.is_alive() for s in slaves):
      time.sleep(0.01)
    progress.update_item('Ready to run')
    while slaves:
      progress.print_update()
      time.sleep(0.01)
      # The slaves could be told to suicide.
      slaves = [s for s in slaves if s.is_alive()]
  except KeyboardInterrupt:
    kill_event.set()

  progress.update_item('Waiting for slaves to quit.', raw=True)
  progress.update_item('')
  while slaves:
    progress.print_update()
    slaves = [s for s in slaves if s.is_alive()]
  # At this point, progress is not used anymore.
  print('')
  print('Ran for %.1fs.' % (time.time() - start))
  print('')
  # Queue.Queue.queue is a deque, which json cannot serialize directly.
  results = list(events.queue)
  print_results(results, options.columns, options.buckets)
  if options.dump:
    with open(options.dump, 'w') as f:
      json.dump(results, f, separators=(',', ':'))
  return 0


if __name__ == '__main__':
  sys.exit(main())
```
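Since --dump writes plain JSON, a run's results can be re-rendered later without re-running the load test; a minimal sketch ('results.json' is a placeholder name):

```python
import json

# Sketch: reload a previous --dump file and re-render the histograms.
# print_results() is the function defined at the top of this script.
with open('results.json') as f:
  results = json.load(f)
print_results(results, columns=80, buckets=20)
```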