Chromium Code Reviews

Side by Side Diff: tools/swarming_load_test_bot.py

Issue 25530003: Rename load_test to isolateserver_load_test, create swarming_load_test. (Closed) Base URL: https://chromium.googlesource.com/a/chromium/tools/swarm_client@2_exception
Patch Set: Use named columns Created 7 years, 2 months ago
1 #!/usr/bin/env python
2 # Copyright 2013 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Triggers a ton of fake jobs to test its handling under high load.
7
8 Generates a histogram with the latencies to process the tasks and number of
9 retries.
10 """
11
12 import hashlib
13 import json
14 import logging
15 import optparse
16 import os
17 import Queue
18 import socket
19 import StringIO
20 import sys
21 import threading
22 import time
23 import zipfile
24
25 ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
26
27 sys.path.insert(0, ROOT_DIR)
28
29 from third_party import colorama
30 from third_party.requests.packages import urllib3
31
32 from utils import graph
33 from utils import net
34 from utils import threading_utils
35
36 # Line too long (NN/80)
37 # pylint: disable=C0301
38
39
40 def print_results(results, columns, buckets):
41 delays = [i for i in results if isinstance(i, float)]
42 failures = [i for i in results if not isinstance(i, float)]
43
44 print('%sDELAYS%s:' % (colorama.Fore.RED, colorama.Fore.RESET))
45 graph.print_histogram(
46 graph.generate_histogram(delays, buckets), columns, ' %.3f')
47 print('')
48 print('Total items : %d' % len(results))
49 average = 0
50 if delays:
51 average = sum(delays) / len(delays)
52 print('Average delay: %s' % graph.to_units(average))
53 #print('Average overhead: %s' % graph.to_units(total_size / len(sizes)))
csharp 2013/10/02 21:56:24 Remove or uncomment
M-A Ruel 2013/10/02 23:44:25 Done.
54 print('')
55 if failures:
csharp 2013/10/02 21:56:24 Nit: new line above
M-A Ruel 2013/10/02 23:44:25 Done.
56 print('%sEVENTS%s:' % (colorama.Fore.RED, colorama.Fore.RESET))
57 values = {}
58 for f in failures:
59 values.setdefault(f, 0)
60 values[f] += 1
61 graph.print_histogram(values, columns, ' %s')
62 print('')
63
64
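For orientation, the results argument mixes float latencies with string event names. A minimal usage sketch of the function above (sample values are hypothetical; assumes the same utils imports as this script):

    # Hypothetical sample: three task latencies (seconds) plus two event markers.
    sample = [0.25, 0.5, 1.0, 'sleep', 'poll_for_test_empty']
    print_results(sample, 80, 5)  # 80 columns, 5 histogram buckets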
65 def calculate_version(url):
csharp 2013/10/02 21:56:24 Neat way of tricking the server
66 """Retrieves the swarm_bot code and returns the SHA-1 for it."""
67 # Cannot use url_open() since zipfile requires .seek().
68 archive = zipfile.ZipFile(StringIO.StringIO(net.url_read(url)))
69 # See
70 # https://code.google.com/p/swarming/source/browse/src/common/version.py?repo=swarming-server
71 files = (
72 'slave_machine.py',
73 'swarm_bot/local_test_runner.py',
74 'common/__init__.py',
75 'common/swarm_constants.py',
76 'common/version.py',
77 'common/test_request_message.py',
78 'common/url_helper.py',
79 )
80 d = hashlib.sha1()
81 for f in files:
82 d.update(archive.read(f))
83 return d.hexdigest()
84
85
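A side note on calculate_version() above: updating the digest member by member is equivalent to hashing the concatenation of the file contents, so the order of the files matters. A quick self-contained illustration:

    import hashlib

    parts = ['first file contents', 'second file contents']
    incremental = hashlib.sha1()
    for p in parts:
        incremental.update(p)  # feed each member in order
    # Hashing the concatenation in one shot gives the identical digest.
    assert incremental.hexdigest() == hashlib.sha1(''.join(parts)).hexdigest()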
86 class FakeSwarmBot(object):
87 """This is a Fake swarm_bot implementation simulating it is running AIX.
88
89 If someone fires up a real AIX slave, well, sorry.
csharp 2013/10/02 21:56:24 But as the greatest OS ever, shouldn't this be a b…
M-A Ruel 2013/10/02 23:44:25 We can fix that once it becomes a problem. I could…
90
91 It polls for jobs, acts as if it were processing them and returns fake
92 results.
93 """
94 def __init__(
95 self, swarming_url, index, progress, duration, ping, events,
96 kill_event):
97 self._lock = threading.Lock()
98 self._swarming = swarming_url
99 self._index = index
100 self._progress = progress
101 self._duration = duration
102 self._ping = ping
103 self._events = events
104 self._kill_event = kill_event
105
106 # See
107 # https://code.google.com/p/swarming/source/browse/src/swarm_bot/slave_machine.py?repo=swarming-server
108 # and
109 # https://chromium.googlesource.com/chromium/tools/build.git/+/master/scripts/tools/swarm_bootstrap/swarm_bootstrap.py
110 # for more details.
111 self._attributes = {
112 'dimensions': {
113 # Use improbable values to reduce the chance of interfering with real
114 # slaves.
115 'bits': '36',
116 'machine': os.uname()[4] + '-experimental',
117 'os': ['AIX'],
118 },
119 # Use an impossible hostname.
120 'id': '%s-%d' % (socket.getfqdn().lower(), index),
121 'try_count': 0,
122 'tag': '%s-%d' % (socket.getfqdn().lower(), index),
123 # Wait for UpdateSlave RPC to be able to calculate the proper SHA-1.
124 'version': '0' * 40,
125 }
126
127 self._thread = threading.Thread(target=self._run, name='bot%d' % index)
128 self._thread.daemon = True
129 self._thread.start()
130
131 def join(self):
132 self._thread.join()
133
134 def is_alive(self):
135 return self._thread.is_alive()
136
137 def _run(self):
138 try:
139 self._progress.update_item('%d alive' % self._index, bots=1)
140 while True:
141 if self._kill_event.get():
142 return
143 # Insert real code to fetch task from Swarming here.
csharp 2013/10/02 21:56:24 I'm a bit confused by this comment, it kind of reads like a leftover TODO.
M-A Ruel 2013/10/02 23:44:25 It was a todo that I forgot, removed.
144 data = {'attributes': json.dumps(self._attributes)}
145 try:
146 request = net.url_open(self._swarming + '/poll_for_test', data=data)
147 except urllib3.exceptions.ClosedPoolError:
148 # Work around an internal bug in urllib3.
csharp 2013/10/02 21:56:24 If this is an internal urllib3 bug, why are we storing a workaround for it here?
M-A Ruel 2013/10/02 23:44:25 The upgrade to requests v2 should have fixed it, I think.
149 self._events.put('poll_for_test_urllib3')
150 continue
151 if request is None:
152 self._events.put('poll_for_test_empty')
153 continue
154 start = time.time()
155 try:
156 manifest = json.load(request)
157 except ValueError:
158 self._progress.update_item('Failed to poll')
159 self._events.put('poll_for_test_invalid')
160 continue
161
162 commands = [c['function'] for c in manifest.get('commands', [])]
163 if not commands:
164 # Nothing to run.
165 self._events.put('sleep')
166 time.sleep(manifest['come_back'])
csharp 2013/10/02 21:56:24 Why do we follow this parameter when we ignore the ping_delay?
M-A Ruel 2013/10/02 23:44:25 No specific reason. What do you think is best?
csharp 2013/10/03 16:08:04 After thinking about this overnight, I actually think…
Marc-Antoine Ruel (Google) 2013/10/03 16:47:11 Done.
167 continue
168
169 if commands == ['UpdateSlave']:
csharp 2013/10/02 21:56:24 Why not just call this once before creating all the slaves?
M-A Ruel 2013/10/02 23:44:25 Done.
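Roughly what that suggestion amounts to, as a sketch (the code-bundle URL and wiring are hypothetical; the follow-up patch itself is not shown in this diff):

    # Hypothetical: resolve the bot version once, before spawning the bots,
    # instead of having every thread answer its own UpdateSlave RPC.
    code_url = swarming_url + '/get_slave_code'  # hypothetical endpoint
    shared_version = calculate_version(code_url)
    # Each FakeSwarmBot would then start with 'version': shared_version
    # rather than '0' * 40.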
170 # Calculate the proper SHA-1 and loop again.
171 self._attributes['version'] = calculate_version(
172 manifest['commands'][0]['args'])
173 self._events.put('update_slave')
174 continue
175
176 if commands != ['StoreFiles', 'RunCommands']:
177 self._progress.update_item(
178 'Unexpected RPC call %s\n%s' % (commands, manifest))
179 self._events.put('unknown_rpc')
180 break
181
182 # The normal way Swarming works is that it 'stores' a test_run.swarm
183 # file and then defers control to swarm_bot/local_test_runner.py.
184 store_cmd = manifest['commands'][0]
185 assert len(store_cmd['args']) == 1, store_cmd['args']
186 filepath, filename, test_run_content = store_cmd['args'][0]
187 assert filepath == ''
188 assert filename == 'test_run.swarm'
189 assert manifest['commands'][1] == {
csharp 2013/10/02 21:56:24 I'm not sure we should be checking that the exact command matches.
csharp 2013/10/03 16:08:04 ping?
Marc-Antoine Ruel (Google) 2013/10/03 16:47:11 Done.
190 u'function': u'RunCommands',
191 u'args': [
192 u'swarm_bot/local_test_runner.py', u'-f',
193 u'test_run.swarm', u'--restart_on_failure',
194 ],
195 }, manifest['commands'][1]
196 result_url = manifest['result_url']
197 test_run = json.loads(test_run_content)
198 assert result_url == test_run['result_url']
199 ping_url = test_run['ping_url']
200 self._progress.update_item('%d processing' % self._index, processing=1)
201
202 # Fake activity and send a ping request every 0.5 seconds.
203 while True:
csharp 2013/10/02 21:56:24 Since we know how many times we should execute this loop, couldn't it be a for loop?
M-A Ruel 2013/10/02 23:44:25 Because the call to net.url_read() takes an indeterminate amount of time.
204 remaining = max(0, self._duration - (time.time() - start))
205 if remaining > self._ping:
206 # In theory, we should use test_run['ping_delay'] but this is a load
207 # test. Make sure the server melts down.
208 result = net.url_read(ping_url)
209 assert result == 'OK'
210 remaining = max(0, self._duration - (time.time() - start))
211 if not remaining:
212 break
213 time.sleep(min(remaining, self._ping))
214
215 data = {
216 'c': test_run['configuration']['config_name'],
217 'n': test_run['test_run_name'],
218 'o': False,
219 'result_output': 'This task ran with great success',
csharp 2013/10/02 21:56:24 Excellent output :)
220 's': True,
221 'x': '0',
222 }
223 result = net.url_read(manifest['result_url'], data=data)
224 self._progress.update_item(
225 '%d processed' % self._index, processing=-1, processed=1)
226 if not result:
227 self._events.put('result_url_fail')
228 else:
229 assert result == 'Successfully update the runner results.', result
230 self._events.put(time.time() - start)
231 finally:
232 self._progress.update_item('%d quit' % self._index, bots=-1)
233
234
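The ping loop above follows a remaining-time pattern: compute remaining = duration - elapsed, sleep in ping-sized slices, and ping while time is left. Condensed into a standalone sketch (names are hypothetical):

    import time

    def run_for(duration, ping_interval, ping):
        # Sleep in ping_interval-sized slices until duration seconds have
        # elapsed, calling ping() on each slice that still has time left.
        start = time.time()
        while True:
            remaining = max(0, duration - (time.time() - start))
            if not remaining:
                break
            if remaining > ping_interval:
                ping()
            time.sleep(min(remaining, ping_interval))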
235 def main():
236 colorama.init()
237 parser = optparse.OptionParser(description=sys.modules[__name__].__doc__)
238 parser.add_option(
239 '-S', '--swarming',
240 metavar='URL', default='',
241 help='Swarming server to use')
242
243 group = optparse.OptionGroup(parser, 'Load generation')
244 group.add_option(
245 '--slaves', type='int', default=300, metavar='N',
246 help='Number of swarm bot slaves, default: %default')
247 group.add_option(
248 '-c', '--consume', type='float', default=60., metavar='N',
249 help='Duration (s) for consuming a request, default: %default')
250 group.add_option(
251 '-p', '--ping', type='float', default=0.5, metavar='N',
252 help='Ping delay (s) while consuming a request; normally it would be in '
253 'the range of 30s, but this is a load test, default: %default')
254 parser.add_option_group(group)
255
256 group = optparse.OptionGroup(parser, 'Display options')
257 group.add_option(
258 '--columns', type='int', default=graph.get_console_width(), metavar='N',
259 help='Number of columns for histogram display, default: %default')
260 group.add_option(
261 '--buckets', type='int', default=20, metavar='N',
262 help='Number of buckets for histogram display, default: %default')
263 parser.add_option_group(group)
264
265 parser.add_option(
266 '--dump', metavar='FOO.JSON', help='Dumps to json file')
267 parser.add_option(
268 '-v', '--verbose', action='store_true', help='Enables logging')
269
270 options, args = parser.parse_args()
271 logging.basicConfig(level=logging.INFO if options.verbose else logging.FATAL)
272 if args:
273 parser.error('Unsupported args: %s' % args)
274 options.swarming = options.swarming.rstrip('/')
275 if not options.swarming:
276 parser.error('--swarming is required.')
277 if options.consume <= 0:
278 parser.error('Needs --consume > 0. 0.01 is a valid value.')
279
280 print(
281 'Running %d slaves, each task lasting %.1fs' % (
282 options.slaves, options.consume))
283
284 print('Ctrl-C to exit.')
285 print('[processing/processed/bots]')
286 columns = [('processing', 0), ('processed', 0), ('bots', 0)]
287 progress = threading_utils.Progress(columns)
288 events = Queue.Queue()
289 start = time.time()
290 kill_event = threading_utils.Bit()
291 slaves = [
292 FakeSwarmBot(
293 options.swarming, i, progress, options.consume, options.ping, events,
294 kill_event)
295 for i in range(options.slaves)
296 ]
297 try:
298 # Wait for all the slaves to come alive.
299 while not all(s.is_alive() for s in slaves):
300 time.sleep(0.01)
301 progress.update_item('Ready to run')
302 while slaves:
303 progress.print_update()
304 time.sleep(0.01)
305 # The slaves could be told to commit suicide.
306 slaves = [s for s in slaves if s.is_alive()]
307 except KeyboardInterrupt:
308 kill_event.set()
309
310 progress.update_item('Waiting for slaves to quit.', raw=True)
311 progress.update_item('')
312 while slaves:
313 progress.print_update()
314 slaves = [s for s in slaves if s.is_alive()]
315 # At this point, progress is not used anymore.
316 print('')
317 print('Ran for %.1fs.' % (time.time() - start))
318 print('')
319 results = events.queue
320 print_results(results, options.columns, options.buckets)
321 if options.dump:
322 with open(options.dump, 'w') as f:
323 json.dump(list(results), f, separators=(',', ':'))
324 return 0
325
326
327 if __name__ == '__main__':
328 sys.exit(main())
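A typical invocation, using the flags defined in main() above (the server URL is hypothetical):

    python tools/swarming_load_test_bot.py \
        --swarming https://swarming-server.example.com \
        --slaves 50 --consume 5 --ping 0.5 --dump results.json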
