Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(314)

Side by Side Diff: gpu/tools/check_gpu_bots.py

Issue 588603003: Add utility script for GPU Pixel Wranglers. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2
3 # Copyright 2014 The Chromium Authors. All rights reserved.
4 # Use of this source code is governed by a BSD-style license that can be
5 # found in the LICENSE file.
6
7 import argparse
8 import datetime
9 import getpass
10 import json
11 import os
12 import smtplib
13 import sys
14 import time
15 import urllib
16 import urllib2
17
class Emailer:
  """Sends notification emails through the Gmail SMTP server.

  The sender's password is read from a file when that file exists; otherwise
  the user is prompted for it when the object is constructed.
  """
  # Password file tried when the caller does not name one.
  DEFAULT_EMAIL_PASSWORD_FILE = '.email_password'
  GMAIL_SMTP_SERVER = 'smtp.gmail.com:587'
  SUBJECT = 'Chrome GPU Bots Notification'

  def __init__(self, email_from, email_to, email_password_file):
    """Store the addresses and load (or prompt for) the sender's password.

    Args:
      email_from: sender address; also used as the SMTP login name.
      email_to: sequence of recipient addresses.
      email_password_file: path of a file holding the password, or None to
        try DEFAULT_EMAIL_PASSWORD_FILE.
    """
    self.email_from = email_from
    self.email_to = email_to
    self.email_password = Emailer._getEmailPassword(email_password_file)

  @staticmethod
  def format_email_body(time_str, offline_str, failed_str, noteworthy_str):
    """Return the four report sections concatenated in the given order."""
    return '%s%s%s%s' % (time_str, offline_str, failed_str, noteworthy_str)

  def _openSession(self):
    """Connect, start TLS and log in; return the logged-in SMTP object.

    The socket is closed before re-raising if any step after connecting fails,
    so a failed login does not leave a connection open.
    """
    server = smtplib.SMTP(Emailer.GMAIL_SMTP_SERVER)
    try:
      server.starttls()
      server.login(self.email_from, self.email_password)
    except Exception:
      server.close()
      raise
    return server

  def send_email(self, body):
    """Send `body` to all recipients; failures are printed, never raised."""
    message = 'From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s' % (
        self.email_from, ','.join(self.email_to), Emailer.SUBJECT, body)

    try:
      server = self._openSession()
      try:
        server.sendmail(self.email_from, self.email_to, message)
        server.quit()
      finally:
        # close() is safe after quit(); it guarantees the socket is released
        # when sendmail() or quit() raises.
        server.close()
    except Exception as e:
      # Best effort: the caller's polling loop must keep running.
      print('Error sending email: %s' % str(e))

  def testEmailLogin(self):
    """Verify the credentials by logging in; raises if the login fails."""
    server = self._openSession()
    try:
      server.quit()
    finally:
      server.close()

  @staticmethod
  def _getEmailPassword(email_password_file):
    """Return the password from the file, or prompt when it does not exist.

    Args:
      email_password_file: path to read, or None for the default file.

    Returns:
      The file's contents with surrounding whitespace stripped, or whatever
      the user typed at the prompt.
    """
    password_file = (email_password_file if email_password_file is not None
                     else Emailer.DEFAULT_EMAIL_PASSWORD_FILE)

    if os.path.isfile(password_file):
      with open(password_file, 'r') as f:
        return f.read().strip()

    return getpass.getpass(
        'Please enter email password for source email account: ')
class GpuBot:
  """One bot on one waterfall, plus the status details gathered about it."""

  def __init__(self, waterfall_name, bot_name, bot_data):
    """
    Args:
      waterfall_name: name of the waterfall the bot belongs to.
      bot_name: name of the bot (builder).
      bot_data: JSON dict describing the bot, or None when it is not
        available (e.g. for bots rebuilt by fromDict).
    """
    self.waterfall_name = waterfall_name
    self.bot_name = bot_name
    self.bot_data = bot_data
    self._end_time = None
    self._hours_since_last_run = None
    self.failure_string = None
    self.bot_url = None
    self.build_url = None

  def getEndTime(self):
    """Return the last build's end time (struct_time), or None if unset."""
    return self._end_time

  def setEndTime(self, end_time):
    """Record the end time and derive the hours elapsed since then."""
    self._end_time = end_time
    self._hours_since_last_run = roughTimeDiffInHours(end_time,
                                                      time.localtime())

  def getHoursSinceLastRun(self):
    """Return the hours since the last build ended, or None if unknown."""
    return self._hours_since_last_run

  def toDict(self):
    """Return a JSON-serializable dict; unset optional fields are omitted."""
    result = {'waterfall_name': self.waterfall_name,
              'bot_name': self.bot_name}

    if self._end_time is not None:
      result['end_time'] = serialTime(self._end_time)
      result['hours_since_last_run'] = self._hours_since_last_run

    if self.failure_string is not None:
      result['failure_string'] = self.failure_string

    if self.bot_url is not None:
      result['bot_url'] = self.bot_url

    if self.build_url is not None:
      result['build_url'] = self.build_url

    return result

  @staticmethod
  def fromDict(dict):
    """Rebuild a GpuBot from a dict produced by toDict().

    The parameter keeps its historical name `dict` for compatibility with
    existing callers, even though it shadows the builtin.
    """
    data = dict
    gpu_bot = GpuBot(data['waterfall_name'], data['bot_name'], None)

    # This is a static method, so there is no `self`: every field must be
    # set on the freshly created gpu_bot.
    if 'end_time' in data:
      gpu_bot._end_time = unserializeTime(data['end_time'])

    if 'hours_since_last_run' in data:
      gpu_bot._hours_since_last_run = data['hours_since_last_run']

    if 'failure_string' in data:
      gpu_bot.failure_string = data['failure_string']

    if 'bot_url' in data:
      gpu_bot.bot_url = data['bot_url']

    if 'build_url' in data:
      gpu_bot.build_url = data['build_url']

    return gpu_bot
def errorNoMostRecentBuild(waterfall_name, bot_name):
  """Print a notice that no most recent build is available for the bot."""
  bot_id = '%s::%s' % (waterfall_name, bot_name)
  print('No most recent build available: %s' % bot_id)

class Waterfall:
  """Static helpers that query the buildbot JSON interface for GPU bots.

  The `bots` argument taken by getOfflineBots() and getFailedBots() is the
  mapping returned by getAllGpuBots(): waterfall name -> bot name -> the
  bot's JSON dict.
  """
  BASE_URL = 'http://build.chromium.org/p/'
  BASE_BUILD_URL = BASE_URL + '%s/builders/%s'
  SPECIFIC_BUILD_URL = BASE_URL + '%s/builders/%s/builds/%s'
  BASE_JSON_BUILDERS_URL = BASE_URL + '%s/json/builders'
  BASE_JSON_BUILDS_URL = BASE_URL + '%s/json/builders/%s/builds'
  # Waterfalls on which every bot is inspected.
  REGULAR_WATERFALLS = ['chromium.gpu',
                        'tryserver.chromium.gpu',
                        'chromium.gpu.fyi']
  WEBKIT_GPU_BOTS = ['GPU Win Builder',
                     'GPU Win Builder (dbg)',
                     'GPU Win7 (NVIDIA)',
                     'GPU Win7 (dbg) (NVIDIA)',
                     'GPU Mac Builder',
                     'GPU Mac Builder (dbg)',
                     'GPU Mac10.7',
                     'GPU Mac10.7 (dbg)',
                     'GPU Linux Builder',
                     'GPU Linux Builder (dbg)',
                     'GPU Linux (NVIDIA)',
                     'GPU Linux (dbg) (NVIDIA)']
  # (waterfall, bot names) pairs for which only the listed bots are inspected.
  FILTERED_WATERFALLS = [('chromium.webkit', WEBKIT_GPU_BOTS)]

  @staticmethod
  def getJsonFromUrl(url):
    """Fetch `url` and return its body parsed as JSON."""
    conn = urllib2.urlopen(url)
    try:
      return json.loads(conn.read())
    finally:
      # Release the connection even when reading or parsing fails.
      conn.close()

  @staticmethod
  def getBuildersJsonForWaterfall(waterfall):
    """Return the JSON description of every builder on `waterfall`."""
    querystring = '?filter'
    return Waterfall.getJsonFromUrl(
        (Waterfall.BASE_JSON_BUILDERS_URL + '%s') % (waterfall, querystring))

  @staticmethod
  def getLastNBuildsForBuilder(n, waterfall, builder):
    """Return JSON for the `n` most recent builds of `builder`.

    Builds are requested with the relative selectors -1 .. -n. Returns an
    empty dict, without any network access, when n <= 0.
    """
    if n <= 0:
      return {}

    selectors = ''.join('select=-%d&' % (i + 1) for i in range(n))
    querystring = '?' + selectors + 'filter'

    return Waterfall.getJsonFromUrl(
        (Waterfall.BASE_JSON_BUILDS_URL + '%s') %
        (waterfall, urllib.quote(builder), querystring))

  @staticmethod
  def getFilteredBuildersJsonForWaterfall(waterfall, filter):
    """Return builder JSON for only the bot names listed in `filter`."""
    selectors = ''.join('select=%s&' % urllib.quote(bot_name)
                        for bot_name in filter)
    querystring = '?' + selectors + 'filter'

    return Waterfall.getJsonFromUrl(
        (Waterfall.BASE_JSON_BUILDERS_URL + '%s') % (waterfall, querystring))

  @staticmethod
  def getAllGpuBots():
    """Return {waterfall name: builders JSON} for all known GPU waterfalls."""
    allbots = {k: Waterfall.getBuildersJsonForWaterfall(k)
               for k in Waterfall.REGULAR_WATERFALLS}

    filteredbots = {
        name: Waterfall.getFilteredBuildersJsonForWaterfall(name, bot_names)
        for name, bot_names in Waterfall.FILTERED_WATERFALLS}

    allbots.update(filteredbots)

    return allbots

  @staticmethod
  def getOfflineBots(bots):
    """Return a GpuBot for every bot whose 'state' is 'offline'.

    Each returned bot has its bot_url set and, when a most recent completed
    build with 'times' is available, its end time.
    """
    offline_bots = []

    for waterfall_name in bots:
      waterfall = bots[waterfall_name]

      for bot_name in waterfall:
        bot = waterfall[bot_name]

        if bot['state'] != 'offline':
          continue

        gpu_bot = GpuBot(waterfall_name, bot_name, bot)
        gpu_bot.bot_url = Waterfall.BASE_BUILD_URL % (waterfall_name,
                                                      urllib.quote(bot_name))

        most_recent_build = Waterfall.getMostRecentlyCompletedBuildForBot(
            gpu_bot)

        if (most_recent_build and 'times' in most_recent_build and
            most_recent_build['times']):
          # Index 1 of 'times' is used as the build's end timestamp.
          gpu_bot.setEndTime(time.localtime(most_recent_build['times'][1]))
        else:
          errorNoMostRecentBuild(waterfall_name, bot_name)

        offline_bots.append(gpu_bot)

    return offline_bots

  @staticmethod
  def getMostRecentlyCompletedBuildForBot(bot):
    """Return the JSON of the bot's latest completed build, or None.

    The result is cached in bot.bot_data['most_recent_build'] (when bot_data
    is available) so later calls for the same bot data skip the query.
    """
    if bot.bot_data is not None and 'most_recent_build' in bot.bot_data:
      return bot.bot_data['most_recent_build']

    # Unfortunately, the JSON API doesn't provide a "most recent completed
    # build" call. We just have to get some number of the most recent (including
    # current, in-progress builds) and give up if that's not enough.
    NUM_BUILDS = 10
    builds = Waterfall.getLastNBuildsForBuilder(NUM_BUILDS, bot.waterfall_name,
                                                bot.bot_name)

    for i in range(NUM_BUILDS):
      # The response may hold fewer entries than requested; skip missing ones
      # instead of failing with a KeyError.
      current_build = builds.get('-%d' % (i + 1))
      if not current_build:
        continue

      # A build counts as completed once its 'results' field is set.
      if current_build.get('results') is not None:
        if bot.bot_data is not None:
          bot.bot_data['most_recent_build'] = current_build

        return current_build

    return None

  @staticmethod
  def getFailedBots(bots):
    """Return a GpuBot for every bot whose latest completed build failed.

    A build is treated as failed when 'failed' is an element of its 'text'
    list. Failed bots get failure_string, bot_url and build_url filled in.
    """
    failed_bots = []

    for waterfall_name in bots:
      waterfall = bots[waterfall_name]

      for bot_name in waterfall:
        bot = waterfall[bot_name]
        gpu_bot = GpuBot(waterfall_name, bot_name, bot)
        gpu_bot.bot_url = Waterfall.BASE_BUILD_URL % (waterfall_name,
                                                      urllib.quote(bot_name))

        most_recent_build = Waterfall.getMostRecentlyCompletedBuildForBot(
            gpu_bot)

        if (most_recent_build and 'text' in most_recent_build and
            'failed' in most_recent_build['text']):
          gpu_bot.failure_string = ' '.join(most_recent_build['text'])
          gpu_bot.build_url = Waterfall.SPECIFIC_BUILD_URL % (
              waterfall_name, urllib.quote(bot_name),
              most_recent_build['number'])
          failed_bots.append(gpu_bot)
        elif not most_recent_build:
          errorNoMostRecentBuild(waterfall_name, bot_name)

    return failed_bots

def formatTime(t):
  """Render the struct_time `t` as e.g. 'Thu, 01 Jan 1970 00:00:00'."""
  time_format = "%a, %d %b %Y %H:%M:%S"
  return time.strftime(time_format, t)

def roughTimeDiffInHours(t1, t2):
  """Return the absolute difference between two struct_times, in hours.

  Only the year..second fields are used, so the result is "rough": any
  timezone or DST information carried by the inputs is ignored.
  """
  def _to_datetime(t):
    return datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday,
                             t.tm_hour, t.tm_min, t.tm_sec)

  first = _to_datetime(t1)
  second = _to_datetime(t2)
  elapsed_seconds = float((first - second).total_seconds())

  return abs(elapsed_seconds / 3600.0)

def getBotStr(bot):
  """Return a multi-line, human-readable description of one bot.

  The first line identifies the bot; the failure, end time, bot url and build
  url lines appear only when the matching field is set. A blank line ends
  the description.
  """
  lines = [' %s::%s\n' % (bot.waterfall_name, bot.bot_name)]

  if bot.failure_string is not None:
    lines.append(' failure: %s\n' % bot.failure_string)

  if bot.getEndTime() is not None:
    end_time_text = formatTime(bot.getEndTime())
    lines.append(' last build end time: %s (roughly %f hours ago)\n' %
                 (end_time_text, bot.getHoursSinceLastRun()))

  if bot.bot_url is not None:
    lines.append(' bot url: %s\n' % bot.bot_url)

  if bot.build_url is not None:
    lines.append(' build url: %s\n' % bot.build_url)

  lines.append('\n')
  return ''.join(lines)

def getBotsStr(bots):
  """Return the descriptions of all `bots` followed by one extra newline."""
  descriptions = [getBotStr(bot) for bot in bots]
  return ''.join(descriptions) + '\n'

def getOfflineBotsStr(offline_bots):
  """Return the 'Offline bots:' report section for the given bots."""
  header = 'Offline bots:\n'
  return header + getBotsStr(offline_bots)

def getFailedBotsStr(failed_bots):
  """Return the 'Failed bots:' report section for the given bots."""
  header = 'Failed bots:\n'
  return header + getBotsStr(failed_bots)

def getBotDicts(bots):
  """Return a list holding bot.toDict() for each bot, in input order."""
  return [bot.toDict() for bot in bots]

def unserializeTime(t):
  """Build a struct_time from a dict with year/mon/day/hour/min/sec keys.

  The remaining three struct_time fields (weekday, yearday, isdst) are set
  to 0.
  """
  date_fields = tuple(t[key]
                      for key in ('year', 'mon', 'day', 'hour', 'min', 'sec'))
  return time.struct_time(date_fields + (0, 0, 0))

def serialTime(t):
  """Return the year..second fields of struct_time `t` as a plain dict."""
  key_to_attribute = (('year', 'tm_year'), ('mon', 'tm_mon'),
                      ('day', 'tm_mday'), ('hour', 'tm_hour'),
                      ('min', 'tm_min'), ('sec', 'tm_sec'))
  return dict((key, getattr(t, attribute))
              for key, attribute in key_to_attribute)

def getSummary(offline_bots, failed_bots):
  """Return {'offline': [...], 'failed': [...]} with each bot as a dict."""
  summary = {}
  summary['offline'] = getBotDicts(offline_bots)
  summary['failed'] = getBotDicts(failed_bots)
  return summary

def findBot(name, lst):
  """Return the first bot in `lst` whose bot_name equals `name`, else None."""
  matches = (bot for bot in lst if bot.bot_name == name)
  return next(matches, None)

def getNoteworthyEvents(offline_bots, failed_bots, previous_results):
  """Compare the current bot status with the previous run's status.

  Bots are matched on (waterfall_name, bot_name): the same bot name can exist
  on more than one waterfall, so matching on the bot name alone would confuse
  unrelated bots.

  Args:
    offline_bots: bots that are offline now.
    failed_bots: bots that are failing now.
    previous_results: dict with optional 'offline' and 'failed' bot lists
      from the previous run; may be empty.

  Returns:
    A dict with four bot lists:
      'offline': offline for at least CRITICAL_NUM_HOURS now, and either not
        offline previously or not yet past the threshold previously.
      'failed': failing now but not previously.
      'recovered_offline': previously offline past the threshold, no longer
        offline.
      'recovered_failures': previously failing, no longer failing.
  """
  CRITICAL_NUM_HOURS = 1.0

  def _findSameBot(bot, candidates):
    # Return the entry of `candidates` that denotes the same bot, or None.
    for candidate in candidates:
      if (candidate.bot_name == bot.bot_name and
          candidate.waterfall_name == bot.waterfall_name):
        return candidate
    return None

  def _isCriticallyOffline(bot):
    # An unknown duration (None) is explicitly treated as below the
    # threshold rather than relying on implicit None/float ordering.
    hours = bot.getHoursSinceLastRun()
    return hours is not None and hours >= CRITICAL_NUM_HOURS

  previous_offline = previous_results.get('offline', [])
  previous_failures = previous_results.get('failed', [])

  noteworthy_offline = []
  for bot in offline_bots:
    if not _isCriticallyOffline(bot):
      continue

    previous_bot = _findSameBot(bot, previous_offline)
    if previous_bot is None or not _isCriticallyOffline(previous_bot):
      noteworthy_offline.append(bot)

  noteworthy_new_failures = [
      bot for bot in failed_bots
      if _findSameBot(bot, previous_failures) is None]

  noteworthy_new_offline_recoveries = [
      bot for bot in previous_offline
      if _isCriticallyOffline(bot) and _findSameBot(bot, offline_bots) is None]

  noteworthy_new_failure_recoveries = [
      bot for bot in previous_failures
      if _findSameBot(bot, failed_bots) is None]

  return {'offline': noteworthy_offline, 'failed': noteworthy_new_failures,
          'recovered_failures': noteworthy_new_failure_recoveries,
          'recovered_offline': noteworthy_new_offline_recoveries}

def getNoteworthyStr(noteworthy_events):
  """Render the noteworthy events as text, one section per non-empty list.

  Each section consists of a header line, the description of every bot in
  it, and a trailing blank line. Returns '' when every list is empty.
  """
  sections = (
      ('offline', 'IMPORTANT bots newly offline for over an hour:\n'),
      ('failed', 'IMPORTANT new failing bots:\n'),
      ('recovered_offline',
       'IMPORTANT newly recovered previously offline bots:\n'),
      ('recovered_failures', 'IMPORTANT newly recovered failing bots:\n'),
  )

  pieces = []
  for key, header in sections:
    section_bots = noteworthy_events[key]
    if not section_bots:
      continue

    pieces.append(header)
    pieces.extend([getBotStr(bot) for bot in section_bots])
    pieces.append('\n')

  return ''.join(pieces)

def dictsToBots(bots):
  """Convert the 'offline' and 'failed' lists of bot dicts into GpuBots."""
  offline_bots = [GpuBot.fromDict(bot_dict) for bot_dict in bots['offline']]
  failed_bots = [GpuBot.fromDict(bot_dict) for bot_dict in bots['failed']]
  return {'offline': offline_bots, 'failed': failed_bots}

class GpuBotPoller:
  """Checks the GPU bots once per checkBots() call and reports the findings.

  Findings are printed, saved to a results file for comparison on the next
  run and, when an emailer is configured, mailed if they are noteworthy.
  """
  DEFAULT_PREVIOUS_RESULTS_FILE = '.check_gpu_bots_previous_results'

  def __init__(self, emailer, send_email_for_recovered_offline_bots,
               send_email_for_recovered_failing_bots, send_email_on_error,
               previous_results_file):
    """
    Args:
      emailer: object with a send_email(body) method, or None to disable
        email.
      send_email_for_recovered_offline_bots: also email on offline recoveries.
      send_email_for_recovered_failing_bots: also email on failure recoveries.
      send_email_on_error: email the error text when a check raises.
      previous_results_file: path of the results file, or None to use
        DEFAULT_PREVIOUS_RESULTS_FILE.
    """
    self.emailer = emailer

    self.send_email_for_recovered_offline_bots = (
        send_email_for_recovered_offline_bots)

    self.send_email_for_recovered_failing_bots = (
        send_email_for_recovered_failing_bots)

    self.send_email_on_error = send_email_on_error
    self.previous_results_file = previous_results_file

  def _resultsFilePath(self):
    """Return the configured results file path, or the default one."""
    if self.previous_results_file is not None:
      return self.previous_results_file

    return GpuBotPoller.DEFAULT_PREVIOUS_RESULTS_FILE

  def shouldEmail(self, noteworthy_events):
    """Return True when the events warrant a notification email.

    New offline or failing bots always do; recoveries only do when the
    matching send_email_for_recovered_* option is enabled.
    """
    if noteworthy_events['offline'] or noteworthy_events['failed']:
      return True

    if (self.send_email_for_recovered_offline_bots and
        noteworthy_events['recovered_offline']):
      return True

    if (self.send_email_for_recovered_failing_bots and
        noteworthy_events['recovered_failures']):
      return True

    return False

  def writeResults(self, summary):
    """Write `summary` as JSON to the results file, replacing its content."""
    with open(self._resultsFilePath(), 'w') as f:
      f.write(json.dumps(summary))

  def getPreviousResults(self):
    """Return the previous run's bots, or {} when no results file exists."""
    results_file = self._resultsFilePath()

    if not os.path.isfile(results_file):
      return {}

    with open(results_file, 'r') as f:
      return dictsToBots(json.loads(f.read()))

  def checkBots(self):
    """Run one full check: query, print, persist, and email if warranted.

    Any exception raised during the check is printed (and emailed when
    send_email_on_error is set and an emailer exists) instead of propagating.
    """
    time_str = 'Current time: %s\n\n' % (formatTime(time.localtime()))
    print(time_str)

    try:
      bots = Waterfall.getAllGpuBots()

      offline_bots = Waterfall.getOfflineBots(bots)
      offline_str = getOfflineBotsStr(offline_bots)
      print(offline_str)

      failed_bots = Waterfall.getFailedBots(bots)
      failed_str = getFailedBotsStr(failed_bots)
      print(failed_str)

      previous_results = self.getPreviousResults()
      noteworthy_events = getNoteworthyEvents(offline_bots, failed_bots,
                                              previous_results)

      noteworthy_str = getNoteworthyStr(noteworthy_events)
      print(noteworthy_str)

      summary = getSummary(offline_bots, failed_bots)
      self.writeResults(summary)

      if self.emailer is not None and self.shouldEmail(noteworthy_events):
        self.emailer.send_email(Emailer.format_email_body(
            time_str, offline_str, failed_str, noteworthy_str))
    except Exception as e:
      error_str = 'Error: %s' % str(e)
      print(error_str)

      # Guard against a missing emailer so that reporting an error can never
      # raise a second one out of the polling loop.
      if self.send_email_on_error and self.emailer is not None:
        self.emailer.send_email(error_str)

def parseArgs(sys_args):
  """Parse and validate the command line.

  Args:
    sys_args: argv-style list; element 0 is used as the program name and the
      remaining elements are parsed as options.

  Returns:
    The parsed argparse namespace. Invalid option combinations are reported
    through parser.error(), which exits the process.
  """
  parser = argparse.ArgumentParser(
      prog=sys_args[0],
      description=('Query the Chromium GPU Bots Waterfall, output '
                   'potential problems, and optionally repeat automatically '
                   'and/or email notifications of results.'))

  parser.add_argument(
      '--repeat-delay', type=int, dest='repeat_delay', required=False,
      help='How often to automatically re-run the script, in minutes.')

  parser.add_argument(
      '--email-from', type=str, dest='email_from', required=False,
      help=("Email address to send from. Requires also specifying "
            "'--email-to'."))

  parser.add_argument(
      '--email-to', type=str, dest='email_to', required=False, nargs='+',
      help=("Email address(es) to send to. Requires also specifying "
            "'--email-from'"))

  parser.add_argument(
      '--send-email-for-recovered-offline-bots',
      dest='send_email_for_recovered_offline_bots', action='store_true',
      default=False,
      help=('Send an email out when a bot which has been offline for more '
            'than 1 hour goes back online.'))

  parser.add_argument(
      '--send-email-for-recovered-failing-bots',
      dest='send_email_for_recovered_failing_bots', action='store_true',
      default=False,
      help='Send an email when a failing bot recovers.')

  parser.add_argument(
      '--send-email-on-error', dest='send_email_on_error',
      action='store_true', default=False,
      help=('Send an email when the script has an error. For example, if '
            'the server is unreachable.'))

  password_file_help = (
      "File containing the plaintext password of the source email "
      "account. By default, '%s' will be tried. If it does not exist, "
      "you will be prompted. If you opt to store your password on disk "
      "in plaintext, use of a dummy account is strongly recommended."
  ) % Emailer.DEFAULT_EMAIL_PASSWORD_FILE
  parser.add_argument(
      '--email-password-file', dest='email_password_file', required=False,
      help=password_file_help)

  results_file_help = (
      "File to store the results of the previous invocation of "
      "this script. By default, '%s' will be used."
  ) % GpuBotPoller.DEFAULT_PREVIOUS_RESULTS_FILE
  parser.add_argument(
      '--previous-results-file', dest='previous_results_file', required=False,
      help=results_file_help)

  args = parser.parse_args(sys_args[1:])

  # parser.error() exits, so only the first violated rule is ever reported;
  # the checks below run in the same order as the rules are listed.
  has_from = args.email_from is not None
  has_to = args.email_to is not None

  if has_from and not has_to:
    parser.error('--email-from requires --email-to.')

  if has_to and not has_from:
    parser.error('--email-to requires --email-from.')

  # Options that only make sense when emailing is configured.
  email_only_options = (
      ('--send-email-for-recovered-offline-bots',
       args.send_email_for_recovered_offline_bots),
      ('--send-email-for-recovered-failing-bots',
       args.send_email_for_recovered_failing_bots),
      ('--send-email-on-error', args.send_email_on_error),
  )
  for option_name, option_enabled in email_only_options:
    if not has_from and option_enabled:
      parser.error('%s requires --email-to and --email-from.' % option_name)

  if (args.email_password_file and
      not os.path.isfile(args.email_password_file)):
    parser.error('File does not exist: %s' % args.email_password_file)

  return args

def main(sys_args):
  """Script entry point: parse options, then check the bots once or forever.

  Returns:
    1 when emailing was requested but the test login failed, otherwise 0
    (reached only when no repeat delay was given).
  """
  args = parseArgs(sys_args)

  emailer = None
  emailing_requested = not (args.email_from is None or args.email_to is None)
  if emailing_requested:
    emailer = Emailer(args.email_from, args.email_to, args.email_password_file)

    # Fail fast on bad credentials instead of at the first notification.
    try:
      emailer.testEmailLogin()
    except Exception as e:
      print('Error logging into email account: %s' % str(e))
      return 1

  poller = GpuBotPoller(emailer,
                        args.send_email_for_recovered_offline_bots,
                        args.send_email_for_recovered_failing_bots,
                        args.send_email_on_error,
                        args.previous_results_file)

  poller.checkBots()

  # With a repeat delay, keep checking until the process is interrupted.
  while args.repeat_delay is not None:
    print('Will run again in %d minutes...\n' % args.repeat_delay)
    time.sleep(args.repeat_delay * 60)
    poller.checkBots()

  return 0

# When run as a script, use main()'s return value as the exit status.
if __name__ == '__main__':
  exit_status = main(sys.argv)
  sys.exit(exit_status)
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698