Chromium Code Reviews

Side by Side Diff: verification/try_server.py

Issue 7108020: Add automatic retry mechanism and LKGR support. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/commit-queue
Patch Set: Created 9 years, 6 months ago
1 # coding=utf8 1 # coding=utf8
2 # Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 """Sends patches to the Try server and reads back results. 5 """Sends patches to the Try server and reads back results.
6 """ 6 """
7 7
8 import time 8 import time
9 import logging 9 import logging
10 import urllib2 10 import urllib2
11 11
12 import find_depot_tools # pylint: disable=W0611 12 import find_depot_tools # pylint: disable=W0611
13 import trychange 13 import trychange
14 14
15 import buildbot_json 15 import buildbot_json
16 import model 16 import model
17 from thirdparty.datastructures import SortedDict
18 from verification import base 17 from verification import base
19 18
20 19
21 # We don't want to have trychange use gcl so block it. 20 # We don't want to have trychange use gcl so block it.
22 trychange.gcl = None 21 trychange.gcl = None
23 # Hack out trychange logging.info() 22 # Hack out trychange logging.info()
24 trychange.logging = logging.getLogger('trychange') 23 trychange.logging = logging.getLogger('trychange')
25 trychange.logging.setLevel(logging.WARNING) 24 trychange.logging.setLevel(logging.WARNING)
26 25
27 26
(...skipping 45 matching lines...)
73 self.try_jobs = [] 72 self.try_jobs = []
74 73
75 def get_state(self): 74 def get_state(self):
76 if not self.try_jobs: 75 if not self.try_jobs:
77 return base.PROCESSING 76 return base.PROCESSING
78 states = set(i.get_state() for i in self.try_jobs) 77 states = set(i.get_state() for i in self.try_jobs)
79 assert states.issubset(base.VALID_STATES) 78 assert states.issubset(base.VALID_STATES)
80 return max(states) 79 return max(states)
81 80
82 81
82 def steps_quality(steps):
83 if not steps:
84 return None
85 return all(v in (True, None) for v in steps)
86
87
83 class StepDb(object): 88 class StepDb(object):
84 """Lists all steps for each revision known to have passed at least *once*.""" 89 """Keeps statistics about all steps for each revisions."""
85 max_cache = 200 90 max_cache = 200
86 91
87 def __init__(self, builders): 92 def __init__(self, builders, buildbot):
88 self._need_full = True 93 self._need_full = True
89 # Builds with a patch. 94 self.builders = builders
90 self.patched_builds = dict((b, SortedDict()) for b in builders) 95 self.buildbot = buildbot
91 # Builds without a patch (with or without a clobber).
92 self.clean_builds = dict((b, SortedDict()) for b in builders)
93 96
94 def need_full(self): 97 def need_full(self):
95 result = self._need_full 98 result = self._need_full
96 self._need_full = False 99 self._need_full = False
97 return result 100 return result
98 101
99 def step_quality(self, builder, revision, step): 102 def revision_quality_builder_steps(self, builder, revision):
100 """Returns if a step is known to have passed at least one time, in the 103 steps = None
101 closed revision. 104 nb_builds = 0
105 for build in self.buildbot.builders[builder].builds.cached_children:
106 if build.revision != revision:
107 continue
108 nb_builds += 1
109 assert not steps or len(steps) == len(build.steps)
110 if not steps or len(steps) != len(build.steps):
111 # If the number of steps changed after a master restart, we need to
112 # ditch the previous steps.
113 # One workaround is to key by name.
114 steps = [None] * len(build.steps)
115 for step in build.steps:
116 steps[step.number] = or_3_way(
117 steps[step.number], step.simplified_result)
118 return steps, nb_builds
102 119
103 Warning: A step index is not comparable across builders since each builder 120 def last_good_revision_builder(self, builder):
104 has different steps in a different order. 121 """Returns LKGR for this builder."""
105 """ 122 state = {}
106 if (revision not in self.patched_builds[builder] and 123 for build in self.buildbot.builders[builder].builds.cached_children:
107 revision not in self.clean_builds[builder]): 124 state.setdefault(build.revision, [None] * len(build.steps))
125 for step in build.steps:
126 state[build.revision][step.number] = or_3_way(
127 state[build.revision][step.number],
128 step.simplified_result)
129
130 revisions = [
131 revision for revision in sorted(state)
132 if all(v in (True, None) for v in state[revision])
133 ]
134 if not revisions:
108 return None 135 return None
109 return ( 136 return revisions[-1]
110 (revision in self.patched_builds[builder] and
111 self.patched_builds[builder][revision][step]) or
112 (revision in self.clean_builds[builder] and
113 self.clean_builds[builder][revision][step]))
114
115 def revision_quality(self, revision):
116 """Returns True if a revision succeeded at least one time on at least one
117 builder.
118 """
119 return reduce(or_3_way,
120 (self.revision_quality_builder(b, revision)
121 for b in self.patched_builds))
122
123 def revision_quality_builder(self, builder, revision):
124 """Returns if a revision succeeded at least one time.
125
126 Warning: A step index is not comparable across builders since each builder
127 has different steps in a different order.
128 """
129 if (revision not in self.patched_builds[builder] and
130 revision not in self.clean_builds[builder]):
131 return None
132 bad_steps = []
133 for i, value in enumerate(self.patched_builds[builder].get(revision, [])):
134 if value is False:
135 bad_steps.append(i)
136 for i, value in enumerate(self.clean_builds[builder].get(revision, [])):
137 if value is False:
138 bad_steps.append(i)
139 if value and i in bad_steps:
140 bad_steps.remove(i)
141 return not bad_steps
142
143 def seen_revisions(self):
144 """Returns all revisions that returned some status."""
145 revisions = set()
146 for builder in self.patched_builds:
147 revisions |= set(self.patched_builds[builder].keys())
148 revisions |= set(self.clean_builds[builder].keys())
149 return sorted(revisions)
150
151 def good_revisions(self):
152 """Returns all revisions that succeeded on all builders."""
153 for revision in self.seen_revisions():
154 if self.revision_quality(revision):
155 yield revision
156
157 def bad_revisions(self):
158 """Returns all revisions that never succeeded on any builder."""
159 for revision in self.seen_revisions():
160 if self.revision_quality(revision) is False:
161 yield revision
162
163 def update(self, buildbot):
164 """Updates the internal db."""
165 for builder in self.clean_builds:
166 # Only access builds already cached.
167 for build in buildbot.builders[builder].builds.cached_children:
168 if build.data['sourceStamp'].get('hasPatch', False):
169 b = self.patched_builds[builder]
170 else:
171 b = self.clean_builds[builder]
172 new_values = [s.simplified_result for s in build.steps]
173 if build.revision not in b:
174 b[build.revision] = new_values
175 else:
176 len_b = len(b[build.revision])
177 len_n = len(new_values)
178 new_length = max(len_b, len_n)
179 b[build.revision].extend([None] * (new_length - len_b))
180 new_values.extend([None] * (new_length - len_n))
181 b[build.revision] = [
182 or_3_way(old_value, new_values[i])
183 for i, old_value in enumerate(b[build.revision])
184 ]
185
186 for builds in self.patched_builds.itervalues():
187 while len(builds) > self.max_cache:
188 builds.popitem(builds.keyOrders[0])
189 for builds in self.clean_builds.itervalues():
190 while len(builds) > self.max_cache:
191 builds.popitem(builds.keyOrders[0])
192 137
193 138
194 class TryRunner(base.Verifier): 139 class TryRunner(base.Verifier):
195 """Stateless communication with a try server. 140 """Stateless communication with a try server.
196 141
197 Sends try jobs and reads try job status. 142 Sends try jobs and reads try job status.
198 143
199 Analysis goes as follows: 144 Analysis goes as follows:
200 - compile step is not flaky. compile.py already takes care of most flakiness 145 - compile step is not flaky. compile.py already takes care of most flakiness
201 and clobber build is done by default. 146 and clobber build is done by default.
(...skipping 10 matching lines...)
212 """ 157 """
213 name = 'try server' 158 name = 'try server'
214 159
215 # A try job sent this long ago and that hasn't started yet is deemed to be 160 # A try job sent this long ago and that hasn't started yet is deemed to be
216 # lost. 161 # lost.
217 lost_try_job_delay = 15*60 162 lost_try_job_delay = 15*60
218 163
219 # Only updates a job status once every 60 seconds. 164 # Only updates a job status once every 60 seconds.
220 update_latency = 60 165 update_latency = 60
221 166
222 def __init__(self, try_server_url, commit_user, builders, tests, extra_flags): 167 def __init__(self, try_server_url, commit_user, builders, tests, extra_flags,
168 lkgr):
223 super(TryRunner, self).__init__() 169 super(TryRunner, self).__init__()
224 self.commit_user = commit_user 170 self.commit_user = commit_user
225 self.try_server_url = try_server_url 171 self.try_server_url = try_server_url
226 self.builders = builders 172 self.builders = builders
227 self.tests = tests 173 self.tests = tests
228 self.extra_flags = extra_flags or [] 174 self.extra_flags = extra_flags or []
229 self.status = buildbot_json.Buildbot(self.try_server_url) 175 self.status = buildbot_json.Buildbot(self.try_server_url)
230 self.step_db = StepDb(builders) 176 self.step_db = StepDb(self.builders, self.status)
231 self.last_update = time.time() - self.update_latency 177 self.last_update = time.time() - self.update_latency
178 self.lkgr = lkgr
232 179
233 def verify(self, pending, revision): 180 def verify(self, pending, revision):
234 """Sends a try job to the try server and returns a TryJob list.""" 181 """Sends a try job to the try server and returns a TryJob list."""
235 jobs = pending.verifications.setdefault(self.name, TryJobs()) 182 jobs = pending.verifications.setdefault(self.name, TryJobs())
236 jobs.try_jobs = jobs.try_jobs or [] 183 jobs.try_jobs = jobs.try_jobs or []
237 assert not jobs.try_jobs 184 assert not jobs.try_jobs
238 new_jobs = [ 185 new_jobs = [
239 TryJob(builder, str(revision), False) for builder in self.builders] 186 TryJob(builder, str(revision), False) for builder in self.builders]
240 jobs.try_jobs.extend(new_jobs) 187 jobs.try_jobs.extend(new_jobs)
241 self._send_jobs(pending, new_jobs) 188 self._send_jobs(pending, new_jobs)
242 # Slightly postpone next check. 189 # Slightly postpone next check.
243 self.last_update = min( 190 self.last_update = min(
244 time.time(), self.last_update + (self.update_latency / 4)) 191 time.time(), self.last_update + (self.update_latency / 4))
245 192
246 def update_status(self, queue): 193 def update_status(self, queue):
247 """Grabs the current status of all try jobs and update self.queue. 194 """Grabs the current status of all try jobs and update self.queue.
248 195
249 Note: it would be more efficient to be event based. 196 Note: it would be more efficient to be event based.
250 """ 197 """
251 if not queue: 198 if not queue:
252 logging.debug('The list is empty, nothing to do') 199 logging.debug('The list is empty, nothing to do')
253 return 200 return
254 201
255 if time.time() - self.last_update < self.update_latency: 202 if time.time() - self.last_update < self.update_latency:
256 logging.debug('Throttling updates') 203 logging.debug('Throttling updates')
257 return 204 return
258 self.last_update = time.time() 205 self.last_update = time.time()
259 206
260 self._reset_cache(queue) 207 self._reset_cache(queue)
261 self.step_db.update(self.status)
262 208
263 # Do the actual processing to update the TryJob status. 209 # Do the actual processing to update the TryJob status.
264 for pending, jobs in self.loop(queue, TryJobs, True): 210 for pending, jobs in self.loop(queue, TryJobs, True):
265 for job in jobs.try_jobs: 211 for job in jobs.try_jobs:
266 if job.get_state() != base.PROCESSING: 212 if job.get_state() != base.PROCESSING:
267 continue 213 continue
268 # There's one try job per builder. 214 # There's one try job per builder.
269 # TODO(maruel): There should be differentiation when there's multiple 215 # TODO(maruel): There should be differentiation when there's multiple
270 # jobs for a single builder. 216 # jobs for a single builder.
271 build = None 217 build = None
(...skipping 49 matching lines...)
321 self._upgrade(queue) 267 self._upgrade(queue)
322 268
323 jobs_to_update = [] 269 jobs_to_update = []
324 for _, jobs in self.loop(queue, TryJobs, True): 270 for _, jobs in self.loop(queue, TryJobs, True):
325 jobs_to_update.extend( 271 jobs_to_update.extend(
326 job for job in jobs.try_jobs if job.get_state() == base.PROCESSING) 272 job for job in jobs.try_jobs if job.get_state() == base.PROCESSING)
327 273
328 # First determine what data is needed. 274 # First determine what data is needed.
329 builds_to_cache = {} 275 builds_to_cache = {}
330 if self.step_db.need_full(): 276 if self.step_db.need_full():
331 logging.info('Fetching all try jobs status because of good_revisions') 277 logging.info('Fetching all try jobs status to fetch good revisions')
332 builders_to_cache = self.builders 278 builders_to_cache = self.builders
333 else: 279 else:
334 builders_to_cache = set() 280 builders_to_cache = set()
335 for job in jobs_to_update: 281 for job in jobs_to_update:
336 if job.build is None: 282 if job.build is None:
337 builders_to_cache.add(job.builder) 283 builders_to_cache.add(job.builder)
338 else: 284 else:
339 if job.get_state() == base.PROCESSING: 285 if job.get_state() == base.PROCESSING:
340 builds_to_cache.setdefault(job.builder, []).append(job.build) 286 builds_to_cache.setdefault(job.builder, []).append(job.build)
341 287
(...skipping 12 matching lines...)
354 del builds_to_cache[builder] 300 del builds_to_cache[builder]
355 301
356 # Cache remaining builds. Sort to make testing simpler. 302 # Cache remaining builds. Sort to make testing simpler.
357 for builder, builds in sorted( 303 for builder, builds in sorted(
358 builds_to_cache.iteritems(), key=lambda x: x[0]): 304 builds_to_cache.iteritems(), key=lambda x: x[0]):
359 self.status.builders[builder].builds.cache_partial(builds) 305 self.status.builders[builder].builds.cache_partial(builds)
360 306
361 def _send_job(self, pending, revision, clobber, builders, tests=None): 307 def _send_job(self, pending, revision, clobber, builders, tests=None):
362 """Sends a try job.""" 308 """Sends a try job."""
363 # TODO(maruel): If revision is in self.bad_revisions[builder], choose 309 # TODO(maruel): If revision is in self.bad_revisions[builder], choose
364 # max(self.good_revisions[builder]) ? That can't easily be done since the 310 # self.last_good_revision_builder(builder) ? That can't easily be done since
365 # patch was already applied. 311 # the patch was already applied.
366 builders = builders or self.builders 312 builders = builders or self.builders
367 tests = tests or self.tests 313 tests = tests or self.tests
368 cmd = [ 314 cmd = [
369 '--no_gclient', 315 '--no_gclient',
370 '--bot', ','.join(builders), 316 '--bot', ','.join(builders),
371 '--revision', str(revision), 317 '--revision', str(revision),
372 '--name', pending.pending_name(), 318 '--name', pending.pending_name(),
373 '--user', self.commit_user.split('@', 1)[0], 319 '--user', self.commit_user.split('@', 1)[0],
374 '--email', ','.join((self.commit_user, pending.owner)), 320 '--email', ','.join((self.commit_user, pending.owner)),
375 '--rietveld_url', pending.patch_url(), 321 '--rietveld_url', pending.patch_url(),
(...skipping 47 matching lines...)
423 job.build = build.number 369 job.build = build.number
424 return build 370 return build
425 return None 371 return None
426 372
427 def _build_status_url(self, job): 373 def _build_status_url(self, job):
428 """Html url for this try job.""" 374 """Html url for this try job."""
429 assert job.build is not None, str(job) 375 assert job.build is not None, str(job)
430 return '%s/buildstatus?builder=%s&number=%s' % ( 376 return '%s/buildstatus?builder=%s&number=%s' % (
431 self.try_server_url.rstrip('/'), job.builder, job.build) 377 self.try_server_url.rstrip('/'), job.builder, job.build)
432 378
379 def _error_msg(self, name, job, failed_steps):
380 """Constructs the error message."""
381 def steps_to_str(steps):
382 if len(steps) > 1:
383 return 'steps "%s"' % ', '.join(steps)
384 else:
385 return 'step "%s"' % steps[0]
386
387 msg = u'Try job failure for %s on %s for %s' % (
388 name, job.builder, steps_to_str(failed_steps))
389 if job.clobber:
390 msg += ' (clobber build)'
391 msg += '.'
392 if job.failed_steps:
393 msg += u'\nIt\'s a second try, previously, %s failed.' % (
394 steps_to_str(job.failed_steps))
395 msg += '\n%s' % self._build_status_url(job)
396 logging.info(msg)
397 return msg
398
399 def get_lkgr(self, builder):
400 """Caches builds for a builder so lkgr is more useful."""
401 return max(self.step_db.last_good_revision_builder(builder), self.lkgr())
402
433 def _handle_try_job(self, pending, jobs, job, build): 403 def _handle_try_job(self, pending, jobs, job, build):
434 """Determines if the try job is a good signal to commit the patch.""" 404 """Determines if the try job is a good signal to commit the patch."""
435 if build.simplified_result is None: 405 if build.simplified_result is None:
436 # The build hasn't completed yet. 406 # The build hasn't completed yet.
437 return 407 return
438 assert job.result is None 408 assert job.result is None
439 assert job.build is not None 409 assert job.build is not None
440 job.result = build.result 410 job.result = build.result
411 # Warning: This code assumes that steps do not abort the build on failure.
441 failed_steps = [ 412 failed_steps = [
442 step.name for step in build.steps if step.simplified_result is False] 413 step.name for step in build.steps if step.simplified_result is False]
443 if job.get_state() != base.FAILED: 414 if job.get_state() != base.FAILED:
444 assert not failed_steps 415 assert not failed_steps
445 logging.info(u'Try job status for %s on %s: %s\n%s' % ( 416 logging.info(u'Try job status for %s on %s: %s\n%s' % (
446 pending.pending_name(), 417 pending.pending_name(),
447 job.builder, 418 job.builder,
448 job.result, 419 job.result,
449 self._build_status_url(job))) 420 self._build_status_url(job)))
450 return 421 return
451 422
452 assert failed_steps 423 msg = self._error_msg(pending.pending_name(), job, failed_steps)
453 msg = (u'Try job failure for %s on %s for step%s %s:\n%s' % ( 424 steps, _ = self.step_db.revision_quality_builder_steps(
454 pending.pending_name(), 425 job.builder, int(job.revision))
455 job.builder, 426 quality = steps_quality(steps)
456 's' if len(failed_steps) > 1 else '',
457 ', '.join(failed_steps),
458 self._build_status_url(job)))
459 logging.info(msg)
460 427
461 # Special case update and compile. 428 def retry(msg2, **kwargs):
462 if 'update' in job.failed_steps: 429 """Retry a try job. Will use LKGR if quality is bad."""
463 logging.debug('update is always a major failure') 430 if not quality:
464 jobs.error_message = msg 431 lkgr = self.get_lkgr(job.builder)
432 if lkgr is None:
433 logging.error('lkgr should never be None.')
434 fail('Couldn\'t find a good revision, aborting.')
435 return
436 job.revision = lkgr
437 logging.info(
438 'Retrying %s on %s, %s; rev=%s; %s' %
439 (pending.pending_name(), job.builder, kwargs, job.revision, msg2))
465 job.failed_steps = failed_steps 440 job.failed_steps = failed_steps
466 return 441 self._send_jobs(pending, [job], **kwargs)
467 if failed_steps == ['compile'] and not job.clobber: 442
468 logging.info('Trying again with clobber') 443 def fail(msg2):
469 # Note this would reset flaky if tests if there has been. This is fine 444 jobs.error_message = msg + msg2
470 # since a slave could be broken. 445 logging.info(jobs.error_message)
471 job.failed_steps = failed_steps 446 job.failed_steps = failed_steps
472 job.clobber = True
473 self._send_jobs(pending, [job])
474 return
475 447
476 # Look at the quality of the revision on this builder. 448 if 'update' in failed_steps:
477 # TODO(maruel): We should record the number of builds that were done on this 449 # Look at update quality specifically since it's a special step.
478 # revision? One or 2 builds don't give much signal. 450 # Do not take into account nb_builds == 1.
479 quality = self.step_db.revision_quality_builder( 451 if not quality and not steps[build.steps['update'].number]:
480 job.builder, int(job.revision)) 452 # 'update' never passed.
453 return retry('update has no quality')
454
455 return fail(
456 '\n\nStep "update" is always a major failure.\n'
457 'Look at the try server FAQ for more details.')
458
459 if 'compile' in failed_steps:
460 if not job.clobber:
461 # Note: this resets any test failure recorded on the previous try. This
462 # is fine since a slave could be broken.
463 job.clobber = True
464 return retry('retry compile with clobber')
465
466 return fail('')
481 467
482 if quality: 468 if quality:
483 if job.failed_steps: 469 if job.failed_steps:
484 logging.info('It\'s a second retry for %s on %s, abort' % ( 470 # The job had already failed.
485 pending.pending_name(), job.builder)) 471 return fail('')
486 jobs.error_message = msg
487 job.failed_steps = failed_steps
488 else:
489 logging.info(
490 'Retrying %s on %s' % (pending.pending_name(), job.builder))
491 job.failed_steps = failed_steps
492 self._send_jobs(pending, [job], tests=job.failed_steps)
493 return
494 472
495 # TODO(maruel): Implement better auto-retry. 473 return retry('Quality but first try', tests=failed_steps)
496 jobs.error_message = msg 474
497 job.failed_steps = failed_steps 475 # TODO(maruel): It would make sense to do a clobber build to see if the
476 # revision is indeed broken, since this algorithm assumes that the try
477 # server is continuously used for recent revisions!
478 # The revision looks like it's broken; retry with lkgr instead.
479 return retry('No quality, no idea', tests=failed_steps)
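
A minimal, self-contained sketch of the three-valued step aggregation this change relies on. or_3_way() is defined elsewhere in try_server.py, outside the hunks shown above, so the version below is only an assumption inferred from its call sites and from the old docstring 'known to have passed at least *once*'; steps_quality() mirrors the helper added in this patch set. Illustrative only, not code from the CL.

# Assumed semantics for or_3_way() (not shown in this diff view): True means
# the step passed at least once, False means it has only failed so far, and
# None means no data yet. Inferred from call sites, not the real
# implementation.
def or_3_way(old, new):
  if old is True or new is True:
    return True
  if old is False or new is False:
    return False
  return None


# Mirrors the steps_quality() helper added in this patch set: a revision is
# "good" on a builder when no aggregated step is known to have failed without
# ever passing.
def steps_quality(steps):
  if not steps:
    return None
  return all(v in (True, None) for v in steps)


if __name__ == '__main__':
  # Two builds of the same revision: step 1 failed in the first build but
  # passed in the second, so the revision still has quality on this builder.
  build_a = [True, False, None]
  build_b = [True, True, None]
  aggregated = [or_3_way(a, b) for a, b in zip(build_a, build_b)]
  assert aggregated == [True, True, None]
  assert steps_quality(aggregated) is True

Under this reading, last_good_revision_builder() simply returns the newest revision whose aggregated steps all pass this check, and get_lkgr() takes the newer of that value and the global LKGR.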
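
The retry policy in _handle_try_job() is easier to see in isolation. The sketch below is an illustrative condensation under simplifying assumptions (it drops the extra check on whether the 'update' step ever passed for the revision, plus the bookkeeping of job.failed_steps, job.revision and the error message); it is not the CL's actual method.

# Illustrative condensation of the new retry policy; not code from the CL.
def decide(failed_steps, quality, already_clobbered, already_failed_once):
  """Returns (action, use_lkgr).

  use_lkgr mirrors retry() swapping the job's revision to LKGR whenever the
  revision has no known-good signal.
  """
  if 'update' in failed_steps:
    # A failed 'update' is normally fatal; it is only retried when the
    # revision itself has no known-good signal.
    return ('retry', True) if not quality else ('fail', False)
  if 'compile' in failed_steps:
    # compile.py already absorbs most flakiness: one clobber retry, then fail.
    if not already_clobbered:
      return ('retry-clobber', not quality)
    return ('fail', False)
  if quality:
    # The revision is known good, so the failure is blamed on the patch after
    # a single retry of the failed tests.
    if already_failed_once:
      return ('fail', False)
    return ('retry-tests', False)
  # No signal about the revision itself: retry the failed tests against LKGR.
  return ('retry-tests', True)


if __name__ == '__main__':
  # First compile failure on a revision with no known-good signal: retry with
  # a clobber build, falling back to LKGR.
  assert decide(['compile'], None, False, False) == ('retry-clobber', True)

Either way, a permanent failure goes through fail(), which records the message built by _error_msg() on jobs.error_message and marks job.failed_steps.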