Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(214)

Unified Diff: verification/try_server.py

Issue 7108020: Add automatic retry mechanism and LKGR support. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/commit-queue
Patch Set: Created 9 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« tests/try_server_test.py ('K') | « thirdparty/datastructures.py ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: verification/try_server.py
diff --git a/verification/try_server.py b/verification/try_server.py
index d9c17da49b003546e2e9e178403fe9c60813007f..c6d410dd634a8c9bf389c4328875f62544d5df82 100644
--- a/verification/try_server.py
+++ b/verification/try_server.py
@@ -14,7 +14,6 @@ import trychange
import buildbot_json
import model
-from thirdparty.datastructures import SortedDict
from verification import base
@@ -80,115 +79,61 @@ class TryJobs(base.IVerifierStatus):
return max(states)
+def steps_quality(steps):
+ if not steps:
+ return None
+ return all(v in (True, None) for v in steps)
+
+
class StepDb(object):
- """Lists all steps for each revision known to have passed at least *once*."""
+  """Keeps statistics about all steps for each revision."""
max_cache = 200
- def __init__(self, builders):
+ def __init__(self, builders, buildbot):
self._need_full = True
- # Builds with a patch.
- self.patched_builds = dict((b, SortedDict()) for b in builders)
- # Builds without a patch (with or without a clobber).
- self.clean_builds = dict((b, SortedDict()) for b in builders)
+ self.builders = builders
+ self.buildbot = buildbot
def need_full(self):
result = self._need_full
self._need_full = False
return result
- def step_quality(self, builder, revision, step):
- """Returns if a step is known to have passed at least one time, in the
- closed revision.
-
- Warning: A step index is not comparable across builders since each builder
- has different steps in a different order.
- """
- if (revision not in self.patched_builds[builder] and
- revision not in self.clean_builds[builder]):
- return None
- return (
- (revision in self.patched_builds[builder] and
- self.patched_builds[builder][revision][step]) or
- (revision in self.clean_builds[builder] and
- self.clean_builds[builder][revision][step]))
-
- def revision_quality(self, revision):
- """Returns True if a revision succeeded at least one time on at least one
- builder.
- """
- return reduce(or_3_way,
- (self.revision_quality_builder(b, revision)
- for b in self.patched_builds))
-
- def revision_quality_builder(self, builder, revision):
- """Returns if a revision succeeded at least one time.
-
- Warning: A step index is not comparable across builders since each builder
- has different steps in a different order.
- """
- if (revision not in self.patched_builds[builder] and
- revision not in self.clean_builds[builder]):
+ def revision_quality_builder_steps(self, builder, revision):
+ steps = None
+ nb_builds = 0
+ for build in self.buildbot.builders[builder].builds.cached_children:
+ if build.revision != revision:
+ continue
+ nb_builds += 1
+ assert not steps or len(steps) == len(build.steps)
+ if not steps or len(steps) != len(build.steps):
+ # If the number of steps changed after a master restart, we need to
+ # ditch the previous steps.
+ # One workaround is to key by name.
+ steps = [None] * len(build.steps)
+ for step in build.steps:
+ steps[step.number] = or_3_way(
+ steps[step.number], step.simplified_result)
+ return steps, nb_builds
+
+ def last_good_revision_builder(self, builder):
+ """Returns LKGR for this builder."""
+ state = {}
+ for build in self.buildbot.builders[builder].builds.cached_children:
+ state.setdefault(build.revision, [None] * len(build.steps))
+ for step in build.steps:
+ state[build.revision][step.number] = or_3_way(
+ state[build.revision][step.number],
+ step.simplified_result)
+
+ revisions = [
+ revision for revision in sorted(state)
+ if all(v in (True, None) for v in state[revision])
+ ]
+ if not revisions:
return None
- bad_steps = []
- for i, value in enumerate(self.patched_builds[builder].get(revision, [])):
- if value is False:
- bad_steps.append(i)
- for i, value in enumerate(self.clean_builds[builder].get(revision, [])):
- if value is False:
- bad_steps.append(i)
- if value and i in bad_steps:
- bad_steps.remove(i)
- return not bad_steps
-
- def seen_revisions(self):
- """Returns all revisions that returned some status."""
- revisions = set()
- for builder in self.patched_builds:
- revisions |= set(self.patched_builds[builder].keys())
- revisions |= set(self.clean_builds[builder].keys())
- return sorted(revisions)
-
- def good_revisions(self):
- """Returns all revisions that succeeded on all builders."""
- for revision in self.seen_revisions():
- if self.revision_quality(revision):
- yield revision
-
- def bad_revisions(self):
- """Returns all revisions that never succeeded on any builder."""
- for revision in self.seen_revisions():
- if self.revision_quality(revision) is False:
- yield revision
-
- def update(self, buildbot):
- """Updates the internal db."""
- for builder in self.clean_builds:
- # Only access builds already cached.
- for build in buildbot.builders[builder].builds.cached_children:
- if build.data['sourceStamp'].get('hasPatch', False):
- b = self.patched_builds[builder]
- else:
- b = self.clean_builds[builder]
- new_values = [s.simplified_result for s in build.steps]
- if build.revision not in b:
- b[build.revision] = new_values
- else:
- len_b = len(b[build.revision])
- len_n = len(new_values)
- new_length = max(len_b, len_n)
- b[build.revision].extend([None] * (new_length - len_b))
- new_values.extend([None] * (new_length - len_n))
- b[build.revision] = [
- or_3_way(old_value, new_values[i])
- for i, old_value in enumerate(b[build.revision])
- ]
-
- for builds in self.patched_builds.itervalues():
- while len(builds) > self.max_cache:
- builds.popitem(builds.keyOrders[0])
- for builds in self.clean_builds.itervalues():
- while len(builds) > self.max_cache:
- builds.popitem(builds.keyOrders[0])
+ return revisions[-1]
class TryRunner(base.Verifier):
@@ -219,7 +164,8 @@ class TryRunner(base.Verifier):
# Only updates a job status once every 60 seconds.
update_latency = 60
- def __init__(self, try_server_url, commit_user, builders, tests, extra_flags):
+ def __init__(self, try_server_url, commit_user, builders, tests, extra_flags,
+ lkgr):
super(TryRunner, self).__init__()
self.commit_user = commit_user
self.try_server_url = try_server_url
@@ -227,8 +173,9 @@ class TryRunner(base.Verifier):
self.tests = tests
self.extra_flags = extra_flags or []
self.status = buildbot_json.Buildbot(self.try_server_url)
- self.step_db = StepDb(builders)
+ self.step_db = StepDb(self.builders, self.status)
self.last_update = time.time() - self.update_latency
+ self.lkgr = lkgr
def verify(self, pending, revision):
"""Sends a try job to the try server and returns a TryJob list."""
@@ -258,7 +205,6 @@ class TryRunner(base.Verifier):
self.last_update = time.time()
self._reset_cache(queue)
- self.step_db.update(self.status)
# Do the actual processing to update the TryJob status.
for pending, jobs in self.loop(queue, TryJobs, True):
@@ -328,7 +274,7 @@ class TryRunner(base.Verifier):
# First determine what data is needed.
builds_to_cache = {}
if self.step_db.need_full():
- logging.info('Fetching all try jobs status because of good_revisions')
+ logging.info('Fetching all try jobs status to fetch good revisions')
builders_to_cache = self.builders
else:
builders_to_cache = set()
@@ -361,8 +307,8 @@ class TryRunner(base.Verifier):
def _send_job(self, pending, revision, clobber, builders, tests=None):
"""Sends a try job."""
# TODO(maruel): If revision is in self.bad_revisions[builder], choose
- # max(self.good_revisions[builder]) ? That can't easily be done since the
- # patch was already applied.
+ # self.last_good_revision_builder(builder) ? That can't easily be done since
+ # the patch was already applied.
builders = builders or self.builders
tests = tests or self.tests
cmd = [
@@ -430,6 +376,30 @@ class TryRunner(base.Verifier):
return '%s/buildstatus?builder=%s&number=%s' % (
self.try_server_url.rstrip('/'), job.builder, job.build)
+ def _error_msg(self, name, job, failed_steps):
+ """Constructs the error message."""
+ def steps_to_str(steps):
+ if len(steps) > 1:
+ return 'steps "%s"' % ', '.join(steps)
+ else:
+ return 'step "%s"' % steps[0]
+
+ msg = u'Try job failure for %s on %s for %s' % (
+ name, job.builder, steps_to_str(failed_steps))
+ if job.clobber:
+ msg += ' (clobber build)'
+ msg += '.'
+ if job.failed_steps:
+ msg += u'\nIt\'s a second try, previously, %s failed.' % (
+ steps_to_str(job.failed_steps))
+ msg += '\n%s' % self._build_status_url(job)
+ logging.info(msg)
+ return msg
+
+ def get_lkgr(self, builder):
+    """Returns the max of the builder's last good revision and the global LKGR."""
+ return max(self.step_db.last_good_revision_builder(builder), self.lkgr())
+
def _handle_try_job(self, pending, jobs, job, build):
"""Determines if the try job is a good signal to commit the patch."""
if build.simplified_result is None:
@@ -438,6 +408,7 @@ class TryRunner(base.Verifier):
assert job.result is None
assert job.build is not None
job.result = build.result
+ # Warning: This code assumes that steps do not abort build on failure.
failed_steps = [
step.name for step in build.steps if step.simplified_result is False]
if job.get_state() != base.FAILED:
@@ -449,49 +420,60 @@ class TryRunner(base.Verifier):
self._build_status_url(job)))
return
- assert failed_steps
- msg = (u'Try job failure for %s on %s for step%s %s:\n%s' % (
- pending.pending_name(),
- job.builder,
- 's' if len(failed_steps) > 1 else '',
- ', '.join(failed_steps),
- self._build_status_url(job)))
- logging.info(msg)
-
- # Special case update and compile.
- if 'update' in job.failed_steps:
- logging.debug('update is always a major failure')
- jobs.error_message = msg
+ msg = self._error_msg(pending.pending_name(), job, failed_steps)
+ steps, _ = self.step_db.revision_quality_builder_steps(
+ job.builder, int(job.revision))
+ quality = steps_quality(steps)
+
+ def retry(msg2, **kwargs):
+ """Retry a try job. Will use LKGR if quality is bad."""
+ if not quality:
+ lkgr = self.get_lkgr(job.builder)
+ if lkgr is None:
+ logging.error('lkgr should never be None.')
+ fail('Couldn\'t find a good revision, aborting.')
+ return
+ job.revision = lkgr
+ logging.info(
+ 'Retrying %s on %s, %s; rev=%s; %s' %
+ (pending.pending_name(), job.builder, kwargs, job.revision, msg2))
job.failed_steps = failed_steps
- return
- if failed_steps == ['compile'] and not job.clobber:
- logging.info('Trying again with clobber')
- # Note this would reset flaky if tests if there has been. This is fine
- # since a slave could be broken.
+ self._send_jobs(pending, [job], **kwargs)
+
+ def fail(msg2):
+ jobs.error_message = msg + msg2
+ logging.info(jobs.error_message)
job.failed_steps = failed_steps
- job.clobber = True
- self._send_jobs(pending, [job])
- return
- # Look at the quality of the revision on this builder.
- # TODO(maruel): We should record the number of builds that were done on this
- # revision? One or 2 builds don't give much signal.
- quality = self.step_db.revision_quality_builder(
- job.builder, int(job.revision))
+ if 'update' in failed_steps:
+ # Look at update quality specifically since it's a special step.
+      # Do not take into account nb_builds == 1.
+ if not quality and not steps[build.steps['update'].number]:
+ # 'update' never passed.
+ return retry('update has no quality')
+
+ return fail(
+ '\n\nStep "update" is always a major failure.\n'
+ 'Look at the try server FAQ for more details.')
+
+ if 'compile' in failed_steps:
+ if not job.clobber:
+        # Note: this resets a previous test failure if there was one on the
+        # previous try. This is fine since a slave could be broken.
+ job.clobber = True
+ return retry('retry compile with clobber')
+
+ return fail('')
if quality:
if job.failed_steps:
- logging.info('It\'s a second retry for %s on %s, abort' % (
- pending.pending_name(), job.builder))
- jobs.error_message = msg
- job.failed_steps = failed_steps
- else:
- logging.info(
- 'Retrying %s on %s' % (pending.pending_name(), job.builder))
- job.failed_steps = failed_steps
- self._send_jobs(pending, [job], tests=job.failed_steps)
- return
+ # The job had already failed.
+ return fail('')
+
+ return retry('Quality but first try', tests=failed_steps)
- # TODO(maruel): Implement better auto-retry.
- jobs.error_message = msg
- job.failed_steps = failed_steps
+ # TODO(maruel): It would make sense to do a clobber build to see if the
+ # revision is indeed broken, since this algorithm assumes that the try
+ # server is continuously used for recent revisions!
+ # The revision looks like it's broken, retry with lkgr instead.
+ return retry('No quality, no idea', tests=failed_steps)
« tests/try_server_test.py ('K') | « thirdparty/datastructures.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698