Index: verification/try_server.py |
diff --git a/verification/try_server.py b/verification/try_server.py |
index d9c17da49b003546e2e9e178403fe9c60813007f..c6d410dd634a8c9bf389c4328875f62544d5df82 100644 |
--- a/verification/try_server.py |
+++ b/verification/try_server.py |
@@ -14,7 +14,6 @@ import trychange |
import buildbot_json |
import model |
-from thirdparty.datastructures import SortedDict |
from verification import base |
@@ -80,115 +79,61 @@ class TryJobs(base.IVerifierStatus): |
return max(states) |
+def steps_quality(steps): |
+ if not steps: |
+ return None |
+ return all(v in (True, None) for v in steps) |
+ |
+ |
class StepDb(object): |
- """Lists all steps for each revision known to have passed at least *once*.""" |
+  """Keeps statistics about all steps for each revision.""" |
max_cache = 200 |
- def __init__(self, builders): |
+ def __init__(self, builders, buildbot): |
self._need_full = True |
- # Builds with a patch. |
- self.patched_builds = dict((b, SortedDict()) for b in builders) |
- # Builds without a patch (with or without a clobber). |
- self.clean_builds = dict((b, SortedDict()) for b in builders) |
+ self.builders = builders |
+ self.buildbot = buildbot |
def need_full(self): |
result = self._need_full |
self._need_full = False |
return result |
- def step_quality(self, builder, revision, step): |
- """Returns if a step is known to have passed at least one time, in the |
- closed revision. |
- |
- Warning: A step index is not comparable across builders since each builder |
- has different steps in a different order. |
- """ |
- if (revision not in self.patched_builds[builder] and |
- revision not in self.clean_builds[builder]): |
- return None |
- return ( |
- (revision in self.patched_builds[builder] and |
- self.patched_builds[builder][revision][step]) or |
- (revision in self.clean_builds[builder] and |
- self.clean_builds[builder][revision][step])) |
- |
- def revision_quality(self, revision): |
- """Returns True if a revision succeeded at least one time on at least one |
- builder. |
- """ |
- return reduce(or_3_way, |
- (self.revision_quality_builder(b, revision) |
- for b in self.patched_builds)) |
- |
- def revision_quality_builder(self, builder, revision): |
- """Returns if a revision succeeded at least one time. |
- |
- Warning: A step index is not comparable across builders since each builder |
- has different steps in a different order. |
- """ |
- if (revision not in self.patched_builds[builder] and |
- revision not in self.clean_builds[builder]): |
+ def revision_quality_builder_steps(self, builder, revision): |
+ steps = None |
+ nb_builds = 0 |
+ for build in self.buildbot.builders[builder].builds.cached_children: |
+ if build.revision != revision: |
+ continue |
+ nb_builds += 1 |
+ assert not steps or len(steps) == len(build.steps) |
+ if not steps or len(steps) != len(build.steps): |
+ # If the number of steps changed after a master restart, we need to |
+ # ditch the previous steps. |
+ # One workaround is to key by name. |
+ steps = [None] * len(build.steps) |
+ for step in build.steps: |
+ steps[step.number] = or_3_way( |
+ steps[step.number], step.simplified_result) |
+ return steps, nb_builds |
+ |
+ def last_good_revision_builder(self, builder): |
+ """Returns LKGR for this builder.""" |
+ state = {} |
+ for build in self.buildbot.builders[builder].builds.cached_children: |
+ state.setdefault(build.revision, [None] * len(build.steps)) |
+ for step in build.steps: |
+ state[build.revision][step.number] = or_3_way( |
+ state[build.revision][step.number], |
+ step.simplified_result) |
+ |
+ revisions = [ |
+ revision for revision in sorted(state) |
+ if all(v in (True, None) for v in state[revision]) |
+ ] |
+ if not revisions: |
return None |
- bad_steps = [] |
- for i, value in enumerate(self.patched_builds[builder].get(revision, [])): |
- if value is False: |
- bad_steps.append(i) |
- for i, value in enumerate(self.clean_builds[builder].get(revision, [])): |
- if value is False: |
- bad_steps.append(i) |
- if value and i in bad_steps: |
- bad_steps.remove(i) |
- return not bad_steps |
- |
- def seen_revisions(self): |
- """Returns all revisions that returned some status.""" |
- revisions = set() |
- for builder in self.patched_builds: |
- revisions |= set(self.patched_builds[builder].keys()) |
- revisions |= set(self.clean_builds[builder].keys()) |
- return sorted(revisions) |
- |
- def good_revisions(self): |
- """Returns all revisions that succeeded on all builders.""" |
- for revision in self.seen_revisions(): |
- if self.revision_quality(revision): |
- yield revision |
- |
- def bad_revisions(self): |
- """Returns all revisions that never succeeded on any builder.""" |
- for revision in self.seen_revisions(): |
- if self.revision_quality(revision) is False: |
- yield revision |
- |
- def update(self, buildbot): |
- """Updates the internal db.""" |
- for builder in self.clean_builds: |
- # Only access builds already cached. |
- for build in buildbot.builders[builder].builds.cached_children: |
- if build.data['sourceStamp'].get('hasPatch', False): |
- b = self.patched_builds[builder] |
- else: |
- b = self.clean_builds[builder] |
- new_values = [s.simplified_result for s in build.steps] |
- if build.revision not in b: |
- b[build.revision] = new_values |
- else: |
- len_b = len(b[build.revision]) |
- len_n = len(new_values) |
- new_length = max(len_b, len_n) |
- b[build.revision].extend([None] * (new_length - len_b)) |
- new_values.extend([None] * (new_length - len_n)) |
- b[build.revision] = [ |
- or_3_way(old_value, new_values[i]) |
- for i, old_value in enumerate(b[build.revision]) |
- ] |
- |
- for builds in self.patched_builds.itervalues(): |
- while len(builds) > self.max_cache: |
- builds.popitem(builds.keyOrders[0]) |
- for builds in self.clean_builds.itervalues(): |
- while len(builds) > self.max_cache: |
- builds.popitem(builds.keyOrders[0]) |
+ return revisions[-1] |
class TryRunner(base.Verifier): |
@@ -219,7 +164,8 @@ class TryRunner(base.Verifier): |
# Only updates a job status once every 60 seconds. |
update_latency = 60 |
- def __init__(self, try_server_url, commit_user, builders, tests, extra_flags): |
+ def __init__(self, try_server_url, commit_user, builders, tests, extra_flags, |
+ lkgr): |
super(TryRunner, self).__init__() |
self.commit_user = commit_user |
self.try_server_url = try_server_url |
@@ -227,8 +173,9 @@ class TryRunner(base.Verifier): |
self.tests = tests |
self.extra_flags = extra_flags or [] |
self.status = buildbot_json.Buildbot(self.try_server_url) |
- self.step_db = StepDb(builders) |
+ self.step_db = StepDb(self.builders, self.status) |
self.last_update = time.time() - self.update_latency |
+ self.lkgr = lkgr |
def verify(self, pending, revision): |
"""Sends a try job to the try server and returns a TryJob list.""" |
@@ -258,7 +205,6 @@ class TryRunner(base.Verifier): |
self.last_update = time.time() |
self._reset_cache(queue) |
- self.step_db.update(self.status) |
# Do the actual processing to update the TryJob status. |
for pending, jobs in self.loop(queue, TryJobs, True): |
@@ -328,7 +274,7 @@ class TryRunner(base.Verifier): |
# First determine what data is needed. |
builds_to_cache = {} |
if self.step_db.need_full(): |
- logging.info('Fetching all try jobs status because of good_revisions') |
+ logging.info('Fetching all try jobs status to fetch good revisions') |
builders_to_cache = self.builders |
else: |
builders_to_cache = set() |
@@ -361,8 +307,8 @@ class TryRunner(base.Verifier): |
def _send_job(self, pending, revision, clobber, builders, tests=None): |
"""Sends a try job.""" |
# TODO(maruel): If revision is in self.bad_revisions[builder], choose |
- # max(self.good_revisions[builder]) ? That can't easily be done since the |
- # patch was already applied. |
+ # self.last_good_revision_builder(builder) ? That can't easily be done since |
+ # the patch was already applied. |
builders = builders or self.builders |
tests = tests or self.tests |
cmd = [ |
@@ -430,6 +376,30 @@ class TryRunner(base.Verifier): |
return '%s/buildstatus?builder=%s&number=%s' % ( |
self.try_server_url.rstrip('/'), job.builder, job.build) |
+ def _error_msg(self, name, job, failed_steps): |
+ """Constructs the error message.""" |
+ def steps_to_str(steps): |
+ if len(steps) > 1: |
+ return 'steps "%s"' % ', '.join(steps) |
+ else: |
+ return 'step "%s"' % steps[0] |
+ |
+ msg = u'Try job failure for %s on %s for %s' % ( |
+ name, job.builder, steps_to_str(failed_steps)) |
+ if job.clobber: |
+ msg += ' (clobber build)' |
+ msg += '.' |
+ if job.failed_steps: |
+ msg += u'\nIt\'s a second try, previously, %s failed.' % ( |
+ steps_to_str(job.failed_steps)) |
+ msg += '\n%s' % self._build_status_url(job) |
+ logging.info(msg) |
+ return msg |
+ |
+ def get_lkgr(self, builder): |
+ """Caches builds for a builder so lkgr is more useful.""" |
+ return max(self.step_db.last_good_revision_builder(builder), self.lkgr()) |
+ |
def _handle_try_job(self, pending, jobs, job, build): |
"""Determines if the try job is a good signal to commit the patch.""" |
if build.simplified_result is None: |
@@ -438,6 +408,7 @@ class TryRunner(base.Verifier): |
assert job.result is None |
assert job.build is not None |
job.result = build.result |
+ # Warning: This code assumes that steps do not abort build on failure. |
failed_steps = [ |
step.name for step in build.steps if step.simplified_result is False] |
if job.get_state() != base.FAILED: |
@@ -449,49 +420,60 @@ class TryRunner(base.Verifier): |
self._build_status_url(job))) |
return |
- assert failed_steps |
- msg = (u'Try job failure for %s on %s for step%s %s:\n%s' % ( |
- pending.pending_name(), |
- job.builder, |
- 's' if len(failed_steps) > 1 else '', |
- ', '.join(failed_steps), |
- self._build_status_url(job))) |
- logging.info(msg) |
- |
- # Special case update and compile. |
- if 'update' in job.failed_steps: |
- logging.debug('update is always a major failure') |
- jobs.error_message = msg |
+ msg = self._error_msg(pending.pending_name(), job, failed_steps) |
+ steps, _ = self.step_db.revision_quality_builder_steps( |
+ job.builder, int(job.revision)) |
+ quality = steps_quality(steps) |
+ |
+ def retry(msg2, **kwargs): |
+ """Retry a try job. Will use LKGR if quality is bad.""" |
+ if not quality: |
+ lkgr = self.get_lkgr(job.builder) |
+ if lkgr is None: |
+ logging.error('lkgr should never be None.') |
+ fail('Couldn\'t find a good revision, aborting.') |
+ return |
+ job.revision = lkgr |
+ logging.info( |
+ 'Retrying %s on %s, %s; rev=%s; %s' % |
+ (pending.pending_name(), job.builder, kwargs, job.revision, msg2)) |
job.failed_steps = failed_steps |
- return |
- if failed_steps == ['compile'] and not job.clobber: |
- logging.info('Trying again with clobber') |
- # Note this would reset flaky if tests if there has been. This is fine |
- # since a slave could be broken. |
+ self._send_jobs(pending, [job], **kwargs) |
+ |
+ def fail(msg2): |
+ jobs.error_message = msg + msg2 |
+ logging.info(jobs.error_message) |
job.failed_steps = failed_steps |
- job.clobber = True |
- self._send_jobs(pending, [job]) |
- return |
- # Look at the quality of the revision on this builder. |
- # TODO(maruel): We should record the number of builds that were done on this |
- # revision? One or 2 builds don't give much signal. |
- quality = self.step_db.revision_quality_builder( |
- job.builder, int(job.revision)) |
+ if 'update' in failed_steps: |
+ # Look at update quality specifically since it's a special step. |
+      # Do not take into account nb_builds == 1. |
+ if not quality and not steps[build.steps['update'].number]: |
+ # 'update' never passed. |
+ return retry('update has no quality') |
+ |
+ return fail( |
+ '\n\nStep "update" is always a major failure.\n' |
+ 'Look at the try server FAQ for more details.') |
+ |
+ if 'compile' in failed_steps: |
+ if not job.clobber: |
+      # Note: this resets a previous test failure if there was one on the |
+      # previous try. This is fine since a slave could be broken. |
+ job.clobber = True |
+ return retry('retry compile with clobber') |
+ |
+ return fail('') |
if quality: |
if job.failed_steps: |
- logging.info('It\'s a second retry for %s on %s, abort' % ( |
- pending.pending_name(), job.builder)) |
- jobs.error_message = msg |
- job.failed_steps = failed_steps |
- else: |
- logging.info( |
- 'Retrying %s on %s' % (pending.pending_name(), job.builder)) |
- job.failed_steps = failed_steps |
- self._send_jobs(pending, [job], tests=job.failed_steps) |
- return |
+ # The job had already failed. |
+ return fail('') |
+ |
+ return retry('Quality but first try', tests=failed_steps) |
- # TODO(maruel): Implement better auto-retry. |
- jobs.error_message = msg |
- job.failed_steps = failed_steps |
+ # TODO(maruel): It would make sense to do a clobber build to see if the |
+ # revision is indeed broken, since this algorithm assumes that the try |
+ # server is continuously used for recent revisions! |
+ # The revision looks like it's broken, retry with lkgr instead. |
+ return retry('No quality, no idea', tests=failed_steps) |