Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(64)

Unified Diff: commit-queue/verification/try_server.py

Issue 135363007: Delete public commit queue to avoid confusion after move to internal repo (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « commit-queue/verification/try_job_steps.py ('k') | commit-queue/workdir/README » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: commit-queue/verification/try_server.py
===================================================================
--- commit-queue/verification/try_server.py (revision 249146)
+++ commit-queue/verification/try_server.py (working copy)
@@ -1,728 +0,0 @@
-# coding=utf8
-# Copyright (c) 2012 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-"""Sends patches to the Try server and reads back results.
-
-- TryJobs contains TryJob, one per try job on a builder.
-- TryRunnerBase contains the common logic to send try jobs and responds to the
- try job results.
-- TryRunnerSvn uses svn plus /json polling on the try server for status updates.
-"""
-
-import logging
-import os
-import re
-import time
-import urllib2
-
-import find_depot_tools # pylint: disable=W0611
-import presubmit_support
-import trychange
-
-import buildbot_json
-import model
-from verification import base
-
-
# We don't want to have trychange use gcl so block it.
trychange.gcl = None
# Hack out trychange logging.info(): route trychange through its own named
# logger and suppress anything below WARNING so routine sends stay quiet.
trychange.logging = logging.getLogger('trychange')
trychange.logging.setLevel(logging.WARNING)
-
-
def or_3_way(a, b):
  """Returns highest value, where True > False > None.

  Merges two tri-state step results: True means the step passed at least
  once, False means it only ever failed, None means it never ran.

  Args:
    a: True, False or None.
    b: True, False or None.

  Returns:
    The higher of the two values under the ordering True > False > None.
  """
  # The previous implementation used `a or b`, which is asymmetric:
  # or_3_way(False, None) evaluated to None instead of False, contradicting
  # the documented ordering. Check each rank explicitly instead.
  if True in (a, b):
    return True
  if False in (a, b):
    return False
  return None
-
-
def parse_gclient_rev(rev):
  """Returns the absolute number of a gclient revision.

  It strips off the solution prefix, e.g. 'src@12345' -> '12345'. None is
  passed through unchanged; any other value is coerced to a string first.
  """
  if rev is None:
    return None
  text = str(rev)
  # Keep only what follows the last '@'; a bare revision has no prefix.
  return text.rpartition('@')[2] if '@' in text else text
-
-
def unambiguous_revision(checkout, revision):
  """Returns if a revision is unambiguous for the checkout.

  HEAD, date or branch name are ambiguous.
  """
  # Inline of parse_gclient_rev(): strip the solution prefix, if any.
  rev = None if revision is None else str(revision).split('@')[-1]
  if not rev:
    return False
  checkout_kind = checkout.__class__.__name__
  if 'Svn' in checkout_kind and rev.isdigit():
    # Numeric revisions pin a single commit on svn checkouts.
    # GitSvn should accept revision numbers?
    return True
  # Lowercase hex strings of 5-20 chars are treated as git hashes.
  return bool('Git' in checkout_kind and re.match(r'^[a-f0-9]{5,20}$', rev))
-
-
class TryJob(model.PersistentMixIn):
  """Represents a try job for a pending commit.

  This data can be regenerated by parsing all the try job names but it is a bit
  hard on the try server.

  The class-level assignments below are type declarations consumed by
  model.PersistentMixIn; a (None, type) tuple presumably marks the field as
  optional -- TODO confirm against model.py.

  TODO(maruel): Should use __getstate__(), __setstate__() and __reduce__().
  """
  # Builder name on the try server.
  builder = unicode
  # Build number on that builder, once the job has been seen to start.
  build = (None, int)
  # Revision the job was requested against.
  revision = (None, int)
  # Raw buildbot result code; None while the build has not completed.
  result = (None, int)
  # time.time() when the job was (re)sent; used to detect lost jobs.
  sent = float
  # Names of the steps that failed on the previous attempt, if any.
  failed_steps = list
  # True when the job was requested as a clobber (clean) build.
  clobber = bool
  # Job name as sent to the try server; retries get a suffix appended.
  name = (None, unicode)
  # Number of retries for this configuration.
  tries = int
  # Test names requested on this builder.
  tests = list

  def __init__(self, **kwargs):
    # Default 'sent' to now so a freshly created job is not immediately
    # considered lost by the delay check in the runner.
    kwargs.setdefault('sent', time.time())
    super(TryJob, self).__init__(**kwargs)

  def get_state(self):
    """Maps the raw buildbot result code to a base.* verification state."""
    if self.result in (
        buildbot_json.SUCCESS, buildbot_json.WARNINGS, buildbot_json.SKIPPED):
      return base.SUCCEEDED
    elif self.result in (
        buildbot_json.FAILURE, buildbot_json.EXCEPTION, buildbot_json.RETRY):
      return base.FAILED
    else:
      # No result yet: the build is pending or still running.
      assert self.result == None
      return base.PROCESSING
-
-
class TryJobs(base.IVerifierStatus):
  """A set of try jobs that were sent for a specific patch."""
  # An array of TryJob objects.
  try_jobs = list
  # When NOTRY=true is specified.
  skipped = bool

  def get_state(self):
    """Aggregates all jobs into a single verification state."""
    if self.skipped:
      # NOTRY=true: the patch is considered verified without try jobs.
      return base.SUCCEEDED
    if not self.try_jobs:
      # No jobs sent yet.
      return base.PROCESSING
    states = set(i.get_state() for i in self.try_jobs)
    assert states.issubset(base.VALID_STATES)
    # max() relies on the numeric ordering of the base state constants;
    # presumably the most blocking state has the highest value -- TODO
    # confirm against base.VALID_STATES.
    return max(states)

  def why_not(self):
    # This verifier provides no extra explanation for its state.
    pass
-
-
def steps_quality(steps):
  """Returns the aggregate quality of a list of tri-state step results.

  Returns None when there is no data at all, True when every step either
  passed at least once (True) or never ran (None), False otherwise.
  """
  if not steps:
    return None
  for outcome in steps:
    if outcome not in (True, None):
      return False
  return True
-
-
class StepDb(object):
  """Calculate statistics about all steps for each revision."""
  # NOTE(review): declared but never read in this file; presumably a cap on
  # the number of cached builds -- confirm before removing.
  max_cache = 200

  def __init__(self, builders, buildbot, checkout):
    # When True, the next cache refresh should fetch all builders instead of
    # only the ones with jobs in flight; see need_full().
    self._need_full = True
    self.builders = builders
    self.buildbot = buildbot
    self.checkout = checkout

  def need_full(self):
    """Returns True exactly once, on the first call; False afterwards."""
    result = self._need_full
    self._need_full = False
    return result

  def revision_quality_builder_steps(self, builder, revision):
    """Calculates the occurrence of a successful step execution, for a specific
    builder, for builds on a specific revision.

    The return value is a tuple of two elements:
    1. array of True/False/None, one value per step in a build. True means the
       step passed at least once. False means the step always failed. None
       means the step never ran for this revision on this builder.
    2. Number of builds that ran on this builder at this revision.
    """
    revision = str(revision)
    steps = None
    nb_builds = 0
    for build in self.buildbot.builders[builder].builds.cached_children:
      if parse_gclient_rev(build.revision) != revision:
        continue
      nb_builds += 1
      if not (not steps or len(steps) == len(build.steps)):
        # Step count changed mid-scan (e.g. after a master restart); skip
        # this build rather than merging incompatible step lists.
        logging.warn('Invalid build %s' % build)
        continue
      if not steps or len(steps) != len(build.steps):
        # If the number of steps changed after a master restart, we need to
        # ditch the previous steps.
        # One workaround is to key by name but it's not worth the effort here,
        # the worst case is that previous builds that could be considered good
        # will be ignored, making it impossible to determine a lkgr.
        # NOTE(review): given the guard above, this branch can only trigger
        # while steps is still None.
        steps = [None] * len(build.steps)
      for step in build.steps:
        # Merge tri-state results: passed-at-least-once wins.
        steps[step.number] = or_3_way(
            steps[step.number], step.simplified_result)
    return steps, nb_builds

  def last_good_revision_builder(self, builder):
    """Returns LKGR for this builder or None if no revision was found.

    For a single revision, for each step, make sure step either passed at least
    once or never ran.

    For example, if build 1 has [True, True, False, None] and build 2 has [True,
    False, True, None], the revision is known to be good since each step run
    either succeeded one time or never ran.
    """
    # Maps revision string -> per-step merged tri-state results.
    state = {}
    for build in self.buildbot.builders[builder].builds.cached_children:
      if not unambiguous_revision(self.checkout, build.revision):
        # Ignore all builds that doesn't use revision numbers. It could be
        # instead svn date format {2011-01-30}, 'HEAD', 'BASE', etc.
        continue
      build_rev = parse_gclient_rev(build.revision)
      state.setdefault(build_rev, [None] * len(build.steps))
      for step in build.steps:
        if len(state[build_rev]) <= step.number:
          # The step list grew since this revision was first seen; ignore
          # the extra steps rather than raising IndexError.
          continue
        state[build_rev][step.number] = or_3_way(
            state[build_rev][step.number],
            step.simplified_result)

    # Keep only revisions where every step either passed at least once or
    # never ran, sorted numerically; the highest one is the LKGR.
    revisions = sorted(
        int(revision) for revision in state
        if (all(v in (True, None) for v in state[revision])))
    if not revisions:
      return None
    return revisions[-1]
-
-
class TryRunnerBase(base.VerifierCheckout):
  """Stateless communication with a try server.

  Sends try jobs and reads try job status.

  Analysis goes as following:
  - compile step is not flaky. compile.py already takes care of most flakiness
    and clobber build is done by default. If compile step fails, try again with
    clobber=True
  - test steps are flaky and can be retried as necessary.
  """
  name = 'try server'

  # A try job sent this long ago and that hasn't started yet is deemed to be
  # lost.
  lost_try_job_delay = 15*60

  # Only updates a job status once every 60 seconds.
  update_latency = 60

  def __init__(
      self, context_obj, try_server_url, commit_user,
      builders_and_tests, ignored_steps, solution):
    """Args:
      context_obj: shared checkout/rietveld context for the verifier.
      try_server_url: base URL of the try server.
      commit_user: email used as author of try jobs; also used to match
          builds back to this queue.
      builders_and_tests: dict mapping builder name -> list of test names.
      ignored_steps: step names whose failure never fails a job.
      solution: gclient solution name, prefixed to revisions.
    """
    super(TryRunnerBase, self).__init__(context_obj)
    self.commit_user = commit_user
    self.try_server_url = try_server_url
    self.builders_and_tests = builders_and_tests
    self.ignored_steps = set(ignored_steps)
    # Backdated so the very first update_status() call is not throttled.
    self.last_update = time.time() - self.update_latency
    self.solution = solution

  def verify(self, pending):
    """Sends a try job to the try server and returns a TryJob list."""
    jobs = pending.verifications.setdefault(self.name, TryJobs())
    if jobs.try_jobs:
      logging.warning(
          'Already tried jobs. Let it go. At worst, it\'ll time out soon.')
      return

    jobs.try_jobs = jobs.try_jobs or []
    if self._is_skip_try_job(pending):
      # Do not run try job for it.
      jobs.skipped = True
      return

    # One TryJob per builder, all at the pending patch's revision.
    new_jobs = [
      TryJob(
          builder=builder,
          tests=self.builders_and_tests[builder],
          revision=pending.revision,
          clobber=False)
      for builder in sorted(self.builders_and_tests)
    ]
    jobs.try_jobs.extend(new_jobs)
    self._send_jobs(
        pending,
        new_jobs,
        False,
        self.builders_and_tests,
        unicode(pending.pending_name()))
    # Slightly postpone next check.
    self.last_update = min(
        time.time(), self.last_update + (self.update_latency / 4))

  def update_status(self, queue):
    """Grabs the current status of all try jobs and update self.queue.

    Note: it would be more efficient to be event based.
    """
    if not queue:
      logging.debug('The list is empty, nothing to do')
      return

    # Throttle: skip the refresh when the last one was too recent.
    if time.time() - self.last_update < self.update_latency:
      logging.debug('TS: Throttling updates')
      return
    self.last_update = time.time()

    self._update_statuses(queue)

  def _send_jobs(
      self, pending, jobs, need_prepare, builders_and_tests, job_name):
    """Prepares the TryJobs instance |jobs| to send try jobs to the try server.

    Sending try jobs is deferred to self._send_job().

    Arguments:
    - pending: pending_manager.Pending instance.
    - jobs: List of TryJob instances to be executed.
    - need_prepare: The checkout needs to have the patch applied, e.g. this
      function is called from within update_status().
    - builders_and_tests: dict('builder': ['test1', 'test2']) for try jobs to
      run. Can be self.builders_and_tests or a smaller subset when retrying
      jobs.
    - job_name: Job name to use, may have suffix like "retry".

    Raises:
      base.DiscardPending when a job has been retried too many times.
    """
    for job in jobs:
      job.tries = job.tries or 0
      job.tries += 1
      # Give up after 4 attempts to avoid retrying a flaky test forever.
      if job.tries > 4:
        raise base.DiscardPending(
            pending,
            ('The commit queue went berserk retrying too often for a\n'
             'seemingly flaky test. Builder is %s, revision is %s, job name\n'
             'was %s.') % (job.builder, job.revision, job_name))

    builders = sorted(job.builder for job in jobs)
    assert len(set(builders)) == len(builders)

    # All jobs in a batch must share the same revision and clobber flag
    # since they are sent as a single try request.
    revision = set(job.revision for job in jobs)
    assert len(revision) == 1
    revision = revision.pop()

    clobber = set(job.clobber for job in jobs)
    assert len(clobber) == 1
    clobber = clobber.pop()

    # Reset per-attempt state on each job.
    for job in jobs:
      job.result = None
      job.build = None
      job.name = job_name
      job.tests = builders_and_tests[job.builder]

    if need_prepare:
      self._prepare(pending, revision)
    self._send_job(pending, revision, clobber, builders_and_tests, job_name)
    for builder in builders:
      # Signal a new try job was sent.
      info = {
        'builder': builder,
        # NOTE(review): `job` here is the loop variable leaked from the loop
        # above (the last job), not the job matching `builder`. All jobs
        # share the same clobber value (asserted above) so the result is
        # still correct, but this is fragile.
        'clobber': job.clobber,
        'job_name': job_name,
        'revision': revision,
      }
      self.send_status(pending, info)
    for job in jobs:
      job.sent = time.time()

  def _build_status_url(self, job):
    """Html url for this try job."""
    assert job.build is not None, str(job)
    return '%s/buildstatus?builder=%s&number=%s' % (
        self.try_server_url.rstrip('/'), job.builder, job.build)

  def _error_msg(self, name, job, failed_steps):
    """Constructs the error message."""
    def steps_to_str(steps):
      # Human-readable 'step "a"' / 'steps "a, b"' fragment.
      if len(steps) > 1:
        return 'steps "%s"' % ', '.join(steps)
      elif steps:
        return 'step "%s"' % steps[0]
      else:
        return ''

    msg = u'Try job failure for %s on %s for %s' % (
        name, job.builder, steps_to_str(failed_steps))
    if job.clobber:
      msg += ' (clobber build)'
    msg += '.'
    if job.failed_steps:
      msg += u'\nIt\'s a second try, previously, %s failed.' % (
          steps_to_str(job.failed_steps))
    msg += '\n%s' % self._build_status_url(job)
    logging.info(msg)
    return msg

  def _handle_try_job(self, pending, jobs, job, build):
    """Determines if the try job is a good signal to commit the patch."""
    if build.simplified_result is None:
      # The build hasn't completed yet.
      return
    assert job.result is None
    assert job.build is not None
    job.result = build.result
    # Warning: This code assumes that steps do not abort build on failure.
    failed_steps = list(set(
        step.name for step in build.steps if step.simplified_result is False
        ) - self.ignored_steps)
    # If the failed steps are only ignored steps like update_scripts or
    # cleanup_temp, still consider the job as a success. As such, do not use
    # build.result.
    # NOTE(review): `build.steps.keys` looks like an attribute on
    # buildbot_json's steps collection, not dict.keys() -- confirm.
    if (not failed_steps and
        all(build.steps[s].simplified_result for s in job.tests
            if s in build.steps.keys)):
      job.result = buildbot_json.SUCCESS

    # Signal to the dashboard a try job completed.
    info = {
      'build': build.number,
      'builder': job.builder,
      'duration': build.duration,
      'job_name': job.name,
      'result': job.result,
      'revision': job.revision,
      'url': self._build_status_url(job),
    }
    self.send_status(pending, info)

    if job.get_state() != base.FAILED:
      assert not failed_steps
      logging.info(u'Try job status for %s on %s: %s\n%s' % (
          job.name,
          job.builder,
          job.result,
          self._build_status_url(job)))
      return

    msg = self._error_msg(job.name, job, failed_steps)
    quality = self._get_quality(job.builder, int(job.revision))

    def retry(msg2, tests=None):
      """Retry a try job. Will use LKGR if quality is bad."""
      if not quality:
        lkgr = self.get_lkgr(job.builder)
        if lkgr is None:
          logging.error('lkgr should never be None.')
          fail('Couldn\'t find a good revision, aborting.')
          return
        # Retry on a known-good revision instead of the patch's revision.
        job.revision = lkgr
      logging.info(
          'Retrying %s on %s, %s; rev=%s; %s' %
          (job.name, job.builder, str(tests), job.revision, msg2))
      job.failed_steps = failed_steps
      tests = tests or job.tests
      self._send_jobs(
          pending, [job], True, {job.builder: tests}, u'%s (retry)' % job.name)

    def fail(msg2):
      # Record the failure on the TryJobs aggregate; the queue reads
      # error_message to report why the patch was rejected.
      jobs.error_message = '%s\n%s' % (msg, msg2)
      logging.info(jobs.error_message)
      job.failed_steps = failed_steps

    if 'update' in failed_steps:
      # Look at update quality specifically since it's a special step.
      return fail(
          '\nStep "update" is always a major failure.\n'
          'Look at the try server FAQ for more details.')

    if 'compile' in failed_steps:
      if not job.clobber:
        # Note: this resets previous test failure if there has been on the
        # second previous try. This is fine since a slave could be broken.
        job.clobber = True
        return retry('retry compile with clobber')

      # Compile failed even with a clobber build: give up.
      return fail('')

    if quality:
      if job.failed_steps:
        # The job had already failed.
        return fail('')

      return retry('Quality but first try', failed_steps)

    # TODO(maruel): It would make sense to do a clobber build to see if the
    # revision is indeed broken, since this algorithm assumes that the try
    # server is continuously used for recent revisions!
    # The revision looks like it's broken, retry with lkgr instead.
    return retry('No quality, no idea', failed_steps)

  @staticmethod
  def _is_skip_try_job(pending):
    """Returns True if a description contains NOTRY=true."""
    match = re.search(r'^NOTRY=(.*)$', pending.description, re.MULTILINE)
    return match and match.group(1).lower() == 'true'

  def _prepare(self, pending, revision):
    """Prepares the checkout by applying the patch."""
    raise NotImplementedError()

  def _get_quality(self, builder, revision):
    """Gets quality about a revision job."""
    raise NotImplementedError()

  def get_lkgr(self, builder):
    """Gets the last known good revision."""
    raise NotImplementedError()

  def _send_job(self, pending, revision, clobber, builders_and_tests, job_name):
    """Sends a try job."""
    raise NotImplementedError()

  def _update_statuses(self, queue):
    """Updates TryJob status for all the Pending instances in the queue.

    Calls to this function are throttled.
    """
    raise NotImplementedError()
-
-
class TryRunnerSvn(TryRunnerBase):
  """Uses SVN to send the try job.

  Keeps a database of steps for each revision for each builder that ever passed,
  to know if it is possible for a step to pass. When unsure, it sends an empty
  build for the said revision to determine if the revision is simply broken.

  TODO(maruel): Ask the main server for details? Still doesn't cover well flaky
  tests.
  """
  def __init__(
      self, context_obj, try_server_url, commit_user,
      builders_and_tests, ignored_steps, solution,
      extra_flags, lkgr):
    """Args:
      extra_flags: extra command line flags passed to trychange.
      lkgr: callable returning an externally-provided last known good
          revision; see get_lkgr().
      (other args: see TryRunnerBase.__init__.)
    """
    super(TryRunnerSvn, self).__init__(
        context_obj, try_server_url, commit_user,
        builders_and_tests, ignored_steps, solution)
    # Polls the try server's /json interface for build status.
    self.status = buildbot_json.Buildbot(self.try_server_url)
    self.step_db = StepDb(
        self.builders_and_tests.keys(), self.status, self.context.checkout)
    self.extra_flags = extra_flags or []
    self.lkgr = lkgr

  def _prepare(self, pending, revision):
    """Running from inside update_status(), the patch wasn't applied. Do it now.
    """
    pending.revision = revision
    pending.apply_patch(self.context, True)

  def _get_quality(self, builder, revision):
    """Returns whether every step at |revision| is known able to pass."""
    steps, _ = self.step_db.revision_quality_builder_steps(builder, revision)
    return steps_quality(steps)

  def get_lkgr(self, builder):
    """Returns the higher of the builder's own LKGR and the external one.

    NOTE(review): when last_good_revision_builder() returns None this relies
    on Python 2's None < int ordering in max().
    """
    return max(self.step_db.last_good_revision_builder(builder), self.lkgr())

  def _send_job(self, pending, revision, clobber, builders_and_tests, job_name):
    """Sends a try job."""
    assert revision
    # Command line for trychange.TryChange(); --revision pins the solution.
    cmd = [
        '--no_search',
        '--revision', '%s@%s' % (self.solution, revision),
        '--name', job_name,
        '--user', self.commit_user.split('@', 1)[0],
        '--email', self.commit_user,
        '--rietveld_url', self._patch_url(pending),
        '--issue', str(pending.issue),
        '--patchset', str(pending.patchset)
    ]
    cmd.extend(self.extra_flags)
    for builder in sorted(builders_and_tests):
      cmd.append('--bot')
      tests = builders_and_tests[builder]
      if tests:
        # 'builder:test1,test2' limits the job to the listed tests.
        cmd.append('%s:%s' % (builder, ','.join(tests)))
      else:
        cmd.append(builder)
    if clobber:
      cmd.append('--clobber')
    # TODO(maruel): use GitChange when relevant.
    change = presubmit_support.SvnChange(
        job_name,
        pending.description,
        self.context.checkout.project_path,
        [('M', f) for f in pending.files],
        pending.issue,
        pending.patchset,
        pending.owner)
    prev_dir = os.getcwd()
    try:
      os.chdir(self.context.checkout.project_path)
      trychange.TryChange(
          cmd,
          change,
          swallow_exception=True)
    except SystemExit as e:
      # trychange may raise SystemExit on failure; convert it into a
      # DiscardPending so the queue keeps running.
      logging.error(
          '_send_job(%s, %s, %s, %s, %s) failed!' % (
              pending.pending_name(), revision, clobber, builders_and_tests,
              job_name))
      raise base.DiscardPending(
          pending,
          'Failed to send try job %s: %s' % (job_name, e))
    finally:
      # Always restore the original working directory.
      os.chdir(prev_dir)

  def _reset_cache(self, queue):
    """Resets the cache of self.status and self.step_db so the next requests
    are more efficient.
    """
    self.status.discard()

    # Collect every job still in flight across the whole queue.
    jobs_to_update = []
    for _, jobs in self.loop(queue, TryJobs, True):
      jobs_to_update.extend(
          job for job in jobs.try_jobs if job.get_state() == base.PROCESSING)

    # First determine what data is needed.
    builds_to_cache = {}
    if self.step_db.need_full():
      logging.info('Fetching all try jobs status to fetch good revisions')
      builders_to_cache = self.builders_and_tests.keys()
    else:
      builders_to_cache = set()
      for job in jobs_to_update:
        if job.build is None:
          # Build not found yet: the whole builder must be scanned.
          builders_to_cache.add(job.builder)
        else:
          if job.get_state() == base.PROCESSING:
            builds_to_cache.setdefault(job.builder, []).append(job.build)

    # Simplify testing.
    builders_to_cache = sorted(builders_to_cache)

    # Reduce the number of requests by caching all the needed builders in one
    # shot when some jobs weren't started yet.
    if builders_to_cache:
      self.status.builders.cache_partial(builders_to_cache)

    for builder in builders_to_cache:
      self.status.builders[builder].builds.cache()
      # Filter out jobs that were retrieved.
      if builder in builds_to_cache:
        del builds_to_cache[builder]

    # Cache remaining builds. Sort to make testing simpler.
    for builder, builds in sorted(
        builds_to_cache.iteritems(), key=lambda x: x[0]):
      self.status.builders[builder].builds.cache_partial(builds)

  def _update_statuses(self, queue):
    """Refreshes the status cache then updates every in-flight job."""
    self._reset_cache(queue)
    for pending, jobs in self.loop(queue, TryJobs, True):
      for job in jobs.try_jobs:
        if job.get_state() != base.PROCESSING:
          continue
        self._update_status(pending, jobs, job)

  def _update_status(self, pending, jobs, job):
    """There's one TryJob per builder."""
    # TODO(maruel): There should be differentiation when there's multiple
    # jobs for a single builder.
    build = None
    try:
      if job.build is None:
        # The build was never matched on the try server; search for it.
        build = self._find_job(job)
        if build:
          # Signal a try job was found.
          info = {
            'build': build.number,
            'builder': job.builder,
            'job_name': job.name,
            'revision': job.revision,
            'url': self._build_status_url(job),
          }
          self.send_status(pending, info)
      else:
        try:
          build = self.status.builders[job.builder].builds[job.build]
        except KeyError:
          # May happen when there is a huge backlog and the build is not
          # cached anymore.
          build = None
    except urllib2.HTTPError as e:
      # Try server unreachable or misbehaving; try again on the next pass.
      logging.error(str(e))
      return

    if build is not None:
      self._handle_try_job(pending, jobs, job, build)
    else:
      # A job needs to be sent again if it has been sent more than
      # self.lost_try_job_delay ago.
      builder = self.status.builders[job.builder]
      pending_builds = builder.data.get('pendingBuilds', 0)
      if (time.time() - job.sent) > self.lost_try_job_delay:
        if pending_builds:
          job_names = [
            data.get('reason', '') for data in builder.pending_builds.data
          ]
          if job.name in job_names:
            # It's pending, move on.
            return

        # The job went to /dev/null. For example, the master may have
        # restarted, the svn server may have a fluke, network may have had a
        # short downtime, etc. Delete the previous job.
        # Resend exactly the same job.
        tests = job.tests
        if not tests:
          if not job.builder in self.builders_and_tests:
            # This means the builder was removed. Skip it.
            logging.warn(
                ('Wanted to retry %s but it\'s not a requirement anymore. '
                 'Ignoring it!') % job.builder)
            job.result = buildbot_json.SKIPPED
            return

          tests = self.builders_and_tests[job.builder]
        self._send_jobs(
            pending,
            [job],
            True,
            {job.builder:tests},
            u'%s (previous was lost)' % job.name)

  def _find_job(self, job):
    """Searches on the try server if the try job for |job| has started."""
    revision = '%s@%s' % (self.solution, job.revision)
    # TODO(maruel): Strip this off.
    job_name = job.name.split(':', 1)[-1]
    logging.debug('Searching for job.reason = %s @ %s' % (job_name, revision))
    for build in self.status.builders[job.builder].builds:
      blame = build.data.get('blame', [])
      logging.debug(
          'Build.reason = %s @ %s; blame: %s' % (
              build.reason, build.revision, ','.join(blame)))
      # Match on the exact job name, the solution-qualified revision, and the
      # commit queue user being the sole blamed author.
      if (build.reason == job_name and
          str(build.revision) == revision and
          len(blame) == 1 and
          blame[0] == self.commit_user):
        # Note the build number to remember it started.
        logging.info('Found build %d for job %s' % (build.number, job_name))
        job.build = build.number
        return build
    return None

  def _patch_url(self, pending):
    """Returns the Rietveld URL of the raw diff for |pending|."""
    return ('%s/download/issue%d_%d.diff' %
            (self.context.rietveld.url, pending.issue, pending.patchset))
« no previous file with comments | « commit-queue/verification/try_job_steps.py ('k') | commit-queue/workdir/README » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698