Chromium Code Reviews

Unified Diff: scripts/slave/recipe_modules/auto_bisect/resources/wait_for_any.py

Issue 1339613005: Refactoring scripts that wait for buildbot jobs to complete. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@hax
Patch Set: removing blank line Created 5 years, 3 months ago
Index: scripts/slave/recipe_modules/auto_bisect/resources/wait_for_any.py
diff --git a/scripts/slave/recipe_modules/auto_bisect/resources/wait_for_any.py b/scripts/slave/recipe_modules/auto_bisect/resources/wait_for_any.py
index 27be07da84d6987d9161c656541e9f039f33a708..87ffc57c2d794c78924ef5e67589441fa05659a1 100755
--- a/scripts/slave/recipe_modules/auto_bisect/resources/wait_for_any.py
+++ b/scripts/slave/recipe_modules/auto_bisect/resources/wait_for_any.py
@@ -1,5 +1,4 @@
#!/usr/bin/python
-#
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
@@ -7,113 +6,227 @@
"""Waits for any one job out of a list to complete or a default timeout."""
import json
+import os
import subprocess
import sys
import time
-import urllib2
import check_buildbot
+# Return codes.
+COMPLETED, FAILED, TIMED_OUT, BAD_ARGS = 0, 1, 2, 3
# The following intervals are specified in seconds, are expected to be sent as
# arguments to time.sleep()
-# All URLs are checked in sequence separated by 'short' interval seconds, to
-# prevent possibly getting throttled by whatever endpoint gsutil or urllib are
-# hitting.
-SHORT_INTERVAL = 0.4
+
# If none of the URLs is determined to be ready, we sleep for a 'long'
# interval.
-LONG_INTERVAL = 60
+SLEEP_INTERVAL = 60
# We should check buildbot not more often than every 10 minutes.
-BUILDBOT_CHECK_FREQUENCY = 600
-# If the 'timeout' interval elapses without any URL becoming ready, we fail.
-timeout_interval = 60 * 60
-# Global gsutil path, expected to be set by main.
-gsutil_path = ''
+BUILDBOT_CHECK_INTERVAL = 600
+
next_buildbot_check_due_time = 0
-def _run_gsutil(cmd):
-  # Sleep for a short time between gsutil calls
-  time.sleep(SHORT_INTERVAL)
-  cmd = [gsutil_path] + cmd
-  try:
-    out = subprocess.check_output(cmd)
-    return 0, out
-  except subprocess.CalledProcessError as cpe:
-    return cpe.returncode, cpe.output
+def _print_usage(argv):
+  usage = 'Usage: %s <gsutil path> [--timeout=<seconds>]'
+  print usage % argv[0]
+  print main.__doc__
+  return BAD_ARGS
-def _gs_file_exists(url):
+def _gs_file_exists(gsutil_path, url):
  """Checks that running 'gsutil ls' returns 0 to see if file at url exists."""
-  return _run_gsutil(['ls', url])[0] == 0
+  cmd = [gsutil_path, 'ls', url]
+  error = subprocess.call(cmd, stdout=open(os.devnull, 'wb'))
+  return not error
def _next_buildbot_check_due():
+  """To limit how often we pull the [potentially big] json object from bb."""
  global next_buildbot_check_due_time
  if time.time() > next_buildbot_check_due_time:
-    next_buildbot_check_due_time = time.time() + BUILDBOT_CHECK_FREQUENCY
+    next_buildbot_check_due_time = time.time() + BUILDBOT_CHECK_INTERVAL
+    sys.stderr.write('Checking buildbot for completed/failed jobs\n')
    return True
  return False
-def _check_failed_buildbot_jobs(locations):
-  if not locations:
+def _check_buildbot_jobs(jobs_to_check):
+  if not jobs_to_check:
    return None
  jobs = {}
-  for loc in locations:
-    _, master, builder, job_name = loc.split(':', 3)
+  completed_results = []
+  failed_results = []
+  # Mapping from job names to the original dictionary sent in jobs_to_check
+  entries = {}
+  job_urls = {}
+  for entry in jobs_to_check:
+    master = entry['master']
+    builder = entry['builder']
+    job_name = entry['job_name']
+    # The entries in this list may have multiple jobs for a single builder, and
+    # we want to avoid hitting the builder for each job, since we get the
+    # information for all builds each time.
+    #
+    # To prevent this we are taking this:
+    #   [{'master': 'M', 'builder': 'B', 'job_name': 'J1'},
+    #    {'master': 'M', 'builder': 'B', 'job_name': 'J2'},
+    #    {'master': 'M', 'builder': 'C', 'job_name': 'J3'},
+    #   ]
+    # And building this in the jobs variable:
+    #   {'M': {'B': ['J1', 'J2'], 'C': ['J3']}}
    jobs.setdefault(master, {}).setdefault(builder, []).append(job_name)
+    entries[job_name] = entry
  for master in jobs.keys():
    for builder in jobs[master].keys():
-      if check_buildbot.main(["check_buildbot", master, builder]
-                             + jobs[master][builder]):
-        return 1
-  return 0
+      config = {
+          'master': master,
+          'builder': builder,
+          'job_names': jobs[master][builder],
+      }
+      builder_results = check_buildbot.main(config)
+      completed_results += builder_results.get('completed', [])
+      failed_results += builder_results.get('failed', [])
+      job_urls.update(builder_results.get('job_urls', {}))
+  results = {}
+  if completed_results:
+    results['completed'] = [entries[k] for k in completed_results]
+  if failed_results:
+    results['failed'] = [entries[k] for k in failed_results]
+  for job in results.get('failed', []) + results.get('completed', []):
+    if job['job_name'] in job_urls:
+      job['job_url'] = job_urls[job['job_name']]
+
+  return results
def main(argv):
-  global timeout_interval
+  """Main function of the script.
+
+  The script expects the path to gsutil to be provided on the command line, and
+  a json object containing the details of the jobs to monitor on standard input.
+
+  Each job in the list should be one of the following types:
+  - GS location, which must at least contain:
+    - The "type" key set to the "gs" value.
+    - The "location" key, containing the location ("gs://...") of the GS
+      object to check.
+  - Buildbot job, which must at least contain:
+    - The "type" key set to the "buildbot" value.
+    - The "master" key containing the name of the appropriate master, e.g.
+      "tryserver.chromium.perf".
+    - The "builder" key set to the name of the builder performing the job.
+    - The "job_name" key containing the name of the job to check, e.g.
+      typically a UUID or a hash.
+
+  The script will wait until the first of the following conditions becomes true:
+  - An object exists at one of the GS locations.
+  - One of the buildbot jobs completes as successful.
+  - One of the buildbot jobs fails.
+  - Six days elapse from the invocation of the script. (The exact timeout may
+    be overridden from the command line.)
+
+  The return code will be:
+  0 if a buildbot job succeeds or an object exists at one of the GS locations.
+  1 if a buildbot job fails.
+  2 if the timeout is triggered.
+
+  Additionally, a json object will be written to standard output containing the
+  results of the script.
+
+  Example of expected stdin:
+  cat <<EOF
+  {
+    "jobs": [
+      {
+        "type": "gs",
+        "location": "gs://chrome-perf/some_path/some_object.json"
+      },
+      {
+        "type": "buildbot",
+        "master": "tryserver.chromium.perf",
+        "builder": "linux_perf_bisect",
+        "job_name": "f74fb8e0418d47bfb7d01fad0dd4df06"
+      }
+    ]
+  }
+  EOF
+
+  Examples of results from stdout:
+  cat <<EOF  # Successful result
+  {
+    "completed": [
+      {
+        "type": "buildbot",
+        "master": "tryserver.chromium.perf",
+        "builder": "linux_perf_bisect",
+        "job_name": "f74fb8e0418d47bfb7d01fad0dd4df06"
+      }
+    ]
+  }
+  EOF
+
+  cat <<EOF  # Unsuccessful result
+  {
+    "failed": [
+      {
+        "type": "buildbot",
+        "master": "tryserver.chromium.perf",
+        "builder": "linux_perf_bisect",
+        "job_name": "f74fb8e0418d47bfb7d01fad0dd4df06"
+      }
+    ]
+  }
+  EOF
+  """
+  start_time = time.time()
+  # Default timeout: six days.
+  timeout_interval = 6 * 24 * 60 * 60
  if argv[-1].startswith('--timeout='):
    timeout_interval = int(argv[-1].split('=')[1])
    argv = argv[:-1]
-  if len(argv) < 3:
-    usage = ('Usage: %s <gsutil path> url1 [url2 [url3...]]'
-             ' [--timeout=<seconds>]\n'
-             ' Where urls are either a google storage location for the result '
-             ' file, or a buildbot location of the form '
-             '"bb:<master>:<builderi>:<job_name>".')
-    print usage % argv[0]
-    return 1
-
-  list_of_urls = ', '.join(['<%s>' % url for url in argv[2:]])
-  print 'Waiting for the following urls: ' + list_of_urls
-  global gsutil_path
-  start_time = time.time()
-  gsutil_path = argv[1]
-  urls = argv[2:]
-  while urls:
-    for url in urls:
-      if url.startswith('bb:'):
-        pass
-      elif _gs_file_exists(url):
-        print 'Build finished: ', url
-        return 0
-    if time.time() - start_time > timeout_interval:
-      print "Timed out waiting for: ", urls
-      return 1
-    if _next_buildbot_check_due():
-      failed_job = _check_failed_buildbot_jobs(
-          [url for url in urls if url.startswith('bb:')])
-      if failed_job:
-        return 0
-    time.sleep(LONG_INTERVAL)
-
-
-  print "No jobs to check."
-  return 0
-
+  jobs = json.loads(sys.stdin.read())['jobs']
+  gs_jobs = [job for job in jobs if job['type'] == 'gs']
+  buildbot_jobs = [job for job in jobs if job['type'] == 'buildbot']
+
+  if ((not gs_jobs and not buildbot_jobs) or
+      (gs_jobs and len(argv) < 2)):
+    return _print_usage(argv)
+
+  gsutil_path = argv[1] if gs_jobs else ''
+
+  while time.time() < start_time + timeout_interval:
+    # Checking GS jobs.
+    completed_jobs = []
+    for job in gs_jobs:
+      if _gs_file_exists(gsutil_path, job['location']):
+        completed_jobs.append(job)
+
+    # Checking buildbot jobs.
+    if completed_jobs or _next_buildbot_check_due():
+      # buildbot_results will only contain jobs that have been completed or
+      # failed. All other jobs (scheduled, in progress, etc.) will be ignored.
+      buildbot_results = _check_buildbot_jobs(buildbot_jobs)
+      if buildbot_results:
+        print json.dumps(buildbot_results)
+        if 'completed' in buildbot_results and buildbot_results['completed']:
+          return COMPLETED
+        return FAILED
+
+    if completed_jobs:
+      # This clause is just a fallback. Ideally, when a results file shows up
+      # at a gs location, we'd want to check the buildbot jobs first to find
+      # the url to the job details.
+      print json.dumps({'completed': completed_jobs})
+      return COMPLETED
+    # At this point, no jobs have completed or failed. We write a short message
+    # to stderr to prevent buildbot from killing this process for inactivity.
+    sys.stderr.write('Sleeping.\n')
+    sys.stderr.flush()
+    time.sleep(SLEEP_INTERVAL)
+  return TIMED_OUT
if __name__ == '__main__':
  sys.exit(main(sys.argv))
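
Note on usage: the docstring in this patch defines the new stdin/stdout contract and the return codes. The snippet below is a minimal sketch, not part of the patch, of how a caller might drive the script; the gsutil path, timeout value, and job fields are placeholders lifted from the docstring examples.

import json
import subprocess

# Return codes, mirroring the constants at the top of wait_for_any.py.
COMPLETED, FAILED, TIMED_OUT, BAD_ARGS = 0, 1, 2, 3

# Jobs payload in the stdin format described by main()'s docstring.
payload = json.dumps({
    'jobs': [
        {'type': 'gs',
         'location': 'gs://chrome-perf/some_path/some_object.json'},
        {'type': 'buildbot',
         'master': 'tryserver.chromium.perf',
         'builder': 'linux_perf_bisect',
         'job_name': 'f74fb8e0418d47bfb7d01fad0dd4df06'},
    ]
})

# argv[1] is the gsutil path; --timeout (seconds) overrides the six-day
# default and must be the last argument.
proc = subprocess.Popen(
    ['python', 'wait_for_any.py', '/path/to/gsutil', '--timeout=3600'],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE)
stdout, _ = proc.communicate(payload)

if proc.returncode in (COMPLETED, FAILED):
  # stdout carries a json object with 'completed' and/or 'failed' job lists.
  results = json.loads(stdout)
elif proc.returncode == TIMED_OUT:
  results = None  # Nothing became ready before the timeout elapsed.

From the way _check_buildbot_jobs consumes its result, check_buildbot.main(config) is assumed to return a dict with optional 'completed' and 'failed' lists of job names plus a 'job_urls' mapping; the jobs echoed on stdout are the input job dicts, augmented with a 'job_url' key when one is known.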
