Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2302)

Unified Diff: appengine/findit/waterfall/build_failure_analysis_pipelines.py

Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Test will come in next patchset. Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/findit/waterfall/build_failure_analysis_pipelines.py
diff --git a/appengine/findit/waterfall/build_failure_analysis_pipelines.py b/appengine/findit/waterfall/build_failure_analysis_pipelines.py
index 91af3608be8b35d6959fd22afd1d4e436faa60f2..28189073db5923634a169fee79b11c6026629e89 100644
--- a/appengine/findit/waterfall/build_failure_analysis_pipelines.py
+++ b/appengine/findit/waterfall/build_failure_analysis_pipelines.py
@@ -2,14 +2,23 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+import collections
+from datetime import datetime
import logging
+import random
+import time
+from google.appengine.api import memcache
from google.appengine.ext import ndb
from pipeline_utils import pipelines
+from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline
+from common.http_client_appengine import HttpClientAppengine as HttpClient
from model.build import Build
+from model.build_analysis import BuildAnalysis
from model.build_analysis_status import BuildAnalysisStatus
+from waterfall import buildbot
# TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed.
@@ -27,52 +36,220 @@ class BasePipeline(pipelines.AppenginePipeline): # pragma: no cover
raise NotImplementedError()
-class BuildFailurePipeline(BasePipeline):
+MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s'
+MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60
+DOWNLOAD_INTERVAL_SECONDS = 5
qyearsley 2014/12/29 19:44:54 If these aren't used in any other modules, you can make them private to this module (prefix with an underscore).
stgao 2015/01/03 01:31:47 Done.
+
+
+def WaitUntilDownloadAllowed(
+ master_name, timeout_seconds=90): # pragma: no cover
+ """Wait until next download from the specified master is allowed.
+
+ Returns:
+ True if download is allowed to proceed.
+ False if download is not allowed until the given timeout occurs.
+ """
qyearsley 2014/12/29 19:44:53 I might put WaitUntilDownloadAllowed below where it is used.
stgao 2015/01/03 01:31:47 In infra repo, it seems the convention is to put helper functions at the top of the module.
+ client = memcache.Client()
+ key = MEMCACHE_MASTER_DOWNLOAD_LOCK % master_name
+
+ deadline = time.time() + timeout_seconds
+ while True:
+ info = client.gets(key)
+ if not info or time.time() - info['time'] >= DOWNLOAD_INTERVAL_SECONDS:
+ new_info = {
+ 'time': time.time()
+ }
+ if not info:
+ success = client.add(
+ key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
+ else:
+ success = client.cas(
+ key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
+
+ if success:
+ return True
+
+ if time.time() > deadline:
+ return False
+
+ logging.info('waiting to download from %s', master_name)
+ time.sleep(DOWNLOAD_INTERVAL_SECONDS + random.random())
+
+
+class DetectFirstFailure(BasePipeline):
qyearsley 2014/12/29 19:44:54 How about the name DetectFirstFailurePipeline? (It would make it clearer that this class is a pipeline.)
stgao 2015/01/03 01:31:47 Good point. Done.
+ HTTP_CLIENT = HttpClient()
+
+ def ExtractBuildInfo(self, master_name, builder_name, build_number):
+ """Return a BuildInfo instance for the specified build."""
qyearsley 2014/12/29 19:44:53 Return -> Returns
stgao 2015/01/03 01:31:47 Done.
+ build = Build.GetBuild(master_name, builder_name, build_number)
+ if not build:
+ build = Build.CreateBuild(master_name, builder_name, build_number)
+
+ # Cache the data to avoid pulling from master again.
+ if (not build.data or (not build.completed and
+ (datetime.utcnow() - build.last_crawled_time).total_seconds >= 60 * 5)):
+ if not WaitUntilDownloadAllowed(master_name):
+ raise pipeline.Retry('Too many download from %s' % master_name)
+
+ build.data = buildbot.GetBuildData(
+ build.master_name, build.builder_name, build.build_number,
+ self.HTTP_CLIENT)
+ build.last_crawled_time = datetime.utcnow()
+
+ if not build.data:
+ return None
+
+ build_info = buildbot.ExtractBuildInfo(
+ master_name, builder_name, build_number, build.data)
+
+ if not build.completed:
+ build.start_time = build_info.build_start_time
+ build.completed = build_info.completed
+ build.result = build_info.result
+ build.put()
+
+ analysis = BuildAnalysis.GetBuildAnalysis(
+ master_name, builder_name, build_number)
+ if analysis and not analysis.build_start_time:
+ analysis.build_start_time = build_info.build_start_time
+
+ return build_info
# Arguments number differs from overridden method - pylint: disable=W0221
def run(self, master_name, builder_name, build_number):
- build = Build.GetBuild(master_name, builder_name, build_number)
+ build_info = self.ExtractBuildInfo(master_name, builder_name, build_number)
- # TODO: implement the logic.
- build.analysis_status = BuildAnalysisStatus.ANALYZED
- build.put()
+ if not build_info:
+ raise pipeline.Retry('Failed to extract build info.')
+
+ failure_info = {
+ 'failed': True,
+ }
+
+ if build_info.result == buildbot.SUCCESS or not build_info.failed_steps:
+ failure_info['failed'] = False
+ return failure_info
+
+ # Save blame list and tested chromium revision for builds.
+ # {
+ # 555 : {
+ # 'chromium_revision': 'a_git_hash',
+ # 'blame_list': ['cl1', 'cl2'],
+ # },
+ # }
qyearsley 2014/12/29 19:44:53 Is this an example of what the `builds` dict below contains? If so, consider making that explicit.
stgao 2015/01/03 01:31:47 Done. Moved to a helper function.
+ builds = collections.defaultdict(dict)
+ def FillBuildInfo(info):
+ builds[info.build_number]['chromium_revision'] = info.chromium_revision
+ builds[info.build_number]['blame_list'] = info.blame_list
+
+ FillBuildInfo(build_info)
+
+ # Save build number for failed steps.
+ # {
+ # 'step_name': {
+ # 'current_failure': 555,
+ # 'first_failure': 553,
+ # 'last_pass': 551,
+ # },
+ # }
+ failed_steps = collections.defaultdict(dict)
+ for step_name in build_info.failed_steps:
+ failed_steps[step_name]['current_failure'] = build_info.build_number
+ failed_steps[step_name]['first_failure'] = build_info.build_number
+
+ # Look back for first known failure. At most, 20 builds.
+ for i in range(20):
qyearsley 2014/12/29 19:44:54 This 20 should probably be a module-level constant
stgao 2015/01/03 01:31:47 Done.
+ build_info = self.ExtractBuildInfo(
+ master_name, builder_name, build_number - i - 1)
+
+ if not build_info:
+ # Failed to extract the build information, bail out.
+ break
+
+ FillBuildInfo(build_info)
+
+ if build_info.result == buildbot.SUCCESS:
+ for step_name in failed_steps:
+ if 'last_pass' not in failed_steps[step_name]:
+ failed_steps[step_name]['last_pass'] = build_info.build_number
+
+ # All steps passed, so stop looking back.
+ break
+ else:
+ # If a step is not run due to some bot exception, we are not sure
+ # whether the step could pass or not. So we only check failed/passed
+ # steps here.
+
+ for step_name in build_info.failed_steps:
+ if step_name in failed_steps:
+ failed_steps[step_name]['first_failure'] = build_info.build_number
+
+ for step_name in failed_steps:
+ if step_name in build_info.passed_steps:
+ failed_steps[step_name]['last_pass'] = build_info.build_number
+
+ failure_info['builds'] = builds
+ failure_info['failed_steps'] = failed_steps
+ return failure_info
qyearsley 2014/12/29 19:44:53 This function is very long, and would probably be easier to follow if split into smaller helper functions.
stgao 2015/01/03 01:31:47 Done. Hope the code is easier to understand.
+
+
+class BuildFailurePipeline(BasePipeline):
+
+ def __init__(self, master_name, builder_name, build_number):
+ super(BuildFailurePipeline, self).__init__(
+ master_name, builder_name, build_number)
+ self.master_name = master_name
+ self.builder_name = builder_name
+ self.build_number = build_number
+
+ def finalized(self):
+ if self.was_aborted:
+ analysis = BuildAnalysis.GetBuildAnalysis(
+ self.master_name, self.builder_name, self.build_number)
+ analysis.status = BuildAnalysisStatus.ERROR
+ analysis.put()
+
+ # Arguments number differs from overridden method - pylint: disable=W0221
qyearsley 2014/12/29 19:44:54 Cool, I didn't know you could add a comment on the same line as a pylint disable directive.
stgao 2015/01/03 01:31:47 Yes, they could fit into one line.
+ def run(self, master_name, builder_name, build_number):
+ yield DetectFirstFailure(master_name, builder_name, build_number)
@ndb.transactional
def NeedANewAnalysis(master_name, builder_name, build_number, force):
- """Check analysis status of a build and decide if a new analysis is needed.
+ """Check status of analysis for the build and decide if a new one is needed.
qyearsley 2014/12/29 19:44:54 Check -> Checks, decide -> decides (third-person singular).
stgao 2015/01/03 01:31:47 Done.
Returns:
(build, need_analysis)
build (Build): the build as specified by the input.
qyearsley 2014/12/29 19:44:53 Make sure to update the docstring to reflect the changed return value (Build -> BuildAnalysis).
stgao 2015/01/03 01:31:47 Oops, done.
need_analysis (bool): True if an analysis is needed, otherwise False.
"""
qyearsley 2014/12/29 19:44:54 Similar to WaitUntilDownloadAllowed, I think NeedANewAnalysis could be made private to this module.
stgao 2015/01/03 01:31:47 I intentionally made this function public for testability.
- build_key = Build.CreateKey(master_name, builder_name, build_number)
- build = build_key.get()
-
- if not build:
- build = Build.CreateBuild(master_name, builder_name, build_number)
- build.analysis_status = BuildAnalysisStatus.PENDING
- build.put()
- return build, True
+ analysis = BuildAnalysis.GetBuildAnalysis(
+ master_name, builder_name, build_number)
+
+ if not analysis:
+ analysis = BuildAnalysis.CreateBuildAnalysis(
+ master_name, builder_name, build_number)
+ analysis.status = BuildAnalysisStatus.PENDING
+ analysis.put()
+ return analysis, True
elif force:
# TODO: avoid concurrent analysis.
- build.Reset()
- build.put()
- return build, True
+ analysis.Reset()
+ analysis.put()
+ return analysis, True
else:
# TODO: support following cases
# 1. Automatically retry if last analysis failed with errors.
# 2. Start another analysis if the build cycle wasn't completed in last
# analysis request.
# 3. Analysis is not complete and no update in the last 5 minutes.
- return build, False
+ return analysis, False
def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force,
queue_name):
"""Schedule an analysis if needed and return the build."""
- build, need_new_analysis = NeedANewAnalysis(
+ analysis, need_new_analysis = NeedANewAnalysis(
master_name, builder_name, build_number, force)
if need_new_analysis:
@@ -85,4 +262,4 @@ def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force,
else: # pragma: no cover
logging.info('Analysis was already triggered or the result is recent.')
- return build
+ return analysis
« appengine/findit/model/build_analysis.py ('K') | « appengine/findit/model/build_analysis.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698