appengine/findit/waterfall/detect_first_failure_pipeline.py - Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure.

Unified Diff: appengine/findit/waterfall/detect_first_failure_pipeline.py

Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Address comments Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: appengine/findit/waterfall/detect_first_failure_pipeline.py

diff --git a/appengine/findit/waterfall/detect_first_failure_pipeline.py b/appengine/findit/waterfall/detect_first_failure_pipeline.py

new file mode 100644

index 0000000000000000000000000000000000000000..fa62e606811981ae02989ac620ac26d9d0b56d92

--- /dev/null

+++ b/appengine/findit/waterfall/detect_first_failure_pipeline.py

@@ -0,0 +1,244 @@

+# Use of this source code is governed by a BSD-style license that can be

+# found in the LICENSE file.

+import collections

+from datetime import datetime

+import logging

+import random

+import time

+from google.appengine.api import memcache

+from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline

+from common.http_client_appengine import HttpClientAppengine as HttpClient

+from model.build import Build

+from model.build_analysis import BuildAnalysis

+from waterfall import buildbot

+from waterfall.base_pipeline import BasePipeline

+from waterfall import lock_util

+_MAX_BUILDS_TO_CHECK = 20

+class DetectFirstFailurePipeline(BasePipeline):

+ """ A pipeline to detect first failure of each step.

+ TODO(stgao): do test-level detection for gtest.

+ Input:

+ master_name

+ builder_name

+ build_number

+ Output:

+ A json like below:

+ {

+ "master_name": "chromium.gpu",

+ "builder_name": "GPU Linux Builder"

+ "build_number": 25410,

+ "failed": true,

+ "failed_steps": {

+ "compile": {

+ "last_pass": 25408,

+ "current_failure": 25410,

+ "first_failure": 25409

+ }

+ },

+ "builds": {

+ "25408": {

+ "chromium_revision": "474ab324d17d2cd198d3fb067cabc10a775a8df7"

+ "blame_list": [

+ "474ab324d17d2cd198d3fb067cabc10a775a8df7"

+ ],

+ },

+ "25409": {

+ "chromium_revision": "33c6f11de20c5b229e102c51237d96b2d2f1be04"

+ "blame_list": [

+ "9d5ebc5eb14fc4b3823f6cfd341da023f71f49dd",

+ ...

+ ],

+ },

+ "25410": {

+ "chromium_revision": "4bffcd598dd89e0016208ce9312a1f477ff105d1"

+ "blame_list": [

+ "b98e0b320d39a323c81cc0542e6250349183a4df",

+ ...

+ ],

+ }

+ """

+ HTTP_CLIENT = HttpClient()

+ def _BuildDataNeedUpdating(self, build):

+ return (not build.data or (not build.completed and

+ (datetime.utcnow() - build.last_crawled_time).total_seconds() >= 300))

+ def _DownloadBuildData(self, master_name, builder_name, build_number):

+ """Downloads build data and returns a Build instance."""

+ build = Build.GetBuild(master_name, builder_name, build_number)

+ if not build:

+ build = Build.CreateBuild(master_name, builder_name, build_number)

+ # Cache the data to avoid pulling from master again.

+ if self._BuildDataNeedUpdating(build):

+ if not lock_util.WaitUntilDownloadAllowed(

+ master_name): # pragma: no cover

+ raise pipeline.Retry('Too many download from %s' % master_name)

+ build.data = buildbot.GetBuildData(

+ build.master_name, build.builder_name, build.build_number,

+ self.HTTP_CLIENT)

+ build.last_crawled_time = datetime.utcnow()

+ build.put()

+ return build

+ def _ExtractBuildInfo(self, master_name, builder_name, build_number):

+ """Returns a BuildInfo instance for the specified build."""

+ build = self._DownloadBuildData(master_name, builder_name, build_number)

+ if not build.data: # pragma: no cover

+ return None

+ build_info = buildbot.ExtractBuildInfo(

+ master_name, builder_name, build_number, build.data)

+ if not build.completed:

+ build.start_time = build_info.build_start_time

+ build.completed = build_info.completed

+ build.result = build_info.result

+ build.put()

+ analysis = BuildAnalysis.GetBuildAnalysis(

+ master_name, builder_name, build_number)

+ if analysis and not analysis.build_start_time:

+ analysis.build_start_time = build_info.build_start_time

+ analysis.put()

+ return build_info

+ def _SaveBlamelistAndChromiumRevisionIntoDict(self, build_info, builds):

+ """

+ Args:

+ build_info (BuildInfo): a BuildInfo instance which contains blame list and

+ chromium revision.

+ builds (dict): to which the blame list and chromium revision is saved. It

+ will be updated and looks like:

+ {

+ 555 : {

+ 'chromium_revision': 'a_git_hash',

+ 'blame_list': ['git_hash1', 'git_hash2'],

+ },

+ }

+ """

+ builds[build_info.build_number] = {

+ 'chromium_revision': build_info.chromium_revision,

+ 'blame_list': build_info.blame_list

+ }

+ def _CreateADictOfFailedSteps(self, build_info):

+ """ Returns a dict with build number for failed steps.

+ Args:

+ failed_steps (list): a list of failed steps.

+ Returns:

+ A dict like this:

+ {

+ 'step_name': {

+ 'current_failure': 555,

+ 'first_failure': 553,

+ },

+ }

+ """

+ failed_steps = dict()

+ for step_name in build_info.failed_steps:

+ failed_steps[step_name] = {

+ 'current_failure': build_info.build_number,

+ 'first_failure': build_info.build_number,

+ }

+ return failed_steps

+ def _CheckForFirstKnownFailure(self, master_name, builder_name, build_number,

+ failed_steps, builds):

+ """Checks for first known failures of the given failed steps.

+ Args:

+ master_name (str): master of the failed build.

+ builder_name (str): builder of the failed build.

+ build_number (int): builder number of the current failed build.

+ failed_steps (dict): the failed steps of the current failed build. It will

+ be updated with build numbers for 'first_failure' and 'last_pass' of

+ each failed step.

+ builds (dict): a dict to save blame list and chromium revision.

+ """

+ # Look back for first known failures.

+ for i in range(_MAX_BUILDS_TO_CHECK): # limit not hit - pragma: no cover

+ build_info = self._ExtractBuildInfo(

+ master_name, builder_name, build_number - i - 1)

+ if not build_info: # pragma: no cover

+ # Failed to extract the build information, bail out.

+ return

+ self._SaveBlamelistAndChromiumRevisionIntoDict(build_info, builds)

+ if build_info.result == buildbot.SUCCESS:

+ for step_name in failed_steps:

+ if 'last_pass' not in failed_steps[step_name]:

+ failed_steps[step_name]['last_pass'] = build_info.build_number

+ # All steps passed, so stop looking back.

+ return

+ else:

+ # If a step is not run due to some bot exception, we are not sure

+ # whether the step could pass or not. So we only check failed/passed

+ # steps here.

+ for step_name in build_info.failed_steps:

+ if step_name in failed_steps:

+ failed_steps[step_name]['first_failure'] = build_info.build_number

+ for step_name in failed_steps:

+ if step_name in build_info.passed_steps:

+ failed_steps[step_name]['last_pass'] = build_info.build_number

+ if all('last_pass' in step_info for step_info in failed_steps.values()):

+ # All failed steps passed in this build cycle.

+ return

+ # Arguments number differs from overridden method - pylint: disable=W0221

+ def run(self, master_name, builder_name, build_number):

+ build_info = self._ExtractBuildInfo(master_name, builder_name, build_number)

+ if not build_info: # pragma: no cover

+ raise pipeline.Retry('Failed to extract build info.')

+ failure_info = {

+ 'failed': True,

+ 'master_name': master_name,

+ 'builder_name': builder_name,

+ 'build_number': build_number

+ }

+ if (build_info.result == buildbot.SUCCESS or

+ not build_info.failed_steps):

+ failure_info['failed'] = False

+ return failure_info

+ builds = dict()

+ self._SaveBlamelistAndChromiumRevisionIntoDict(build_info, builds)

+ failed_steps = self._CreateADictOfFailedSteps(build_info)

+ self._CheckForFirstKnownFailure(

+ master_name, builder_name, build_number, failed_steps, builds)

+ failure_info['builds'] = builds

+ failure_info['failed_steps'] = failed_steps

+ return failure_info

« no previous file with comments | « appengine/findit/waterfall/build_failure_analysis_pipelines.py ('k') | appengine/findit/waterfall/lock_util.py » ('j') | no next file with comments »