appengine/findit/waterfall/detect_first_failure_pipeline.py - Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure.

Side by Side Diff: appengine/findit/waterfall/detect_first_failure_pipeline.py

Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Address comments Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 # Copyright 2014 The Chromium Authors. All rights reserved.

	2 # Use of this source code is governed by a BSD-style license that can be

	3 # found in the LICENSE file.

	4

	5 import collections

	6 from datetime import datetime

	7 import logging

	8 import random

	9 import time

	10

	11 from google.appengine.api import memcache

	12

	13 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline

	14

	15 from common.http_client_appengine import HttpClientAppengine as HttpClient

	16 from model.build import Build

	17 from model.build_analysis import BuildAnalysis

	18 from waterfall import buildbot

	19 from waterfall.base_pipeline import BasePipeline

	20 from waterfall import lock_util

	21

	22

# Upper bound on how many earlier builds are examined when looking back from
# a failed build for the first known failure of its failed steps.
_MAX_BUILDS_TO_CHECK = 20

	24

	25

	26 class DetectFirstFailurePipeline(BasePipeline):

	27 """ A pipeline to detect first failure of each step.

	28

	29 TODO(stgao): do test-level detection for gtest.

	30

	31 Input:

	32 master_name

	33 builder_name

	34 build_number

	35

	36 Output:

	37 A json like below:

	38 {

	39 "master_name": "chromium.gpu",

	40 "builder_name": "GPU Linux Builder"

	41 "build_number": 25410,

	42 "failed": true,

	43 "failed_steps": {

	44 "compile": {

	45 "last_pass": 25408,

	46 "current_failure": 25410,

	47 "first_failure": 25409

	48 }

	49 },

	50 "builds": {

	51 "25408": {

	52 "chromium_revision": "474ab324d17d2cd198d3fb067cabc10a775a8df7"

	53 "blame_list": [

	54 "474ab324d17d2cd198d3fb067cabc10a775a8df7"

	55 ],

	56 },

	57 "25409": {

	58 "chromium_revision": "33c6f11de20c5b229e102c51237d96b2d2f1be04"

	59 "blame_list": [

	60 "9d5ebc5eb14fc4b3823f6cfd341da023f71f49dd",

	61 ...

	62 ],

	63 },

	64 "25410": {

	65 "chromium_revision": "4bffcd598dd89e0016208ce9312a1f477ff105d1"

	66 "blame_list": [

	67 "b98e0b320d39a323c81cc0542e6250349183a4df",

	68 ...

	69 ],

	70 }

	71 }

	72 }

	73 """

	74

	75 HTTP_CLIENT = HttpClient()

	76

	77 def _BuildDataNeedUpdating(self, build):

	78 return (not build.data or (not build.completed and

	79 (datetime.utcnow() - build.last_crawled_time).total_seconds() >= 300))

	80

	81 def _DownloadBuildData(self, master_name, builder_name, build_number):

	82 """Downloads build data and returns a Build instance."""

	83 build = Build.GetBuild(master_name, builder_name, build_number)

	84 if not build:

	85 build = Build.CreateBuild(master_name, builder_name, build_number)

	86

	87 # Cache the data to avoid pulling from master again.

	88 if self._BuildDataNeedUpdating(build):

	89 if not lock_util.WaitUntilDownloadAllowed(

	90 master_name): # pragma: no cover

	91 raise pipeline.Retry('Too many download from %s' % master_name)

	92

	93 build.data = buildbot.GetBuildData(

	94 build.master_name, build.builder_name, build.build_number,

	95 self.HTTP_CLIENT)

	96 build.last_crawled_time = datetime.utcnow()

	97 build.put()

	98

	99 return build

	100

	101 def _ExtractBuildInfo(self, master_name, builder_name, build_number):

	102 """Returns a BuildInfo instance for the specified build."""

	103 build = self._DownloadBuildData(master_name, builder_name, build_number)

	104 if not build.data: # pragma: no cover

	105 return None

	106

	107 build_info = buildbot.ExtractBuildInfo(

	108 master_name, builder_name, build_number, build.data)

	109

	110 if not build.completed:

	111 build.start_time = build_info.build_start_time

	112 build.completed = build_info.completed

	113 build.result = build_info.result

	114 build.put()

	115

	116 analysis = BuildAnalysis.GetBuildAnalysis(

	117 master_name, builder_name, build_number)

	118 if analysis and not analysis.build_start_time:

	119 analysis.build_start_time = build_info.build_start_time

	120 analysis.put()

	121

	122 return build_info

	123

	124 def _SaveBlamelistAndChromiumRevisionIntoDict(self, build_info, builds):

	125 """

	126 Args:

	127 build_info (BuildInfo): a BuildInfo instance which contains blame list and

	128 chromium revision.

	129 builds (dict): to which the blame list and chromium revision is saved. It

	130 will be updated and looks like:

	131 {

	132 555 : {

	133 'chromium_revision': 'a_git_hash',

	134 'blame_list': ['git_hash1', 'git_hash2'],

	135 },

	136 }

	137 """

	138 builds[build_info.build_number] = {

	139 'chromium_revision': build_info.chromium_revision,

	140 'blame_list': build_info.blame_list

	141 }

	142

	143 def _CreateADictOfFailedSteps(self, build_info):

	144 """ Returns a dict with build number for failed steps.

	145

	146 Args:

	147 failed_steps (list): a list of failed steps.

	148

	149 Returns:

	150 A dict like this:

	151 {

	152 'step_name': {

	153 'current_failure': 555,

	154 'first_failure': 553,

	155 },

	156 }

	157 """

	158 failed_steps = dict()

	159 for step_name in build_info.failed_steps:

	160 failed_steps[step_name] = {

	161 'current_failure': build_info.build_number,

	162 'first_failure': build_info.build_number,

	163 }

	164

	165 return failed_steps

	166

	167 def _CheckForFirstKnownFailure(self, master_name, builder_name, build_number,

	168 failed_steps, builds):

	169 """Checks for first known failures of the given failed steps.

	170

	171 Args:

	172 master_name (str): master of the failed build.

	173 builder_name (str): builder of the failed build.

	174 build_number (int): builder number of the current failed build.

	175 failed_steps (dict): the failed steps of the current failed build. It will

	176 be updated with build numbers for 'first_failure' and 'last_pass' of

	177 each failed step.

	178 builds (dict): a dict to save blame list and chromium revision.

	179 """

	180 # Look back for first known failures.

	181 for i in range(_MAX_BUILDS_TO_CHECK): # limit not hit - pragma: no cover

	182 build_info = self._ExtractBuildInfo(

	183 master_name, builder_name, build_number - i - 1)

	184

	185 if not build_info: # pragma: no cover

	186 # Failed to extract the build information, bail out.

	187 return

	188

	189 self._SaveBlamelistAndChromiumRevisionIntoDict(build_info, builds)

	190

	191 if build_info.result == buildbot.SUCCESS:

	192 for step_name in failed_steps:

	193 if 'last_pass' not in failed_steps[step_name]:

	194 failed_steps[step_name]['last_pass'] = build_info.build_number

	195

	196 # All steps passed, so stop looking back.

	197 return

	198 else:

	199 # If a step is not run due to some bot exception, we are not sure

	200 # whether the step could pass or not. So we only check failed/passed

	201 # steps here.

	202

	203 for step_name in build_info.failed_steps:

	204 if step_name in failed_steps:

	205 failed_steps[step_name]['first_failure'] = build_info.build_number

	206

	207 for step_name in failed_steps:

	208 if step_name in build_info.passed_steps:

	209 failed_steps[step_name]['last_pass'] = build_info.build_number

	210

	211 if all('last_pass' in step_info for step_info in failed_steps.values()):

	212 # All failed steps passed in this build cycle.

	213 return

	214

	215 # Arguments number differs from overridden method - pylint: disable=W0221

	216 def run(self, master_name, builder_name, build_number):

	217 build_info = self._ExtractBuildInfo(master_name, builder_name, build_number)

	218

	219 if not build_info: # pragma: no cover

	220 raise pipeline.Retry('Failed to extract build info.')

	221

	222 failure_info = {

	223 'failed': True,

	224 'master_name': master_name,

	225 'builder_name': builder_name,

	226 'build_number': build_number

	227 }

	228

	229 if (build_info.result == buildbot.SUCCESS or

	230 not build_info.failed_steps):

	231 failure_info['failed'] = False

	232 return failure_info

	233

	234 builds = dict()

	235 self._SaveBlamelistAndChromiumRevisionIntoDict(build_info, builds)

	236

	237 failed_steps = self._CreateADictOfFailedSteps(build_info)

	238

	239 self._CheckForFirstKnownFailure(

	240 master_name, builder_name, build_number, failed_steps, builds)

	241

	242 failure_info['builds'] = builds

	243 failure_info['failed_steps'] = failed_steps

	244 return failure_info

OLD	NEW

« no previous file with comments | « appengine/findit/waterfall/build_failure_analysis_pipelines.py ('k') | appengine/findit/waterfall/lock_util.py » ('j') | no next file with comments »