Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(922)

Side by Side Diff: appengine/findit/waterfall/build_failure_analysis_pipelines.py

Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Add appengine/findit/model/test/base_model_test.py Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved. 1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import collections
6 from datetime import datetime
5 import logging 7 import logging
8 import random
9 import time
6 10
11 from google.appengine.api import memcache
7 from google.appengine.ext import ndb 12 from google.appengine.ext import ndb
8 13
9 from pipeline_utils import pipelines 14 from pipeline_utils import pipelines
15 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline
10 16
17 from common.http_client_appengine import HttpClientAppengine as HttpClient
11 from model.build import Build 18 from model.build import Build
19 from model.build_analysis import BuildAnalysis
12 from model.build_analysis_status import BuildAnalysisStatus 20 from model.build_analysis_status import BuildAnalysisStatus
21 from waterfall import buildbot
13 22
14 23
# TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed.
class BasePipeline(pipelines.AppenginePipeline):  # pragma: no cover
  """Common base for Findit pipelines.

  Provides no-op implementations for the test/callback hooks required by the
  pipeline framework; concrete subclasses only need to implement run().
  """

  def run_test(self, *args, **kwargs):
    """No-op test hook."""
    pass

  def finalized_test(self, *args, **kwargs):
    """No-op test hook."""
    pass

  def callback(self, **kwargs):
    """No-op callback hook."""
    pass

  def run(self, *args, **kwargs):
    """Subclasses must implement the actual pipeline work."""
    raise NotImplementedError()
29 38
30 class BuildFailurePipeline(BasePipeline): 39 _MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s'
40 _MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60
41 _DOWNLOAD_INTERVAL_SECONDS = 5
42 _MAX_BUILDS_TO_CHECK_FOR_FIRST_FAILURE = 20
43
44
def _WaitUntilDownloadAllowed(
    master_name, timeout_seconds=90):  # pragma: no cover
  """Waits until next download from the specified master is allowed.

  A per-master memcache entry records the time of the last download; a new
  download is allowed only once _DOWNLOAD_INTERVAL_SECONDS have elapsed.

  Returns:
    True if download is allowed to proceed.
    False if download is not allowed until the given timeout occurs.
  """
  client = memcache.Client()
  key = _MEMCACHE_MASTER_DOWNLOAD_LOCK % master_name
  deadline = time.time() + timeout_seconds

  while True:
    last_download = client.gets(key)
    interval_elapsed = (
        not last_download or
        time.time() - last_download['time'] >= _DOWNLOAD_INTERVAL_SECONDS)

    if interval_elapsed:
      record = {
          'time': time.time()
      }
      # Use add/cas so concurrent workers racing on the same master cannot
      # both claim the download slot.
      if last_download:
        claimed = client.cas(
            key, record, time=_MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
      else:
        claimed = client.add(
            key, record, time=_MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)

      if claimed:
        logging.info('Download from %s is allowed. Waited %s seconds.',
                     master_name, (time.time() + timeout_seconds - deadline))
        return True

    if time.time() > deadline:
      logging.info('Download from %s is not allowed. Waited %s seconds.',
                   master_name, timeout_seconds)
      return False

    logging.info('Waiting to download from %s', master_name)
    time.sleep(_DOWNLOAD_INTERVAL_SECONDS + random.random())
82
83
class DetectFirstFailurePipeline(BasePipeline):
  """A pipeline to detect the first known failure of a failed build.

  For each step that failed in the given build, it looks back through up to
  _MAX_BUILDS_TO_CHECK_FOR_FIRST_FAILURE earlier builds on the same builder
  to determine in which build the step first started failing ('first_failure')
  and in which build it last passed ('last_pass').
  """

  # Shared HTTP client used to pull build data from the buildbot master.
  HTTP_CLIENT = HttpClient()

  def _BuildDataNeedUpdating(self, build):
    """Returns True if the cached build data should be (re-)downloaded.

    Data is needed when none was ever downloaded, or when the build was not
    yet completed at the last crawl and that crawl is at least 5 minutes old.
    """
    # Bug fix: total_seconds is a method; without the call the comparison was
    # against the bound method object itself, not the elapsed seconds, so the
    # staleness check never evaluated the actual time difference.
    return (not build.data or (not build.completed and
        (datetime.utcnow() -
         build.last_crawled_time).total_seconds() >= 60 * 5))

  def _DownloadBuildData(self, master_name, builder_name, build_number):
    """Downloads build data and returns a Build instance."""
    build = Build.GetBuild(master_name, builder_name, build_number)
    if not build:  # pragma: no cover
      build = Build.CreateBuild(master_name, builder_name, build_number)

    # Cache the data to avoid pulling from master again.
    if self._BuildDataNeedUpdating(build):  # pragma: no cover
      # Throttle downloads so concurrent analyses don't hammer the master;
      # retry the whole pipeline task if the wait times out.
      if not _WaitUntilDownloadAllowed(master_name):  # pragma: no cover
        raise pipeline.Retry('Too many download from %s' % master_name)

      build.data = buildbot.GetBuildData(
          build.master_name, build.builder_name, build.build_number,
          self.HTTP_CLIENT)
      build.last_crawled_time = datetime.utcnow()
      build.put()

    return build

  def _ExtractBuildInfo(self, master_name, builder_name, build_number):
    """Returns a BuildInfo instance for the specified build, or None.

    Side effects: updates the cached Build entity while the build is still
    running, and back-fills build_start_time on the BuildAnalysis if missing.
    """
    build = self._DownloadBuildData(master_name, builder_name, build_number)
    if not build.data:  # pragma: no cover
      return None

    build_info = buildbot.ExtractBuildInfo(
        master_name, builder_name, build_number, build.data)

    if not build.completed:  # pragma: no cover
      # Refresh the cached entity with the freshly parsed state.
      build.start_time = build_info.build_start_time
      build.completed = build_info.completed
      build.result = build_info.result
      build.put()

    analysis = BuildAnalysis.GetBuildAnalysis(
        master_name, builder_name, build_number)
    if analysis and not analysis.build_start_time:
      analysis.build_start_time = build_info.build_start_time
      analysis.put()

    return build_info

  def _SaveBlamelistAndChromiumRevisionIntoDict(self, build_info, builds):
    """Saves the blame list and chromium revision of a build into a dict.

    Args:
      build_info (BuildInfo): a BuildInfo instance which contains blame list
          and chromium revision.
      builds (dict): to which the blame list and chromium revision is saved.
          It will be updated and looks like:
          {
            555 : {
              'chromium_revision': 'a_git_hash',
              'blame_list': ['git_hash1', 'git_hash2'],
            },
          }
    """
    builds[build_info.build_number] = {
        'chromium_revision': build_info.chromium_revision,
        'blame_list': build_info.blame_list
    }

  def _CreateADictOfFailedSteps(self, build_info):
    """Returns a dict mapping failed step names to failure build numbers.

    Args:
      build_info (BuildInfo): the failed build whose failed steps are
          recorded.

    Returns:
      A dict like this:
      {
        'step_name': {
          'current_failure': 555,
          'first_failure': 553,
        },
      }
    """
    failed_steps = dict()
    for step_name in build_info.failed_steps:
      # Until earlier builds are examined, assume the current build is also
      # the first build in which the step failed.
      failed_steps[step_name] = {
          'current_failure': build_info.build_number,
          'first_failure': build_info.build_number,
      }

    return failed_steps

  def _CheckForFirstKnownFailure(self, master_name, builder_name, build_number,
                                 failed_steps, builds):
    """Checks for first known failures of the given failed steps.

    Args:
      master_name (str): master of the failed build.
      builder_name (str): builder of the failed build.
      build_number (int): builder number of the current failed build.
      failed_steps (dict): the failed steps of the current failed build. It
        will be updated with build numbers for 'first_failure' and 'last_pass'
        of each failed step.
      builds (dict): a dict to save blame list and chromium revision.
    """
    # Look back for first known failures.
    for i in range(_MAX_BUILDS_TO_CHECK_FOR_FIRST_FAILURE):
      build_info = self._ExtractBuildInfo(
          master_name, builder_name, build_number - i - 1)

      if not build_info:  # pragma: no cover
        # Failed to extract the build information, bail out.
        return

      self._SaveBlamelistAndChromiumRevisionIntoDict(build_info, builds)

      if build_info.result == buildbot.SUCCESS:
        for step_name in failed_steps:
          if 'last_pass' not in failed_steps[step_name]:
            failed_steps[step_name]['last_pass'] = build_info.build_number

        # All steps passed, so stop looking back.
        return
      else:
        # If a step is not run due to some bot exception, we are not sure
        # whether the step could pass or not. So we only check failed/passed
        # steps here.

        for step_name in build_info.failed_steps:
          if step_name in failed_steps:
            failed_steps[step_name]['first_failure'] = build_info.build_number

        for step_name in failed_steps:
          if step_name in build_info.passed_steps:
            failed_steps[step_name]['last_pass'] = build_info.build_number

        if all('last_pass' in step_info for step_info in failed_steps.values()):
          # All failed steps passed in this build cycle.
          return  # pragma: no cover

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, master_name, builder_name, build_number):
    """Detects first known failures of the given failed build.

    Returns:
      A dict with keys 'failed', 'master_name', 'builder_name',
      'build_number', and, when the build actually failed, 'builds' (blame
      lists/revisions per build) and 'failed_steps' (first/last failure info
      per step).

    Raises:
      pipeline.Retry: if build info could not be extracted.
    """
    build_info = self._ExtractBuildInfo(master_name, builder_name, build_number)

    if not build_info:  # pragma: no cover
      raise pipeline.Retry('Failed to extract build info.')

    failure_info = {
        'failed': True,
        'master_name': master_name,
        'builder_name': builder_name,
        'build_number': build_number
    }

    if (build_info.result == buildbot.SUCCESS or
        not build_info.failed_steps):  # pragma: no cover
      # Nothing to analyze: the build succeeded or had no failed steps.
      failure_info['failed'] = False
      return failure_info

    builds = dict()
    self._SaveBlamelistAndChromiumRevisionIntoDict(build_info, builds)

    failed_steps = self._CreateADictOfFailedSteps(build_info)

    self._CheckForFirstKnownFailure(
        master_name, builder_name, build_number, failed_steps, builds)

    failure_info['builds'] = builds
    failure_info['failed_steps'] = failed_steps
    return failure_info
254
255
class BuildFailurePipeline(BasePipeline):
  """Root pipeline that drives the analysis of one failed waterfall build."""

  def __init__(self, master_name, builder_name, build_number):
    super(BuildFailurePipeline, self).__init__(
        master_name, builder_name, build_number)
    # Keep the identifying triple around so finalized() can locate the
    # BuildAnalysis entity for this run.
    self.master_name = master_name
    self.builder_name = builder_name
    self.build_number = build_number

  def finalized(self):
    """Records the terminal analysis status once the pipeline completes."""
    analysis = BuildAnalysis.GetBuildAnalysis(
        self.master_name, self.builder_name, self.build_number)
    if self.was_aborted:  # pragma: no cover
      analysis.status = BuildAnalysisStatus.ERROR
    else:
      analysis.status = BuildAnalysisStatus.ANALYZED
    analysis.put()

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, master_name, builder_name, build_number):
    """Marks the analysis as in progress and spawns the sub-pipelines."""
    analysis = BuildAnalysis.GetBuildAnalysis(
        master_name, builder_name, build_number)
    analysis.pipeline_url = self.pipeline_status_url()
    analysis.status = BuildAnalysisStatus.ANALYZING
    analysis.start_time = datetime.utcnow()
    analysis.put()

    yield DetectFirstFailurePipeline(master_name, builder_name, build_number)
39 284
40 285
@ndb.transactional
def NeedANewAnalysis(master_name, builder_name, build_number, force):
  """Checks status of analysis for the build and decides if a new one is needed.

  A BuildAnalysis entity for the given build will be created if none exists.

  Returns:
    True if an analysis is needed, otherwise False.
  """
  analysis = BuildAnalysis.GetBuildAnalysis(
      master_name, builder_name, build_number)

  if not analysis:
    # First request for this build: create a pending analysis record.
    analysis = BuildAnalysis.CreateBuildAnalysis(
        master_name, builder_name, build_number)
    analysis.status = BuildAnalysisStatus.PENDING
    analysis.put()
    return True

  if force:
    # TODO: avoid concurrent analysis.
    analysis.Reset()
    analysis.put()
    return True

  # TODO: support following cases
  # 1. Automatically retry if last analysis failed with errors.
  # 2. Start another analysis if the build cycle wasn't completed in last
  #    analysis request.
  # 3. Analysis is not complete and no update in the last 5 minutes.
  return False
70 316
71 317
def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force,
                             queue_name):
  """Schedules an analysis if needed and returns the build analysis.

  A BuildFailurePipeline is started on the given task queue whenever
  NeedANewAnalysis decides a (re-)analysis is warranted.
  """
  if not NeedANewAnalysis(
      master_name, builder_name, build_number, force):  # pragma: no cover
    logging.info('Analysis was already triggered or the result is recent.')
  else:
    pipeline_job = BuildFailurePipeline(master_name, builder_name, build_number)
    pipeline_job.start(queue_name=queue_name)

    logging.info('An analysis triggered on build %s, %s, %s: %s',
                 master_name, builder_name, build_number,
                 pipeline_job.pipeline_status_url())

  return BuildAnalysis.GetBuildAnalysis(master_name, builder_name, build_number)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698