appengine/findit/waterfall/build_failure_analysis_pipelines.py - Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure.

Side by Side Diff: appengine/findit/waterfall/build_failure_analysis_pipelines.py

Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Test will come in next patchset. Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.	1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

	5 import collections

	6 from datetime import datetime

5 import logging	7 import logging

	8 import random

	9 import time

6	10

	11 from google.appengine.api import memcache

7 from google.appengine.ext import ndb	12 from google.appengine.ext import ndb

8	13

9 from pipeline_utils import pipelines	14 from pipeline_utils import pipelines

	15 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline

10	16

	17 from common.http_client_appengine import HttpClientAppengine as HttpClient

11 from model.build import Build	18 from model.build import Build

	19 from model.build_analysis import BuildAnalysis

12 from model.build_analysis_status import BuildAnalysisStatus	20 from model.build_analysis_status import BuildAnalysisStatus

	21 from waterfall import buildbot

13	22

14	23

15 # TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed.	24 # TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed.

16 class BasePipeline(pipelines.AppenginePipeline): # pragma: no cover	25 class BasePipeline(pipelines.AppenginePipeline): # pragma: no cover

17 def run_test(self, args, *kwargs):	26 def run_test(self, args, *kwargs):

18 pass	27 pass

19	28

20 def finalized_test(self, args, *kwargs):	29 def finalized_test(self, args, *kwargs):

21 pass	30 pass

22	31

23 def callback(self, **kwargs):	32 def callback(self, **kwargs):

24 pass	33 pass

25	34

26 def run(self, args, *kwargs):	35 def run(self, args, *kwargs):

27 raise NotImplementedError()	36 raise NotImplementedError()

28	37

29	38

30 class BuildFailurePipeline(BasePipeline):	39 MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s'

	40 MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60

	41 DOWNLOAD_INTERVAL_SECONDS = 5
	qyearsley 2014/12/29 19:44:54 If these aren't used in any other modules, you can If these aren't used in any other modules, you can mark them as private: _MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s' _MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60 _DOWNLOAD_INTERVAL_SECONDS = 5 stgao 2015/01/03 01:31:47 Done. Show quoted text On 2014/12/29 19:44:54, qyearsley wrote: > If these aren't used in any other modules, you can mark them as private: > > _MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s' > _MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60 > _DOWNLOAD_INTERVAL_SECONDS = 5 Done.
	42

	43

	44 def WaitUntilDownloadAllowed(

	45 master_name, timeout_seconds=90): # pragma: no cover

	46 """Wait until next download from the specified master is allowed.

	47

	48 Returns:

	49 True if download is allowed to proceed.

	50 False if download is not allowed until the given timeout occurs.

	51 """
	qyearsley 2014/12/29 19:44:53 I might put WaitUntilDownloadAllowed below where i I might put WaitUntilDownloadAllowed below where it's used in DetectFirstFailure, because in general I prefer putting helper functions after the place where they're used. Also, if it's not used in other modules, it could be private. stgao 2015/01/03 01:31:47 In infra repo, it seems the convention is to put h Show quoted text On 2014/12/29 19:44:53, qyearsley wrote: > I might put WaitUntilDownloadAllowed below where it's used in > DetectFirstFailure, because in general I prefer putting helper functions after > the place where they're used. In infra repo, it seems the convention is to put helper functions before where they are used. As findit is in infra repo, I'd follow the convention here :) Show quoted text > > Also, if it's not used in other modules, it could be private. Done.
	52 client = memcache.Client()

	53 key = MEMCACHE_MASTER_DOWNLOAD_LOCK % master_name

	54

	55 deadline = time.time() + timeout_seconds

	56 while True:

	57 info = client.gets(key)

	58 if not info or time.time() - info['time'] >= DOWNLOAD_INTERVAL_SECONDS:

	59 new_info = {

	60 'time': time.time()

	61 }

	62 if not info:

	63 success = client.add(

	64 key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)

	65 else:

	66 success = client.cas(

	67 key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)

	68

	69 if success:

	70 return True

	71

	72 if time.time() > deadline:

	73 return False

	74

	75 logging.info('waiting to download from %s', master_name)

	76 time.sleep(DOWNLOAD_INTERVAL_SECONDS + random.random())

	77

	78

	79 class DetectFirstFailure(BasePipeline):
	qyearsley 2014/12/29 19:44:54 How about the name DetectFirstFailurePipeline? (It How about the name DetectFirstFailurePipeline? (It seems like the name of a class should always be a noun phrase, not a verb phrase.) stgao 2015/01/03 01:31:47 Good point. Done. Show quoted text On 2014/12/29 19:44:54, qyearsley wrote: > How about the name DetectFirstFailurePipeline? (It seems like the name of a > class should always be a noun phrase, not a verb phrase.) Good point. Done.
	80 HTTP_CLIENT = HttpClient()

	81

	82 def ExtractBuildInfo(self, master_name, builder_name, build_number):

	83 """Return a BuildInfo instance for the specified build."""
	qyearsley 2014/12/29 19:44:53 Return -> Returns Return -> Returns stgao 2015/01/03 01:31:47 Done. Show quoted text On 2014/12/29 19:44:53, qyearsley wrote: > Return -> Returns Done.
	84 build = Build.GetBuild(master_name, builder_name, build_number)

	85 if not build:

	86 build = Build.CreateBuild(master_name, builder_name, build_number)

	87

	88 # Cache the data to avoid pulling from master again.

	89 if (not build.data or (not build.completed and

	90 (datetime.utcnow() - build.last_crawled_time).total_seconds >= 60 * 5)):

	91 if not WaitUntilDownloadAllowed(master_name):

	92 raise pipeline.Retry('Too many download from %s' % master_name)

	93

	94 build.data = buildbot.GetBuildData(

	95 build.master_name, build.builder_name, build.build_number,

	96 self.HTTP_CLIENT)

	97 build.last_crawled_time = datetime.utcnow()

	98

	99 if not build.data:

	100 return None

	101

	102 build_info = buildbot.ExtractBuildInfo(

	103 master_name, builder_name, build_number, build.data)

	104

	105 if not build.completed:

	106 build.start_time = build_info.build_start_time

	107 build.completed = build_info.completed

	108 build.result = build_info.result

	109 build.put()

	110

	111 analysis = BuildAnalysis.GetBuildAnalysis(

	112 master_name, builder_name, build_number)

	113 if analysis and not analysis.build_start_time:

	114 analysis.build_start_time = build_info.build_start_time

	115

	116 return build_info

31	117

32 # Arguments number differs from overridden method - pylint: disable=W0221	118 # Arguments number differs from overridden method - pylint: disable=W0221

33 def run(self, master_name, builder_name, build_number):	119 def run(self, master_name, builder_name, build_number):

34 build = Build.GetBuild(master_name, builder_name, build_number)	120 build_info = self.ExtractBuildInfo(master_name, builder_name, build_number)

35	121

36 # TODO: implement the logic.	122 if not build_info:

37 build.analysis_status = BuildAnalysisStatus.ANALYZED	123 raise pipeline.Retry('Failed to extract build info.')

38 build.put()	124

	125 failure_info = {

	126 'failed': True,

	127 }

	128

	129 if build_info.result == buildbot.SUCCESS or not build_info.failed_steps:

	130 failure_info['failed'] = False

	131 return failure_info

	132

	133 # Save blame list and tested chromium revision for builds.

	134 # {

	135 # 555 : {

	136 # 'chromium_revision': 'a_git_hash',

	137 # 'blame_list': ['cl1', 'cl2'],

	138 # },

	139 # }
	qyearsley 2014/12/29 19:44:53 Is this an example of what the `builds` dict below Is this an example of what the `builds` dict below might look like? If so, maybe you should explicitly say this. # Save blame list and tested chromium revision for builds. # This will be saved in a dict that looks like this: # ... If you extract these few lines to a sub-function, then this could be put into the Returns part of the docstring. stgao 2015/01/03 01:31:47 Done. Moved to a helper function. Show quoted text On 2014/12/29 19:44:53, qyearsley wrote: > Is this an example of what the `builds` dict below might look like? If so, maybe > you should explicitly say this. > > # Save blame list and tested chromium revision for builds. > # This will be saved in a dict that looks like this: > # ... > > If you extract these few lines to a sub-function, then this could be put into > the Returns part of the docstring. Done. Moved to a helper function.
	140 builds = collections.defaultdict(dict)

	141 def FillBuildInfo(info):

	142 builds[info.build_number]['chromium_revision'] = info.chromium_revision

	143 builds[info.build_number]['blame_list'] = info.blame_list

	144

	145 FillBuildInfo(build_info)

	146

	147 # Save build number for failed steps.

	148 # {

	149 # 'step_name': {

	150 # 'current_failure': 555,

	151 # 'first_failure': 553,

	152 # 'last_pass': 551,

	153 # },

	154 # }

	155 failed_steps = collections.defaultdict(dict)

	156 for step_name in build_info.failed_steps:

	157 failed_steps[step_name]['current_failure'] = build_info.build_number

	158 failed_steps[step_name]['first_failure'] = build_info.build_number

	159

	160 # Look back for first known failure. At most, 20 builds.

	161 for i in range(20):
	qyearsley 2014/12/29 19:44:54 This 20 should probably be a module-level constant This 20 should probably be a module-level constant. stgao 2015/01/03 01:31:47 Done. Show quoted text On 2014/12/29 19:44:54, qyearsley wrote: > This 20 should probably be a module-level constant. Done.
	162 build_info = self.ExtractBuildInfo(

	163 master_name, builder_name, build_number - i - 1)

	164

	165 if not build_info:

	166 # Failed to extract the build information, bail out.

	167 break

	168

	169 FillBuildInfo(build_info)

	170

	171 if build_info.result == buildbot.SUCCESS:

	172 for step_name in failed_steps:

	173 if 'last_pass' not in failed_steps[step_name]:

	174 failed_steps[step_name]['last_pass'] = build_info.build_number

	175

	176 # All steps passed, so stop looking back.

	177 break

	178 else:

	179 # If a step is not run due to some bot exception, we are not sure

	180 # whether the step could pass or not. So we only check failed/passed

	181 # steps here.

	182

	183 for step_name in build_info.failed_steps:

	184 if step_name in failed_steps:

	185 failed_steps[step_name]['first_failure'] = build_info.build_number

	186

	187 for step_name in failed_steps:

	188 if step_name in build_info.passed_steps:

	189 failed_steps[step_name]['last_pass'] = build_info.build_number

	190

	191 failure_info['builds'] = builds

	192 failure_info['failed_steps'] = failed_steps

	193 return failure_info
	qyearsley 2014/12/29 19:44:53 This function is very long, and would probably be This function is very long, and would probably be easier to understand if it were broken into some sub-functions. Possible candidates for sub-functions: Lines 133-143: Constructing the builds dict Lines 147-158: Constructing the failed_steps dict Lines 162-189: Looking back at a particular past build number in order to see whether it passed or failed. stgao 2015/01/03 01:31:47 Done. Hope the code is easier to understand. Show quoted text On 2014/12/29 19:44:53, qyearsley wrote: > This function is very long, and would probably be easier to understand if it > were broken into some sub-functions. Possible candidates for sub-functions: > > Lines 133-143: Constructing the builds dict > Lines 147-158: Constructing the failed_steps dict > Lines 162-189: Looking back at a particular past build number in order to see > whether it passed or failed. Done. Hope the code is easier to understand.
	194

	195

	196 class BuildFailurePipeline(BasePipeline):

	197

	198 def __init__(self, master_name, builder_name, build_number):

	199 super(BuildFailurePipeline, self).__init__(

	200 master_name, builder_name, build_number)

	201 self.master_name = master_name

	202 self.builder_name = builder_name

	203 self.build_number = build_number

	204

	205 def finalized(self):

	206 if self.was_aborted:

	207 analysis = BuildAnalysis.GetBuildAnalysis(

	208 self.master_name, self.builder_name, self.build_number)

	209 analysis.status = BuildAnalysisStatus.ERROR

	210 analysis.put()

	211

	212 # Arguments number differs from overridden method - pylint: disable=W0221
	qyearsley 2014/12/29 19:44:54 Cool, I didn't know you could add a comment on the Cool, I didn't know you could add a comment on the same line as a pylint pragma. stgao 2015/01/03 01:31:47 Yes, they could fit into one line. Show quoted text On 2014/12/29 19:44:54, qyearsley wrote: > Cool, I didn't know you could add a comment on the same line as a pylint pragma. Yes, they could fit into one line.
	213 def run(self, master_name, builder_name, build_number):

	214 yield DetectFirstFailure(master_name, builder_name, build_number)

39	215

40	216

41 @ndb.transactional	217 @ndb.transactional

42 def NeedANewAnalysis(master_name, builder_name, build_number, force):	218 def NeedANewAnalysis(master_name, builder_name, build_number, force):

43 """Check analysis status of a build and decide if a new analysis is needed.	219 """Check status of analysis for the build and decide if a new one is needed.
	qyearsley 2014/12/29 19:44:54 Check -> Checks, decide -> decides (third-person s Check -> Checks, decide -> decides (third-person singular form.) stgao 2015/01/03 01:31:47 Done. Show quoted text On 2014/12/29 19:44:54, qyearsley wrote: > Check -> Checks, decide -> decides (third-person singular form.) Done.
44	220

45 Returns:	221 Returns:

46 (build, need_analysis)	222 (build, need_analysis)

47 build (Build): the build as specified by the input.	223 build (Build): the build as specified by the input.
	qyearsley 2014/12/29 19:44:53 Make sure to update the docstring to reflect the c Make sure to update the docstring to reflect the current state of the code. stgao 2015/01/03 01:31:47 Oops, done. Show quoted text On 2014/12/29 19:44:53, qyearsley wrote: > Make sure to update the docstring to reflect the current state of the code. Oops, done.
48 need_analysis (bool): True if an analysis is needed, otherwise False.	224 need_analysis (bool): True if an analysis is needed, otherwise False.

49 """	225 """
	qyearsley 2014/12/29 19:44:54 Similar to WaitUntilDownloadAllowed, I think NeedA Similar to WaitUntilDownloadAllowed, I think NeedANewAnalysis could be put below where it's used (below ScheduleAnalysisIfNeeded) and make it private if possible. Also, it would be easier to understand this function if it only returned a boolean instead of a pair, and if it were named "NeedsNewAnalysis". To make it return only a boolean, you could make it take a BuildAnalysis (or None), and call BuildAnalysis.GetBuildAnalysis before calling this function. stgao 2015/01/03 01:31:47 I intentionally made this function public for test Show quoted text On 2014/12/29 19:44:54, qyearsley wrote: > Similar to WaitUntilDownloadAllowed, I think NeedANewAnalysis could be put below > where it's used (below ScheduleAnalysisIfNeeded) and make it private if > possible. I intentionally made this function public for testing. Last time, pylint complained that testing code accesses this private function. (Same for other functions that are unit-tested.) Show quoted text > > Also, it would be easier to understand this function if it only returned a > boolean instead of a pair, and if it were named "NeedsNewAnalysis". To make it > return only a boolean, you could make it take a BuildAnalysis (or None), and > call BuildAnalysis.GetBuildAnalysis before calling this function. Good point. I made this function return a boolean only now. The caller could just call BuildAnalysis.GetBuildAnalysis again. That's fine too. But we couldn't pass in a BuildAnalysis. Because this function is run in a transaction to avoid scheduling more than two analysis for the same build failure. If two concurrent analysis requests for the same build are sent to findit and a BuildAnalysis instance is passed into this function, two analysis might get scheduled.
50 build_key = Build.CreateKey(master_name, builder_name, build_number)	226 analysis = BuildAnalysis.GetBuildAnalysis(

51 build = build_key.get()	227 master_name, builder_name, build_number)

52	228

53 if not build:	229 if not analysis:

54 build = Build.CreateBuild(master_name, builder_name, build_number)	230 analysis = BuildAnalysis.CreateBuildAnalysis(

55 build.analysis_status = BuildAnalysisStatus.PENDING	231 master_name, builder_name, build_number)

56 build.put()	232 analysis.status = BuildAnalysisStatus.PENDING

57 return build, True	233 analysis.put()

	234 return analysis, True

58 elif force:	235 elif force:

59 # TODO: avoid concurrent analysis.	236 # TODO: avoid concurrent analysis.

60 build.Reset()	237 analysis.Reset()

61 build.put()	238 analysis.put()

62 return build, True	239 return analysis, True

63 else:	240 else:

64 # TODO: support following cases	241 # TODO: support following cases

65 # 1. Automatically retry if last analysis failed with errors.	242 # 1. Automatically retry if last analysis failed with errors.

66 # 2. Start another analysis if the build cycle wasn't completed in last	243 # 2. Start another analysis if the build cycle wasn't completed in last

67 # analysis request.	244 # analysis request.

68 # 3. Analysis is not complete and no update in the last 5 minutes.	245 # 3. Analysis is not complete and no update in the last 5 minutes.

69 return build, False	246 return analysis, False

70	247

71	248

72 def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force,	249 def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force,

73 queue_name):	250 queue_name):

74 """Schedule an analysis if needed and return the build."""	251 """Schedule an analysis if needed and return the build."""

75 build, need_new_analysis = NeedANewAnalysis(	252 analysis, need_new_analysis = NeedANewAnalysis(

76 master_name, builder_name, build_number, force)	253 master_name, builder_name, build_number, force)

77	254

78 if need_new_analysis:	255 if need_new_analysis:

79 pipeline_job = BuildFailurePipeline(master_name, builder_name, build_number)	256 pipeline_job = BuildFailurePipeline(master_name, builder_name, build_number)

80 pipeline_job.start(queue_name=queue_name)	257 pipeline_job.start(queue_name=queue_name)

81	258

82 logging.info('An analysis triggered on build %s, %s, %s: %s',	259 logging.info('An analysis triggered on build %s, %s, %s: %s',

83 master_name, builder_name, build_number,	260 master_name, builder_name, build_number,

84 pipeline_job.pipeline_status_url())	261 pipeline_job.pipeline_status_url())

85 else: # pragma: no cover	262 else: # pragma: no cover

86 logging.info('Analysis was already triggered or the result is recent.')	263 logging.info('Analysis was already triggered or the result is recent.')

87	264

88 return build	265 return analysis

OLD	NEW

« appengine/findit/model/build_analysis.py ('K') | « appengine/findit/model/build_analysis.py ('k') | no next file » | no next file with comments »