Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(117)

Side by Side Diff: appengine/findit/waterfall/build_failure_analysis_pipelines.py

Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Test will come in next patchset. Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved. 1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import collections
6 from datetime import datetime
5 import logging 7 import logging
8 import random
9 import time
6 10
11 from google.appengine.api import memcache
7 from google.appengine.ext import ndb 12 from google.appengine.ext import ndb
8 13
9 from pipeline_utils import pipelines 14 from pipeline_utils import pipelines
15 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline
10 16
17 from common.http_client_appengine import HttpClientAppengine as HttpClient
11 from model.build import Build 18 from model.build import Build
19 from model.build_analysis import BuildAnalysis
12 from model.build_analysis_status import BuildAnalysisStatus 20 from model.build_analysis_status import BuildAnalysisStatus
21 from waterfall import buildbot
13 22
14 23
15 # TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed. 24 # TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed.
16 class BasePipeline(pipelines.AppenginePipeline): # pragma: no cover 25 class BasePipeline(pipelines.AppenginePipeline): # pragma: no cover
17 def run_test(self, *args, **kwargs): 26 def run_test(self, *args, **kwargs):
18 pass 27 pass
19 28
20 def finalized_test(self, *args, **kwargs): 29 def finalized_test(self, *args, **kwargs):
21 pass 30 pass
22 31
23 def callback(self, **kwargs): 32 def callback(self, **kwargs):
24 pass 33 pass
25 34
26 def run(self, *args, **kwargs): 35 def run(self, *args, **kwargs):
27 raise NotImplementedError() 36 raise NotImplementedError()
28 37
29 38
30 class BuildFailurePipeline(BasePipeline): 39 MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s'
40 MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60
41 DOWNLOAD_INTERVAL_SECONDS = 5
qyearsley 2014/12/29 19:44:54 If these aren't used in any other modules, you can
stgao 2015/01/03 01:31:47 Done.
42
43
44 def WaitUntilDownloadAllowed(
45 master_name, timeout_seconds=90): # pragma: no cover
46 """Wait until next download from the specified master is allowed.
47
48 Returns:
49 True if download is allowed to proceed.
50 False if download is not allowed until the given timeout occurs.
51 """
qyearsley 2014/12/29 19:44:53 I might put WaitUntilDownloadAllowed below where i
stgao 2015/01/03 01:31:47 In infra repo, it seems the convention is to put h
52 client = memcache.Client()
53 key = MEMCACHE_MASTER_DOWNLOAD_LOCK % master_name
54
55 deadline = time.time() + timeout_seconds
56 while True:
57 info = client.gets(key)
58 if not info or time.time() - info['time'] >= DOWNLOAD_INTERVAL_SECONDS:
59 new_info = {
60 'time': time.time()
61 }
62 if not info:
63 success = client.add(
64 key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
65 else:
66 success = client.cas(
67 key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
68
69 if success:
70 return True
71
72 if time.time() > deadline:
73 return False
74
75 logging.info('waiting to download from %s', master_name)
76 time.sleep(DOWNLOAD_INTERVAL_SECONDS + random.random())
77
78
79 class DetectFirstFailure(BasePipeline):
qyearsley 2014/12/29 19:44:54 How about the name DetectFirstFailurePipeline? (It
stgao 2015/01/03 01:31:47 Good point. Done.
80 HTTP_CLIENT = HttpClient()
81
82 def ExtractBuildInfo(self, master_name, builder_name, build_number):
83 """Return a BuildInfo instance for the specified build."""
qyearsley 2014/12/29 19:44:53 Return -> Returns
stgao 2015/01/03 01:31:47 Done.
84 build = Build.GetBuild(master_name, builder_name, build_number)
85 if not build:
86 build = Build.CreateBuild(master_name, builder_name, build_number)
87
88 # Cache the data to avoid pulling from master again.
89 if (not build.data or (not build.completed and
90 (datetime.utcnow() - build.last_crawled_time).total_seconds >= 60 * 5)):
91 if not WaitUntilDownloadAllowed(master_name):
92 raise pipeline.Retry('Too many download from %s' % master_name)
93
94 build.data = buildbot.GetBuildData(
95 build.master_name, build.builder_name, build.build_number,
96 self.HTTP_CLIENT)
97 build.last_crawled_time = datetime.utcnow()
98
99 if not build.data:
100 return None
101
102 build_info = buildbot.ExtractBuildInfo(
103 master_name, builder_name, build_number, build.data)
104
105 if not build.completed:
106 build.start_time = build_info.build_start_time
107 build.completed = build_info.completed
108 build.result = build_info.result
109 build.put()
110
111 analysis = BuildAnalysis.GetBuildAnalysis(
112 master_name, builder_name, build_number)
113 if analysis and not analysis.build_start_time:
114 analysis.build_start_time = build_info.build_start_time
115
116 return build_info
31 117
32 # Arguments number differs from overridden method - pylint: disable=W0221 118 # Arguments number differs from overridden method - pylint: disable=W0221
33 def run(self, master_name, builder_name, build_number): 119 def run(self, master_name, builder_name, build_number):
34 build = Build.GetBuild(master_name, builder_name, build_number) 120 build_info = self.ExtractBuildInfo(master_name, builder_name, build_number)
35 121
36 # TODO: implement the logic. 122 if not build_info:
37 build.analysis_status = BuildAnalysisStatus.ANALYZED 123 raise pipeline.Retry('Failed to extract build info.')
38 build.put() 124
125 failure_info = {
126 'failed': True,
127 }
128
129 if build_info.result == buildbot.SUCCESS or not build_info.failed_steps:
130 failure_info['failed'] = False
131 return failure_info
132
133 # Save blame list and tested chromium revision for builds.
134 # {
135 # 555 : {
136 # 'chromium_revision': 'a_git_hash',
137 # 'blame_list': ['cl1', 'cl2'],
138 # },
139 # }
qyearsley 2014/12/29 19:44:53 Is this an example of what the `builds` dict below
stgao 2015/01/03 01:31:47 Done. Moved to a helper function.
140 builds = collections.defaultdict(dict)
141 def FillBuildInfo(info):
142 builds[info.build_number]['chromium_revision'] = info.chromium_revision
143 builds[info.build_number]['blame_list'] = info.blame_list
144
145 FillBuildInfo(build_info)
146
147 # Save build number for failed steps.
148 # {
149 # 'step_name': {
150 # 'current_failure': 555,
151 # 'first_failure': 553,
152 # 'last_pass': 551,
153 # },
154 # }
155 failed_steps = collections.defaultdict(dict)
156 for step_name in build_info.failed_steps:
157 failed_steps[step_name]['current_failure'] = build_info.build_number
158 failed_steps[step_name]['first_failure'] = build_info.build_number
159
160 # Look back for first known failure. At most, 20 builds.
161 for i in range(20):
qyearsley 2014/12/29 19:44:54 This 20 should probably be a module-level constant
stgao 2015/01/03 01:31:47 Done.
162 build_info = self.ExtractBuildInfo(
163 master_name, builder_name, build_number - i - 1)
164
165 if not build_info:
166 # Failed to extract the build information, bail out.
167 break
168
169 FillBuildInfo(build_info)
170
171 if build_info.result == buildbot.SUCCESS:
172 for step_name in failed_steps:
173 if 'last_pass' not in failed_steps[step_name]:
174 failed_steps[step_name]['last_pass'] = build_info.build_number
175
176 # All steps passed, so stop looking back.
177 break
178 else:
179 # If a step is not run due to some bot exception, we are not sure
180 # whether the step could pass or not. So we only check failed/passed
181 # steps here.
182
183 for step_name in build_info.failed_steps:
184 if step_name in failed_steps:
185 failed_steps[step_name]['first_failure'] = build_info.build_number
186
187 for step_name in failed_steps:
188 if step_name in build_info.passed_steps:
189 failed_steps[step_name]['last_pass'] = build_info.build_number
190
191 failure_info['builds'] = builds
192 failure_info['failed_steps'] = failed_steps
193 return failure_info
qyearsley 2014/12/29 19:44:53 This function is very long, and would probably be
stgao 2015/01/03 01:31:47 Done. Hope the code is easier to understand.
194
195
196 class BuildFailurePipeline(BasePipeline):
197
198 def __init__(self, master_name, builder_name, build_number):
199 super(BuildFailurePipeline, self).__init__(
200 master_name, builder_name, build_number)
201 self.master_name = master_name
202 self.builder_name = builder_name
203 self.build_number = build_number
204
205 def finalized(self):
206 if self.was_aborted:
207 analysis = BuildAnalysis.GetBuildAnalysis(
208 self.master_name, self.builder_name, self.build_number)
209 analysis.status = BuildAnalysisStatus.ERROR
210 analysis.put()
211
212 # Arguments number differs from overridden method - pylint: disable=W0221
qyearsley 2014/12/29 19:44:54 Cool, I didn't know you could add a comment on the
stgao 2015/01/03 01:31:47 Yes, they could fit into one line.
213 def run(self, master_name, builder_name, build_number):
214 yield DetectFirstFailure(master_name, builder_name, build_number)
39 215
40 216
41 @ndb.transactional 217 @ndb.transactional
42 def NeedANewAnalysis(master_name, builder_name, build_number, force): 218 def NeedANewAnalysis(master_name, builder_name, build_number, force):
43 """Check analysis status of a build and decide if a new analysis is needed. 219 """Check status of analysis for the build and decide if a new one is needed.
qyearsley 2014/12/29 19:44:54 Check -> Checks, decide -> decides (third-person s
stgao 2015/01/03 01:31:47 Done.
44 220
45 Returns: 221 Returns:
46 (build, need_analysis) 222 (build, need_analysis)
47 build (Build): the build as specified by the input. 223 build (Build): the build as specified by the input.
qyearsley 2014/12/29 19:44:53 Make sure to update the docstring to reflect the c
stgao 2015/01/03 01:31:47 Oops, done.
48 need_analysis (bool): True if an analysis is needed, otherwise False. 224 need_analysis (bool): True if an analysis is needed, otherwise False.
49 """ 225 """
qyearsley 2014/12/29 19:44:54 Similar to WaitUntilDownloadAllowed, I think NeedA
stgao 2015/01/03 01:31:47 I intentionally made this function public for test
50 build_key = Build.CreateKey(master_name, builder_name, build_number) 226 analysis = BuildAnalysis.GetBuildAnalysis(
51 build = build_key.get() 227 master_name, builder_name, build_number)
52 228
53 if not build: 229 if not analysis:
54 build = Build.CreateBuild(master_name, builder_name, build_number) 230 analysis = BuildAnalysis.CreateBuildAnalysis(
55 build.analysis_status = BuildAnalysisStatus.PENDING 231 master_name, builder_name, build_number)
56 build.put() 232 analysis.status = BuildAnalysisStatus.PENDING
57 return build, True 233 analysis.put()
234 return analysis, True
58 elif force: 235 elif force:
59 # TODO: avoid concurrent analysis. 236 # TODO: avoid concurrent analysis.
60 build.Reset() 237 analysis.Reset()
61 build.put() 238 analysis.put()
62 return build, True 239 return analysis, True
63 else: 240 else:
64 # TODO: support following cases 241 # TODO: support following cases
65 # 1. Automatically retry if last analysis failed with errors. 242 # 1. Automatically retry if last analysis failed with errors.
66 # 2. Start another analysis if the build cycle wasn't completed in last 243 # 2. Start another analysis if the build cycle wasn't completed in last
67 # analysis request. 244 # analysis request.
68 # 3. Analysis is not complete and no update in the last 5 minutes. 245 # 3. Analysis is not complete and no update in the last 5 minutes.
69 return build, False 246 return analysis, False
70 247
71 248
72 def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force, 249 def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force,
73 queue_name): 250 queue_name):
74 """Schedule an analysis if needed and return the build.""" 251 """Schedule an analysis if needed and return the build."""
75 build, need_new_analysis = NeedANewAnalysis( 252 analysis, need_new_analysis = NeedANewAnalysis(
76 master_name, builder_name, build_number, force) 253 master_name, builder_name, build_number, force)
77 254
78 if need_new_analysis: 255 if need_new_analysis:
79 pipeline_job = BuildFailurePipeline(master_name, builder_name, build_number) 256 pipeline_job = BuildFailurePipeline(master_name, builder_name, build_number)
80 pipeline_job.start(queue_name=queue_name) 257 pipeline_job.start(queue_name=queue_name)
81 258
82 logging.info('An analysis triggered on build %s, %s, %s: %s', 259 logging.info('An analysis triggered on build %s, %s, %s: %s',
83 master_name, builder_name, build_number, 260 master_name, builder_name, build_number,
84 pipeline_job.pipeline_status_url()) 261 pipeline_job.pipeline_status_url())
85 else: # pragma: no cover 262 else: # pragma: no cover
86 logging.info('Analysis was already triggered or the result is recent.') 263 logging.info('Analysis was already triggered or the result is recent.')
87 264
88 return build 265 return analysis
OLDNEW
« appengine/findit/model/build_analysis.py ('K') | « appengine/findit/model/build_analysis.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698