Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(922)

Side by Side Diff: appengine/findit/waterfall/build_failure_analysis_pipelines.py

Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Add appengine/findit/model/test/base_model_test.py Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved. 1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import collections
6 from datetime import datetime
5 import logging 7 import logging
8 import random
9 import time
6 10
11 from google.appengine.api import memcache
7 from google.appengine.ext import ndb 12 from google.appengine.ext import ndb
8 13
9 from pipeline_utils import pipelines 14 from pipeline_utils import pipelines
15 from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline
10 16
17 from common.http_client_appengine import HttpClientAppengine as HttpClient
11 from model.build import Build 18 from model.build import Build
19 from model.build_analysis import BuildAnalysis
12 from model.build_analysis_status import BuildAnalysisStatus 20 from model.build_analysis_status import BuildAnalysisStatus
21 from waterfall import buildbot
13 22
14 23
# TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed.
class BasePipeline(pipelines.AppenginePipeline):  # pragma: no cover
  """Common base for Findit pipelines.

  Provides no-op implementations for the test/callback hooks required by the
  pipeline framework; concrete subclasses only need to implement run().
  """

  def run_test(self, *args, **kwargs):
    """No-op test hook."""
    pass

  def finalized_test(self, *args, **kwargs):
    """No-op test hook."""
    pass

  def callback(self, **kwargs):
    """No-op callback hook."""
    pass

  def run(self, *args, **kwargs):
    """Subclasses must implement the actual pipeline work."""
    raise NotImplementedError()
29 38
30 class BuildFailurePipeline(BasePipeline): 39 _MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s'
40 _MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60
41 _DOWNLOAD_INTERVAL_SECONDS = 5
42 _MAX_BUILDS_TO_CHECK_FOR_FIRST_FAILURE = 20
43
44
def _WaitUntilDownloadAllowed(
    master_name, timeout_seconds=90):  # pragma: no cover
  """Waits until next download from the specified master is allowed.

  A per-master memcache entry records the time of the last download; a new
  download is allowed only once _DOWNLOAD_INTERVAL_SECONDS have elapsed.

  Returns:
    True if download is allowed to proceed.
    False if download is not allowed until the given timeout occurs.
  """
  client = memcache.Client()
  key = _MEMCACHE_MASTER_DOWNLOAD_LOCK % master_name
  deadline = time.time() + timeout_seconds

  while True:
    last_download = client.gets(key)
    interval_elapsed = (
        not last_download or
        time.time() - last_download['time'] >= _DOWNLOAD_INTERVAL_SECONDS)

    if interval_elapsed:
      record = {
          'time': time.time()
      }
      # Use add/cas so concurrent workers racing on the same master cannot
      # both claim the download slot.
      if last_download:
        claimed = client.cas(
            key, record, time=_MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
      else:
        claimed = client.add(
            key, record, time=_MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)

      if claimed:
        logging.info('Download from %s is allowed. Waited %s seconds.',
                     master_name, (time.time() + timeout_seconds - deadline))
        return True

    if time.time() > deadline:
      logging.info('Download from %s is not allowed. Waited %s seconds.',
                   master_name, timeout_seconds)
      return False

    logging.info('Waiting to download from %s', master_name)
    time.sleep(_DOWNLOAD_INTERVAL_SECONDS + random.random())
82
83
class DetectFirstFailurePipeline(BasePipeline):
  """A pipeline to detect the first known failure of a failed build.

  For each step that failed in the given build, it looks back through up to
  _MAX_BUILDS_TO_CHECK_FOR_FIRST_FAILURE earlier builds on the same builder
  to determine in which build the step first started failing ('first_failure')
  and in which build it last passed ('last_pass').
  """

  # Shared HTTP client used to pull build data from the buildbot master.
  HTTP_CLIENT = HttpClient()

  def _BuildDataNeedUpdating(self, build):
    """Returns True if the cached build data should be (re-)downloaded.

    Data is needed when none was ever downloaded, or when the build was not
    yet completed at the last crawl and that crawl is at least 5 minutes old.
    """
    # Bug fix: total_seconds is a method; without the call the comparison was
    # against the bound method object itself, not the elapsed seconds, so the
    # staleness check never evaluated the actual time difference.
    return (not build.data or (not build.completed and
        (datetime.utcnow() -
         build.last_crawled_time).total_seconds() >= 60 * 5))

  def _DownloadBuildData(self, master_name, builder_name, build_number):
    """Downloads build data and returns a Build instance."""
    build = Build.GetBuild(master_name, builder_name, build_number)
    if not build:  # pragma: no cover
      build = Build.CreateBuild(master_name, builder_name, build_number)

    # Cache the data to avoid pulling from master again.
    if self._BuildDataNeedUpdating(build):  # pragma: no cover
      # Throttle downloads so concurrent analyses don't hammer the master;
      # retry the whole pipeline task if the wait times out.
      if not _WaitUntilDownloadAllowed(master_name):  # pragma: no cover
        raise pipeline.Retry('Too many download from %s' % master_name)

      build.data = buildbot.GetBuildData(
          build.master_name, build.builder_name, build.build_number,
          self.HTTP_CLIENT)
      build.last_crawled_time = datetime.utcnow()
      build.put()

    return build

  def _ExtractBuildInfo(self, master_name, builder_name, build_number):
    """Returns a BuildInfo instance for the specified build, or None.

    Side effects: updates the cached Build entity while the build is still
    running, and back-fills build_start_time on the BuildAnalysis if missing.
    """
    build = self._DownloadBuildData(master_name, builder_name, build_number)
    if not build.data:  # pragma: no cover
      return None

    build_info = buildbot.ExtractBuildInfo(
        master_name, builder_name, build_number, build.data)

    if not build.completed:  # pragma: no cover
      # Refresh the cached entity with the freshly parsed state.
      build.start_time = build_info.build_start_time
      build.completed = build_info.completed
      build.result = build_info.result
      build.put()

    analysis = BuildAnalysis.GetBuildAnalysis(
        master_name, builder_name, build_number)
    if analysis and not analysis.build_start_time:
      analysis.build_start_time = build_info.build_start_time
      analysis.put()

    return build_info

  def _SaveBlamelistAndChromiumRevisionIntoDict(self, build_info, builds):
    """Saves the blame list and chromium revision of a build into a dict.

    Args:
      build_info (BuildInfo): a BuildInfo instance which contains blame list
          and chromium revision.
      builds (dict): to which the blame list and chromium revision is saved.
          It will be updated and looks like:
          {
            555 : {
              'chromium_revision': 'a_git_hash',
              'blame_list': ['git_hash1', 'git_hash2'],
            },
          }
    """
    builds[build_info.build_number] = {
        'chromium_revision': build_info.chromium_revision,
        'blame_list': build_info.blame_list
    }

  def _CreateADictOfFailedSteps(self, build_info):
    """Returns a dict mapping failed step names to failure build numbers.

    Args:
      build_info (BuildInfo): the failed build whose failed steps are
          recorded.

    Returns:
      A dict like this:
      {
        'step_name': {
          'current_failure': 555,
          'first_failure': 553,
        },
      }
    """
    failed_steps = dict()
    for step_name in build_info.failed_steps:
      # Until earlier builds are examined, assume the current build is also
      # the first build in which the step failed.
      failed_steps[step_name] = {
          'current_failure': build_info.build_number,
          'first_failure': build_info.build_number,
      }

    return failed_steps

  def _CheckForFirstKnownFailure(self, master_name, builder_name, build_number,
                                 failed_steps, builds):
    """Checks for first known failures of the given failed steps.

    Args:
      master_name (str): master of the failed build.
      builder_name (str): builder of the failed build.
      build_number (int): builder number of the current failed build.
      failed_steps (dict): the failed steps of the current failed build. It
        will be updated with build numbers for 'first_failure' and 'last_pass'
        of each failed step.
      builds (dict): a dict to save blame list and chromium revision.
    """
    # Look back for first known failures.
    for i in range(_MAX_BUILDS_TO_CHECK_FOR_FIRST_FAILURE):
      build_info = self._ExtractBuildInfo(
          master_name, builder_name, build_number - i - 1)

      if not build_info:  # pragma: no cover
        # Failed to extract the build information, bail out.
        return

      self._SaveBlamelistAndChromiumRevisionIntoDict(build_info, builds)

      if build_info.result == buildbot.SUCCESS:
        for step_name in failed_steps:
          if 'last_pass' not in failed_steps[step_name]:
            failed_steps[step_name]['last_pass'] = build_info.build_number

        # All steps passed, so stop looking back.
        return
      else:
        # If a step is not run due to some bot exception, we are not sure
        # whether the step could pass or not. So we only check failed/passed
        # steps here.

        for step_name in build_info.failed_steps:
          if step_name in failed_steps:
            failed_steps[step_name]['first_failure'] = build_info.build_number

        for step_name in failed_steps:
          if step_name in build_info.passed_steps:
            failed_steps[step_name]['last_pass'] = build_info.build_number

        if all('last_pass' in step_info for step_info in failed_steps.values()):
          # All failed steps passed in this build cycle.
          return  # pragma: no cover

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, master_name, builder_name, build_number):
    """Detects first known failures of the given failed build.

    Returns:
      A dict with keys 'failed', 'master_name', 'builder_name',
      'build_number', and, when the build actually failed, 'builds' (blame
      lists/revisions per build) and 'failed_steps' (first/last failure info
      per step).

    Raises:
      pipeline.Retry: if build info could not be extracted.
    """
    build_info = self._ExtractBuildInfo(master_name, builder_name, build_number)

    if not build_info:  # pragma: no cover
      raise pipeline.Retry('Failed to extract build info.')

    failure_info = {
        'failed': True,
        'master_name': master_name,
        'builder_name': builder_name,
        'build_number': build_number
    }

    if (build_info.result == buildbot.SUCCESS or
        not build_info.failed_steps):  # pragma: no cover
      # Nothing to analyze: the build succeeded or had no failed steps.
      failure_info['failed'] = False
      return failure_info

    builds = dict()
    self._SaveBlamelistAndChromiumRevisionIntoDict(build_info, builds)

    failed_steps = self._CreateADictOfFailedSteps(build_info)

    self._CheckForFirstKnownFailure(
        master_name, builder_name, build_number, failed_steps, builds)

    failure_info['builds'] = builds
    failure_info['failed_steps'] = failed_steps
    return failure_info
254
255
class BuildFailurePipeline(BasePipeline):
  """Root pipeline that drives the analysis of one failed waterfall build."""

  def __init__(self, master_name, builder_name, build_number):
    super(BuildFailurePipeline, self).__init__(
        master_name, builder_name, build_number)
    # Keep the identifying triple around so finalized() can locate the
    # BuildAnalysis entity for this run.
    self.master_name = master_name
    self.builder_name = builder_name
    self.build_number = build_number

  def finalized(self):
    """Records the terminal analysis status once the pipeline completes."""
    analysis = BuildAnalysis.GetBuildAnalysis(
        self.master_name, self.builder_name, self.build_number)
    if self.was_aborted:  # pragma: no cover
      analysis.status = BuildAnalysisStatus.ERROR
    else:
      analysis.status = BuildAnalysisStatus.ANALYZED
    analysis.put()

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, master_name, builder_name, build_number):
    """Marks the analysis as in progress and spawns the sub-pipelines."""
    analysis = BuildAnalysis.GetBuildAnalysis(
        master_name, builder_name, build_number)
    analysis.pipeline_url = self.pipeline_status_url()
    analysis.status = BuildAnalysisStatus.ANALYZING
    analysis.start_time = datetime.utcnow()
    analysis.put()

    yield DetectFirstFailurePipeline(master_name, builder_name, build_number)
39 284
40 285
@ndb.transactional
def NeedANewAnalysis(master_name, builder_name, build_number, force):
  """Checks status of analysis for the build and decides if a new one is needed.

  A BuildAnalysis entity for the given build will be created if none exists.

  Returns:
    True if an analysis is needed, otherwise False.
  """
  analysis = BuildAnalysis.GetBuildAnalysis(
      master_name, builder_name, build_number)

  if not analysis:
    # First request for this build: create a pending analysis record.
    analysis = BuildAnalysis.CreateBuildAnalysis(
        master_name, builder_name, build_number)
    analysis.status = BuildAnalysisStatus.PENDING
    analysis.put()
    return True

  if force:
    # TODO: avoid concurrent analysis.
    analysis.Reset()
    analysis.put()
    return True

  # TODO: support following cases
  # 1. Automatically retry if last analysis failed with errors.
  # 2. Start another analysis if the build cycle wasn't completed in last
  #    analysis request.
  # 3. Analysis is not complete and no update in the last 5 minutes.
  return False
70 316
71 317
def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force,
                             queue_name):
  """Schedules an analysis if needed and returns the build analysis.

  A BuildFailurePipeline is started on the given task queue whenever
  NeedANewAnalysis decides a (re-)analysis is warranted.
  """
  if not NeedANewAnalysis(
      master_name, builder_name, build_number, force):  # pragma: no cover
    logging.info('Analysis was already triggered or the result is recent.')
  else:
    pipeline_job = BuildFailurePipeline(master_name, builder_name, build_number)
    pipeline_job.start(queue_name=queue_name)

    logging.info('An analysis triggered on build %s, %s, %s: %s',
                 master_name, builder_name, build_number,
                 pipeline_job.pipeline_status_url())

  return BuildAnalysis.GetBuildAnalysis(master_name, builder_name, build_number)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698