Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2302)

Unified Diff: appengine/findit/waterfall/build_failure_analysis_pipelines.py

Issue 820113002: [Findit] Add a sub-pipeline to detect first-known failure. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Test will come in next patchset. Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/findit/waterfall/build_failure_analysis_pipelines.py
diff --git a/appengine/findit/waterfall/build_failure_analysis_pipelines.py b/appengine/findit/waterfall/build_failure_analysis_pipelines.py
index 91af3608be8b35d6959fd22afd1d4e436faa60f2..28189073db5923634a169fee79b11c6026629e89 100644
--- a/appengine/findit/waterfall/build_failure_analysis_pipelines.py
+++ b/appengine/findit/waterfall/build_failure_analysis_pipelines.py
@@ -2,14 +2,23 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+import collections
+from datetime import datetime
import logging
+import random
+import time
+from google.appengine.api import memcache
from google.appengine.ext import ndb
from pipeline_utils import pipelines
+from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline
+from common.http_client_appengine import HttpClientAppengine as HttpClient
from model.build import Build
+from model.build_analysis import BuildAnalysis
from model.build_analysis_status import BuildAnalysisStatus
+from waterfall import buildbot
# TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed.
@@ -27,52 +36,220 @@ class BasePipeline(pipelines.AppenginePipeline): # pragma: no cover
raise NotImplementedError()
-class BuildFailurePipeline(BasePipeline):
+MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s'
+MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60
+DOWNLOAD_INTERVAL_SECONDS = 5
qyearsley 2014/12/29 19:44:54 If these aren't used in any other modules, you can make them private to this module (prefix with an underscore).
stgao 2015/01/03 01:31:47 Done.
+
+
+def WaitUntilDownloadAllowed(
+ master_name, timeout_seconds=90): # pragma: no cover
+ """Wait until next download from the specified master is allowed.
+
+ Returns:
+ True if download is allowed to proceed.
+ False if download is not allowed until the given timeout occurs.
+ """
qyearsley 2014/12/29 19:44:53 I might put WaitUntilDownloadAllowed below where it is used.
stgao 2015/01/03 01:31:47 In infra repo, it seems the convention is to put helper functions at the top of the module.
+ client = memcache.Client()
+ key = MEMCACHE_MASTER_DOWNLOAD_LOCK % master_name
+
+ deadline = time.time() + timeout_seconds
+ while True:
+ info = client.gets(key)
+ if not info or time.time() - info['time'] >= DOWNLOAD_INTERVAL_SECONDS:
+ new_info = {
+ 'time': time.time()
+ }
+ if not info:
+ success = client.add(
+ key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
+ else:
+ success = client.cas(
+ key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
+
+ if success:
+ return True
+
+ if time.time() > deadline:
+ return False
+
+ logging.info('waiting to download from %s', master_name)
+ time.sleep(DOWNLOAD_INTERVAL_SECONDS + random.random())
+
+
+class DetectFirstFailure(BasePipeline):
qyearsley 2014/12/29 19:44:54 How about the name DetectFirstFailurePipeline? (It would make it clearer that this class is a pipeline.)
stgao 2015/01/03 01:31:47 Good point. Done.
+ HTTP_CLIENT = HttpClient()
+
+ def ExtractBuildInfo(self, master_name, builder_name, build_number):
+ """Return a BuildInfo instance for the specified build."""
qyearsley 2014/12/29 19:44:53 Return -> Returns
stgao 2015/01/03 01:31:47 Done.
+ build = Build.GetBuild(master_name, builder_name, build_number)
+ if not build:
+ build = Build.CreateBuild(master_name, builder_name, build_number)
+
+ # Cache the data to avoid pulling from master again.
+ if (not build.data or (not build.completed and
+ (datetime.utcnow() - build.last_crawled_time).total_seconds >= 60 * 5)):
+ if not WaitUntilDownloadAllowed(master_name):
+ raise pipeline.Retry('Too many download from %s' % master_name)
+
+ build.data = buildbot.GetBuildData(
+ build.master_name, build.builder_name, build.build_number,
+ self.HTTP_CLIENT)
+ build.last_crawled_time = datetime.utcnow()
+
+ if not build.data:
+ return None
+
+ build_info = buildbot.ExtractBuildInfo(
+ master_name, builder_name, build_number, build.data)
+
+ if not build.completed:
+ build.start_time = build_info.build_start_time
+ build.completed = build_info.completed
+ build.result = build_info.result
+ build.put()
+
+ analysis = BuildAnalysis.GetBuildAnalysis(
+ master_name, builder_name, build_number)
+ if analysis and not analysis.build_start_time:
+ analysis.build_start_time = build_info.build_start_time
+
+ return build_info
# Arguments number differs from overridden method - pylint: disable=W0221
def run(self, master_name, builder_name, build_number):
- build = Build.GetBuild(master_name, builder_name, build_number)
+ build_info = self.ExtractBuildInfo(master_name, builder_name, build_number)
- # TODO: implement the logic.
- build.analysis_status = BuildAnalysisStatus.ANALYZED
- build.put()
+ if not build_info:
+ raise pipeline.Retry('Failed to extract build info.')
+
+ failure_info = {
+ 'failed': True,
+ }
+
+ if build_info.result == buildbot.SUCCESS or not build_info.failed_steps:
+ failure_info['failed'] = False
+ return failure_info
+
+ # Save blame list and tested chromium revision for builds.
+ # {
+ # 555 : {
+ # 'chromium_revision': 'a_git_hash',
+ # 'blame_list': ['cl1', 'cl2'],
+ # },
+ # }
qyearsley 2014/12/29 19:44:53 Is this an example of what the `builds` dict below contains? If so, consider making that explicit.
stgao 2015/01/03 01:31:47 Done. Moved to a helper function.
+ builds = collections.defaultdict(dict)
+ def FillBuildInfo(info):
+ builds[info.build_number]['chromium_revision'] = info.chromium_revision
+ builds[info.build_number]['blame_list'] = info.blame_list
+
+ FillBuildInfo(build_info)
+
+ # Save build number for failed steps.
+ # {
+ # 'step_name': {
+ # 'current_failure': 555,
+ # 'first_failure': 553,
+ # 'last_pass': 551,
+ # },
+ # }
+ failed_steps = collections.defaultdict(dict)
+ for step_name in build_info.failed_steps:
+ failed_steps[step_name]['current_failure'] = build_info.build_number
+ failed_steps[step_name]['first_failure'] = build_info.build_number
+
+ # Look back for first known failure. At most, 20 builds.
+ for i in range(20):
qyearsley 2014/12/29 19:44:54 This 20 should probably be a module-level constant
stgao 2015/01/03 01:31:47 Done.
+ build_info = self.ExtractBuildInfo(
+ master_name, builder_name, build_number - i - 1)
+
+ if not build_info:
+ # Failed to extract the build information, bail out.
+ break
+
+ FillBuildInfo(build_info)
+
+ if build_info.result == buildbot.SUCCESS:
+ for step_name in failed_steps:
+ if 'last_pass' not in failed_steps[step_name]:
+ failed_steps[step_name]['last_pass'] = build_info.build_number
+
+ # All steps passed, so stop looking back.
+ break
+ else:
+ # If a step is not run due to some bot exception, we are not sure
+ # whether the step could pass or not. So we only check failed/passed
+ # steps here.
+
+ for step_name in build_info.failed_steps:
+ if step_name in failed_steps:
+ failed_steps[step_name]['first_failure'] = build_info.build_number
+
+ for step_name in failed_steps:
+ if step_name in build_info.passed_steps:
+ failed_steps[step_name]['last_pass'] = build_info.build_number
+
+ failure_info['builds'] = builds
+ failure_info['failed_steps'] = failed_steps
+ return failure_info
qyearsley 2014/12/29 19:44:53 This function is very long, and would probably be easier to follow if split into smaller helper functions.
stgao 2015/01/03 01:31:47 Done. Hope the code is easier to understand.
+
+
+class BuildFailurePipeline(BasePipeline):
+
+ def __init__(self, master_name, builder_name, build_number):
+ super(BuildFailurePipeline, self).__init__(
+ master_name, builder_name, build_number)
+ self.master_name = master_name
+ self.builder_name = builder_name
+ self.build_number = build_number
+
+ def finalized(self):
+ if self.was_aborted:
+ analysis = BuildAnalysis.GetBuildAnalysis(
+ self.master_name, self.builder_name, self.build_number)
+ analysis.status = BuildAnalysisStatus.ERROR
+ analysis.put()
+
+ # Arguments number differs from overridden method - pylint: disable=W0221
qyearsley 2014/12/29 19:44:54 Cool, I didn't know you could add a comment on the same line as a pylint disable directive.
stgao 2015/01/03 01:31:47 Yes, they could fit into one line.
+ def run(self, master_name, builder_name, build_number):
+ yield DetectFirstFailure(master_name, builder_name, build_number)
@ndb.transactional
def NeedANewAnalysis(master_name, builder_name, build_number, force):
- """Check analysis status of a build and decide if a new analysis is needed.
+ """Check status of analysis for the build and decide if a new one is needed.
qyearsley 2014/12/29 19:44:54 Check -> Checks, decide -> decides (third-person singular).
stgao 2015/01/03 01:31:47 Done.
Returns:
(build, need_analysis)
build (Build): the build as specified by the input.
qyearsley 2014/12/29 19:44:53 Make sure to update the docstring to reflect the changed return value (Build -> BuildAnalysis).
stgao 2015/01/03 01:31:47 Oops, done.
need_analysis (bool): True if an analysis is needed, otherwise False.
"""
qyearsley 2014/12/29 19:44:54 Similar to WaitUntilDownloadAllowed, I think NeedANewAnalysis could be made private to this module.
stgao 2015/01/03 01:31:47 I intentionally made this function public for testability.
- build_key = Build.CreateKey(master_name, builder_name, build_number)
- build = build_key.get()
-
- if not build:
- build = Build.CreateBuild(master_name, builder_name, build_number)
- build.analysis_status = BuildAnalysisStatus.PENDING
- build.put()
- return build, True
+ analysis = BuildAnalysis.GetBuildAnalysis(
+ master_name, builder_name, build_number)
+
+ if not analysis:
+ analysis = BuildAnalysis.CreateBuildAnalysis(
+ master_name, builder_name, build_number)
+ analysis.status = BuildAnalysisStatus.PENDING
+ analysis.put()
+ return analysis, True
elif force:
# TODO: avoid concurrent analysis.
- build.Reset()
- build.put()
- return build, True
+ analysis.Reset()
+ analysis.put()
+ return analysis, True
else:
# TODO: support following cases
# 1. Automatically retry if last analysis failed with errors.
# 2. Start another analysis if the build cycle wasn't completed in last
# analysis request.
# 3. Analysis is not complete and no update in the last 5 minutes.
- return build, False
+ return analysis, False
def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force,
queue_name):
"""Schedule an analysis if needed and return the build."""
- build, need_new_analysis = NeedANewAnalysis(
+ analysis, need_new_analysis = NeedANewAnalysis(
master_name, builder_name, build_number, force)
if need_new_analysis:
@@ -85,4 +262,4 @@ def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force,
else: # pragma: no cover
logging.info('Analysis was already triggered or the result is recent.')
- return build
+ return analysis
« appengine/findit/model/build_analysis.py ('K') | « appengine/findit/model/build_analysis.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698