Index: appengine/findit/waterfall/build_failure_analysis_pipelines.py |
diff --git a/appengine/findit/waterfall/build_failure_analysis_pipelines.py b/appengine/findit/waterfall/build_failure_analysis_pipelines.py |
index 91af3608be8b35d6959fd22afd1d4e436faa60f2..28189073db5923634a169fee79b11c6026629e89 100644 |
--- a/appengine/findit/waterfall/build_failure_analysis_pipelines.py |
+++ b/appengine/findit/waterfall/build_failure_analysis_pipelines.py |
@@ -2,14 +2,23 @@ |
# Use of this source code is governed by a BSD-style license that can be |
# found in the LICENSE file. |
+import collections |
+from datetime import datetime |
import logging |
+import random |
+import time |
+from google.appengine.api import memcache |
from google.appengine.ext import ndb |
from pipeline_utils import pipelines |
+from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline |
+from common.http_client_appengine import HttpClientAppengine as HttpClient |
from model.build import Build |
+from model.build_analysis import BuildAnalysis |
from model.build_analysis_status import BuildAnalysisStatus |
+from waterfall import buildbot |
# TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed. |
@@ -27,52 +36,220 @@ class BasePipeline(pipelines.AppenginePipeline): # pragma: no cover |
raise NotImplementedError() |
-class BuildFailurePipeline(BasePipeline): |
# Memcache key (filled in with the master name) that acts as a cross-instance
# lock so that downloads from a single buildbot master are rate-limited.
MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s'
# How long the lock entry lives in memcache before expiring (1 hour).
MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60
# Minimum delay between two consecutive downloads from the same master.
DOWNLOAD_INTERVAL_SECONDS = 5
def WaitUntilDownloadAllowed(
    master_name, timeout_seconds=90):  # pragma: no cover
  """Wait until next download from the specified master is allowed.

  Rate-limits downloads through a shared memcache entry recording the time of
  the last download from the master; compare-and-set (gets/cas) is used so
  that concurrent App Engine instances race safely for the slot.

  Args:
    master_name: Name of the buildbot master to download from.
    timeout_seconds: How long to keep trying before giving up.

  Returns:
    True if download is allowed to proceed.
    False if download is not allowed until the given timeout occurs.
  """
  client = memcache.Client()
  key = MEMCACHE_MASTER_DOWNLOAD_LOCK % master_name

  deadline = time.time() + timeout_seconds
  while True:
    info = client.gets(key)
    # Proceed only when there is no record yet, or the last download is at
    # least DOWNLOAD_INTERVAL_SECONDS old.
    if not info or time.time() - info['time'] >= DOWNLOAD_INTERVAL_SECONDS:
      new_info = {
          'time': time.time()
      }
      if not info:
        # No entry yet: add() fails if another instance created it first.
        success = client.add(
            key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
      else:
        # cas() only succeeds if the entry was unchanged since gets().
        success = client.cas(
            key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)

      if success:
        return True

    if time.time() > deadline:
      return False

    logging.info('waiting to download from %s', master_name)
    # Random jitter spreads out retries from concurrent instances.
    time.sleep(DOWNLOAD_INTERVAL_SECONDS + random.random())
+ |
class DetectFirstFailure(BasePipeline):
  """A pipeline that detects the first failure of each failed step in a build.

  Starting from the given build, it looks back at most _MAX_BUILDS_TO_CHECK
  earlier builds to find the first consecutive failure and the last known
  pass of every step that failed in the given build.
  """

  HTTP_CLIENT = HttpClient()

  # Limit on how many earlier builds are examined when looking back for the
  # first known failure of a step.
  _MAX_BUILDS_TO_CHECK = 20

  def ExtractBuildInfo(self, master_name, builder_name, build_number):
    """Returns a BuildInfo instance for the specified build.

    Downloads the build data from the buildbot master when it is missing or
    stale, and caches it in the datastore to avoid re-pulling from master.

    Returns:
      A BuildInfo instance, or None if the build data could not be
      downloaded.

    Raises:
      pipeline.Retry: if downloading is currently rate-limited.
    """
    build = Build.GetBuild(master_name, builder_name, build_number)
    if not build:
      build = Build.CreateBuild(master_name, builder_name, build_number)

    # Re-download when there is no cached data yet, or when the build is not
    # completed and the cached copy is older than 5 minutes.
    # BUGFIX: total_seconds is a method; the original compared the bound
    # method object itself against 60 * 5, never the elapsed time.
    if (not build.data or
        (not build.completed and
         (datetime.utcnow() - build.last_crawled_time).total_seconds()
         >= 60 * 5)):
      if not WaitUntilDownloadAllowed(master_name):
        raise pipeline.Retry('Too many downloads from %s' % master_name)

      build.data = buildbot.GetBuildData(
          build.master_name, build.builder_name, build.build_number,
          self.HTTP_CLIENT)
      build.last_crawled_time = datetime.utcnow()

    if not build.data:
      return None

    build_info = buildbot.ExtractBuildInfo(
        master_name, builder_name, build_number, build.data)

    if not build.completed:
      build.start_time = build_info.build_start_time
      build.completed = build_info.completed
      build.result = build_info.result
      build.put()

    analysis = BuildAnalysis.GetBuildAnalysis(
        master_name, builder_name, build_number)
    if analysis and not analysis.build_start_time:
      analysis.build_start_time = build_info.build_start_time
      # BUGFIX: persist the update; the original assignment was never saved.
      analysis.put()

    return build_info

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, master_name, builder_name, build_number):
    """Detects the first failed build for each step failing in the build.

    Returns:
      A dict like:
      {
        'failed': True,
        'builds': {
          555: {
            'chromium_revision': 'a_git_hash',
            'blame_list': ['cl1', 'cl2'],
          },
        },
        'failed_steps': {
          'step_name': {
            'current_failure': 555,
            'first_failure': 553,
            'last_pass': 551,
          },
        },
      }
      or simply {'failed': False} when the build succeeded.
    """
    build_info = self.ExtractBuildInfo(master_name, builder_name, build_number)

    if not build_info:
      raise pipeline.Retry('Failed to extract build info.')

    failure_info = {
        'failed': True,
    }

    if build_info.result == buildbot.SUCCESS or not build_info.failed_steps:
      failure_info['failed'] = False
      return failure_info

    # Maps a build number to its tested chromium revision and blame list.
    builds = collections.defaultdict(dict)
    def FillBuildInfo(info):
      builds[info.build_number]['chromium_revision'] = info.chromium_revision
      builds[info.build_number]['blame_list'] = info.blame_list

    FillBuildInfo(build_info)

    # Maps each failed step name to the current/first failed build number and
    # the last passed build number.
    failed_steps = collections.defaultdict(dict)
    for step_name in build_info.failed_steps:
      failed_steps[step_name]['current_failure'] = build_info.build_number
      failed_steps[step_name]['first_failure'] = build_info.build_number

    # Look back for the first known failure, at most _MAX_BUILDS_TO_CHECK
    # earlier builds.
    for i in range(self._MAX_BUILDS_TO_CHECK):
      build_info = self.ExtractBuildInfo(
          master_name, builder_name, build_number - i - 1)

      if not build_info:
        # Failed to extract the build information, bail out.
        break

      FillBuildInfo(build_info)

      if build_info.result == buildbot.SUCCESS:
        for step_name in failed_steps:
          if 'last_pass' not in failed_steps[step_name]:
            failed_steps[step_name]['last_pass'] = build_info.build_number

        # All steps passed, so stop looking back.
        break
      else:
        # If a step is not run due to some bot exception, we are not sure
        # whether the step could pass or not. So we only check failed/passed
        # steps here.

        for step_name in build_info.failed_steps:
          # BUGFIX: once a pass of the step was found in a later build, an
          # even earlier failure belongs to a previous failure episode and
          # must not move first_failure further back.
          if (step_name in failed_steps and
              'last_pass' not in failed_steps[step_name]):
            failed_steps[step_name]['first_failure'] = build_info.build_number

        for step_name in failed_steps:
          if step_name in build_info.passed_steps:
            failed_steps[step_name]['last_pass'] = build_info.build_number

    failure_info['builds'] = builds
    failure_info['failed_steps'] = failed_steps
    return failure_info
+ |
+ |
class BuildFailurePipeline(BasePipeline):
  """Root pipeline that analyzes the failure of a waterfall build."""

  def __init__(self, master_name, builder_name, build_number):
    super(BuildFailurePipeline, self).__init__(
        master_name, builder_name, build_number)
    # Keep the keys on the instance so finalized() can locate the analysis.
    self.master_name = master_name
    self.builder_name = builder_name
    self.build_number = build_number

  def finalized(self):
    """Marks the analysis as in error if the pipeline was aborted."""
    if self.was_aborted:
      analysis = BuildAnalysis.GetBuildAnalysis(
          self.master_name, self.builder_name, self.build_number)
      # ROBUSTNESS: the analysis may not exist if the pipeline was aborted
      # before the analysis entity was created; the original would crash on
      # None here.
      if analysis:
        analysis.status = BuildAnalysisStatus.ERROR
        analysis.put()

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, master_name, builder_name, build_number):
    yield DetectFirstFailure(master_name, builder_name, build_number)
@ndb.transactional
def NeedANewAnalysis(master_name, builder_name, build_number, force):
  """Checks status of analysis for the build and decides if a new one is needed.

  Runs transactionally so that concurrent requests for the same build do not
  both decide to start an analysis.

  Returns:
    (analysis, need_analysis)
    analysis (BuildAnalysis): the analysis of the build as specified by the
        input.
    need_analysis (bool): True if an analysis is needed, otherwise False.
  """
  analysis = BuildAnalysis.GetBuildAnalysis(
      master_name, builder_name, build_number)

  if not analysis:
    # No previous analysis: create one in PENDING state and request a run.
    analysis = BuildAnalysis.CreateBuildAnalysis(
        master_name, builder_name, build_number)
    analysis.status = BuildAnalysisStatus.PENDING
    analysis.put()
    return analysis, True
  elif force:
    # A re-run was explicitly requested: reset the previous result.
    # TODO: avoid concurrent analysis.
    analysis.Reset()
    analysis.put()
    return analysis, True
  else:
    # TODO: support following cases
    # 1. Automatically retry if last analysis failed with errors.
    # 2. Start another analysis if the build cycle wasn't completed in last
    # analysis request.
    # 3. Analysis is not complete and no update in the last 5 minutes.
    return analysis, False
def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force, |
queue_name): |
"""Schedule an analysis if needed and return the build.""" |
- build, need_new_analysis = NeedANewAnalysis( |
+ analysis, need_new_analysis = NeedANewAnalysis( |
master_name, builder_name, build_number, force) |
if need_new_analysis: |
@@ -85,4 +262,4 @@ def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force, |
else: # pragma: no cover |
logging.info('Analysis was already triggered or the result is recent.') |
- return build |
+ return analysis |