Chromium Code Reviews — Index: appengine/findit/waterfall/build_failure_analysis_pipelines.py
| diff --git a/appengine/findit/waterfall/build_failure_analysis_pipelines.py b/appengine/findit/waterfall/build_failure_analysis_pipelines.py |
| index 91af3608be8b35d6959fd22afd1d4e436faa60f2..28189073db5923634a169fee79b11c6026629e89 100644 |
| --- a/appengine/findit/waterfall/build_failure_analysis_pipelines.py |
| +++ b/appengine/findit/waterfall/build_failure_analysis_pipelines.py |
| @@ -2,14 +2,23 @@ |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| +import collections |
| +from datetime import datetime |
| import logging |
| +import random |
| +import time |
| +from google.appengine.api import memcache |
| from google.appengine.ext import ndb |
| from pipeline_utils import pipelines |
| +from pipeline_utils.appengine_third_party_pipeline_src_pipeline import pipeline |
| +from common.http_client_appengine import HttpClientAppengine as HttpClient |
| from model.build import Build |
| +from model.build_analysis import BuildAnalysis |
| from model.build_analysis_status import BuildAnalysisStatus |
| +from waterfall import buildbot |
| # TODO(stgao): remove BasePipeline after http://crrev.com/810193002 is landed. |
| @@ -27,52 +36,220 @@ class BasePipeline(pipelines.AppenginePipeline): # pragma: no cover |
| raise NotImplementedError() |
| -class BuildFailurePipeline(BasePipeline): |
# Memcache key template for the per-master download lock shared across
# app instances to rate-limit crawling of a buildbot master.
MEMCACHE_MASTER_DOWNLOAD_LOCK = 'master-download-lock-%s'
# How long (in seconds) a lock entry lives in memcache before expiring.
MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS = 60 * 60
# Minimum delay (in seconds) between two downloads from the same master.
DOWNLOAD_INTERVAL_SECONDS = 5
|
qyearsley
2014/12/29 19:44:54
If these aren't used in any other modules, you can
stgao
2015/01/03 01:31:47
Done.
|
| + |
| + |
def WaitUntilDownloadAllowed(
    master_name, timeout_seconds=90):  # pragma: no cover
  """Waits until the next download from the specified master is allowed.

  Downloads are rate-limited across app instances through a memcache
  compare-and-set loop so that at most one download hits a given master
  every DOWNLOAD_INTERVAL_SECONDS.

  Args:
    master_name: Name of the buildbot master to download from.
    timeout_seconds: How long to keep waiting before giving up.

  Returns:
    True if download is allowed to proceed.
    False if download is not allowed until the given timeout occurs.
  """
  client = memcache.Client()
  key = MEMCACHE_MASTER_DOWNLOAD_LOCK % master_name

  deadline = time.time() + timeout_seconds
  while True:
    info = client.gets(key)
    if not info or time.time() - info['time'] >= DOWNLOAD_INTERVAL_SECONDS:
      new_info = {
          'time': time.time()
      }
      if not info:
        # No lock entry yet: try to create it atomically.
        success = client.add(
            key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)
      else:
        # Entry is stale: take it over only if nobody else did (CAS).
        success = client.cas(
            key, new_info, time=MEMCACHE_MASTER_DOWNLOAD_EXPIRATION_SECONDS)

      if success:
        return True

    if time.time() > deadline:
      return False

    logging.info('waiting to download from %s', master_name)
    # Sleep with jitter, but never sleep past the deadline by a full
    # interval (the original could overshoot the timeout by up to ~6s).
    remaining = deadline - time.time()
    time.sleep(min(remaining, DOWNLOAD_INTERVAL_SECONDS + random.random()))
| + |
| + |
class DetectFirstFailure(BasePipeline):
  """Pipeline to detect the first failure of each failed step in a build.

  Walks backwards through preceding builds on the same builder to find,
  for every step that failed in the given build, the earliest build in
  which it failed and the latest build in which it passed.
  """

  HTTP_CLIENT = HttpClient()

  # Maximum number of preceding builds to examine when looking back for the
  # first known failure of a step.
  MAX_BUILDS_TO_CHECK = 20

  # Re-download data for an incomplete build only if the cached copy is
  # older than this many seconds.
  BUILD_DATA_CACHE_SECONDS = 60 * 5

  def ExtractBuildInfo(self, master_name, builder_name, build_number):
    """Returns a BuildInfo instance for the specified build, or None.

    Downloads build data from the master when it is not cached, or when the
    build was incomplete and the cache is stale; persists the downloaded
    data so it is not pulled from the master again.

    Raises:
      pipeline.Retry: if downloading is currently rate-limited.
    """
    build = Build.GetBuild(master_name, builder_name, build_number)
    if not build:
      build = Build.CreateBuild(master_name, builder_name, build_number)

    # Cache the data to avoid pulling from master again.
    # NOTE: the original compared the bound method `.total_seconds` (missing
    # parentheses) to an int, which in Python 2 is always true, so the cache
    # never took effect for incomplete builds. Fixed by calling it.
    if (not build.data or (not build.completed and
        (datetime.utcnow() - build.last_crawled_time).total_seconds() >=
        self.BUILD_DATA_CACHE_SECONDS)):
      if not WaitUntilDownloadAllowed(master_name):
        raise pipeline.Retry('Too many downloads from %s' % master_name)

      build.data = buildbot.GetBuildData(
          build.master_name, build.builder_name, build.build_number,
          self.HTTP_CLIENT)
      build.last_crawled_time = datetime.utcnow()

    if not build.data:
      return None

    build_info = buildbot.ExtractBuildInfo(
        master_name, builder_name, build_number, build.data)

    if not build.completed:
      build.start_time = build_info.build_start_time
      build.completed = build_info.completed
      build.result = build_info.result
      build.put()

    analysis = BuildAnalysis.GetBuildAnalysis(
        master_name, builder_name, build_number)
    if analysis and not analysis.build_start_time:
      analysis.build_start_time = build_info.build_start_time
      # Persist the updated start time; the original assigned it but never
      # called put(), silently dropping the change.
      analysis.put()

    return build_info

  def _SaveBlameListAndChromiumRevision(self, build_info, builds):
    """Records the chromium revision and blame list of a build into `builds`.

    `builds` maps a build number to a dict like:
    {
      'chromium_revision': 'a_git_hash',
      'blame_list': ['cl1', 'cl2'],
    }
    """
    builds[build_info.build_number]['chromium_revision'] = (
        build_info.chromium_revision)
    builds[build_info.build_number]['blame_list'] = build_info.blame_list

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, master_name, builder_name, build_number):
    """Detects the first failed build for each failed step of the build.

    Returns:
      A dict like:
      {
        'failed': True,
        'builds': {
          555: {
            'chromium_revision': 'a_git_hash',
            'blame_list': ['cl1', 'cl2'],
          },
        },
        'failed_steps': {
          'step_name': {
            'current_failure': 555,
            'first_failure': 553,
            'last_pass': 551,
          },
        },
      }

    Raises:
      pipeline.Retry: if build info could not be extracted.
    """
    build_info = self.ExtractBuildInfo(master_name, builder_name, build_number)

    if not build_info:
      raise pipeline.Retry('Failed to extract build info.')

    failure_info = {
        'failed': True,
    }

    if build_info.result == buildbot.SUCCESS or not build_info.failed_steps:
      failure_info['failed'] = False
      return failure_info

    # Maps a build number to its chromium revision and blame list.
    builds = collections.defaultdict(dict)
    self._SaveBlameListAndChromiumRevision(build_info, builds)

    # Maps each failed step name to the build numbers of its current
    # failure, first known failure, and last known pass.
    failed_steps = collections.defaultdict(dict)
    for step_name in build_info.failed_steps:
      failed_steps[step_name]['current_failure'] = build_info.build_number
      failed_steps[step_name]['first_failure'] = build_info.build_number

    # Look back for the first known failure, checking at most
    # MAX_BUILDS_TO_CHECK preceding builds.
    for i in range(self.MAX_BUILDS_TO_CHECK):
      build_info = self.ExtractBuildInfo(
          master_name, builder_name, build_number - i - 1)

      if not build_info:
        # Failed to extract the build information, bail out.
        break

      self._SaveBlameListAndChromiumRevision(build_info, builds)

      if build_info.result == buildbot.SUCCESS:
        for step_name in failed_steps:
          if 'last_pass' not in failed_steps[step_name]:
            failed_steps[step_name]['last_pass'] = build_info.build_number

        # All steps passed, so stop looking back.
        break
      else:
        # If a step is not run due to some bot exception, we are not sure
        # whether the step could pass or not. So we only check failed/passed
        # steps here.
        for step_name in build_info.failed_steps:
          if step_name in failed_steps:
            failed_steps[step_name]['first_failure'] = build_info.build_number

        for step_name in failed_steps:
          if step_name in build_info.passed_steps:
            failed_steps[step_name]['last_pass'] = build_info.build_number

    failure_info['builds'] = builds
    failure_info['failed_steps'] = failed_steps
    return failure_info
|
qyearsley
2014/12/29 19:44:53
This function is very long, and would probably be
stgao
2015/01/03 01:31:47
Done. Hope the code is easier to understand.
|
| + |
| + |
class BuildFailurePipeline(BasePipeline):
  """Root pipeline to analyze the failure of a build."""

  def __init__(self, master_name, builder_name, build_number):
    super(BuildFailurePipeline, self).__init__(
        master_name, builder_name, build_number)
    # Keep the identifiers on the instance so finalized() can locate the
    # analysis record even when pipeline arguments are not at hand there.
    self.master_name = master_name
    self.builder_name = builder_name
    self.build_number = build_number

  def finalized(self):
    """Marks the analysis as in error if this pipeline was aborted."""
    if self.was_aborted:
      analysis = BuildAnalysis.GetBuildAnalysis(
          self.master_name, self.builder_name, self.build_number)
      # Guard against a missing record: the original crashed with an
      # AttributeError if the analysis was never created/saved.
      if analysis:
        analysis.status = BuildAnalysisStatus.ERROR
        analysis.put()

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, master_name, builder_name, build_number):
    yield DetectFirstFailure(master_name, builder_name, build_number)
@ndb.transactional
def NeedANewAnalysis(master_name, builder_name, build_number, force):
  """Checks status of analysis for the build and decides if a new one is needed.

  Runs transactionally so that concurrent requests for the same build do not
  both schedule an analysis.

  Args:
    master_name: Name of the buildbot master.
    builder_name: Name of the builder.
    build_number: Number of the build.
    force: If True, reset any existing analysis and run a new one.

  Returns:
    (analysis, need_analysis)
    analysis (BuildAnalysis): the analysis of the given build.
    need_analysis (bool): True if an analysis is needed, otherwise False.
  """
  analysis = BuildAnalysis.GetBuildAnalysis(
      master_name, builder_name, build_number)

  if not analysis:
    # No record yet: create one in PENDING state and schedule an analysis.
    analysis = BuildAnalysis.CreateBuildAnalysis(
        master_name, builder_name, build_number)
    analysis.status = BuildAnalysisStatus.PENDING
    analysis.put()
    return analysis, True
  elif force:
    # TODO: avoid concurrent analysis.
    analysis.Reset()
    analysis.put()
    return analysis, True
  else:
    # TODO: support following cases
    # 1. Automatically retry if last analysis failed with errors.
    # 2. Start another analysis if the build cycle wasn't completed in last
    #    analysis request.
    # 3. Analysis is not complete and no update in the last 5 minutes.
    return analysis, False
| def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force, |
| queue_name): |
| """Schedule an analysis if needed and return the build.""" |
| - build, need_new_analysis = NeedANewAnalysis( |
| + analysis, need_new_analysis = NeedANewAnalysis( |
| master_name, builder_name, build_number, force) |
| if need_new_analysis: |
| @@ -85,4 +262,4 @@ def ScheduleAnalysisIfNeeded(master_name, builder_name, build_number, force, |
| else: # pragma: no cover |
| logging.info('Analysis was already triggered or the result is recent.') |
| - return build |
| + return analysis |