appengine/findit/waterfall/monitor_try_job_pipeline.py - Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline

Unified Diff: appengine/findit/waterfall/monitor_try_job_pipeline.py

Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: appengine/findit/waterfall/monitor_try_job_pipeline.py

diff --git a/appengine/findit/waterfall/monitor_try_job_pipeline.py b/appengine/findit/waterfall/monitor_try_job_pipeline.py

index ffda4499036eea99a5335b5d0e8e7a6d3a8f526c..6d36303dd32d6506f979cc755ea50e3d7d5300a9 100644

--- a/appengine/findit/waterfall/monitor_try_job_pipeline.py

+++ b/appengine/findit/waterfall/monitor_try_job_pipeline.py

@@ -3,6 +3,8 @@

# found in the LICENSE file.

from datetime import datetime

+import json

+import re

import time

from common.pipeline_wrapper import BasePipeline

@@ -10,6 +12,7 @@ from common.pipeline_wrapper import pipeline

from common.waterfall import buildbucket_client

from common.waterfall.buildbucket_client import BuildbucketBuild

from model import analysis_status

+from model import try_job_error

from model.wf_try_job import WfTryJob

from model.wf_try_job_data import WfTryJobData

from waterfall import waterfall_config

@@ -23,7 +26,7 @@ class MonitorTryJobPipeline(BasePipeline):

which type of build failure we are running try job for.

"""

- TIMEOUT = 'TIMEOUT'

+ UNKNOWN = 'UNKNOWN'

@staticmethod

def _MicrosecondsToDatetime(microseconds):

@@ -33,27 +36,80 @@ class MonitorTryJobPipeline(BasePipeline):

return None

@staticmethod

- def _GetError(buildbucket_error, timed_out):

- # TODO(lijeffrey): Currently only timeouts (Findit abandoned monitoring the

- # try job after waiting too long for it to complete) and errors reported

- # directly in the buildbucket_client request are captured. Several other

- # failures can be derived from the response in the build too which should

- # be determined here.

+ def _GetError(buildbucket_response, buildbucket_error, timed_out):

chanli 2016/04/25 20:51:28 Will this handle cases like: https://build.chromiu

lijeffrey 2016/04/25 22:54:16 This try job was triggered outside of Findit, but

+ """Determines whether or not a try job error occurred.

+ Args:

+ buildbucket_response: A dict of the json response from buildbucket.

+ buildbucket_error: A BuildBucketError object returned from the call to

+ buildbucket_client.GetTryJobs()

+ timed_out: A bool indicating whether Findit abandoned monitoring the try job.

+ Returns:

+ A tuple containing an error dict and a number representing an error code, or

+ (None, None) if no error was determined to have occurred.

+ """

+ error_dict = None

+ error_code = None

if buildbucket_error:

- return {

+ error_dict = {

'message': buildbucket_error.message,

'reason': buildbucket_error.reason

}

- if timed_out:

- return {

+ error_code = try_job_error.BUILDBUCKET_ERROR

+ elif timed_out:

+ error_dict = {

'message': 'Try job monitoring was abandoned.',

- 'reason': MonitorTryJobPipeline.TIMEOUT

+ 'reason': 'Timeout after %s hours' % (

+ waterfall_config.GetTryJobSettings().get('job_timeout_hours'))

}

+ error_code = try_job_error.TIMEOUT

+ elif buildbucket_response:

+ # If there is no explicit timeout or reason specified, check the last

+ # build response for errors.

+ result_details_json = json.loads(

+ buildbucket_response.get('result_details_json', '{}')) or {}

+ # Check result_details_json for any obvious errors.

+ error = result_details_json.get('error', {})

+ if error:

+ message = error.get('message')

+ if message:

+ error_dict = {

+ 'message': 'Try job could not be triggered.',

+ 'reason': message

+ }

chanli 2016/04/25 20:51:28 Will it be better if error_dict = { 'message'

lijeffrey 2016/04/25 22:54:16 Reason should be the root cause, not the observed

+ trybot_not_found_pattern = re.compile(r'Builder [^\s-]+ not found')

+ if trybot_not_found_pattern.match(message):

+ error_code = try_job_error.TRYBOT_NOT_FOUND

+ else:

+ error_code = try_job_error.UNKNOWN

+ else:

+ error_dict = {

+ 'message': 'Try job error was detected.',

+ 'reason': MonitorTryJobPipeline.UNKNOWN

+ }

+ error_code = try_job_error.UNKNOWN

+ # Check the report to see if anything went wrong.

+ report = result_details_json.get('report')

+ if report:

+ if ('infra_failed' in report.get('result', {}).itervalues() or

+ report.get('metadata', {}).get('infra_failure')):

+ # Check for any infra issues caught by the recipe.

+ error_dict = {

+ 'message': 'Try job encountered an infra issue during execution.',

+ 'reason': MonitorTryJobPipeline.UNKNOWN

+ }

+ error_code = try_job_error.INFRA_FAILURE

+ return error_dict, error_code

@staticmethod

def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build,

buildbucket_error, timed_out):

+ buildbucket_response = {}

if buildbucket_build:

try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime(

buildbucket_build.request_time)

@@ -67,11 +123,14 @@ class MonitorTryJobPipeline(BasePipeline):

try_job_data.regression_range_size = buildbucket_build.report.get(

'metadata', {}).get('regression_range_size')

try_job_data.last_buildbucket_response = buildbucket_build.response

+ buildbucket_response = buildbucket_build.response

- error = MonitorTryJobPipeline._GetError(buildbucket_error, timed_out)

+ error_dict, error_code = MonitorTryJobPipeline._GetError(

+ buildbucket_response, buildbucket_error, timed_out)

- if error:

- try_job_data.error = error

+ if error_dict:

+ try_job_data.error = error_dict

+ try_job_data.error_code = error_code

try_job_data.put()

« no previous file with comments | « appengine/findit/model/wf_try_job_data.py ('k') | appengine/findit/waterfall/test/monitor_try_job_pipeline_test.py » ('j') | no next file with comments »