appengine/findit/waterfall/monitor_try_job_pipeline.py - Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline

Unified Diff: appengine/findit/waterfall/monitor_try_job_pipeline.py

Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Addressing comments Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « appengine/findit/model/wf_try_job_data.py ('k') | appengine/findit/waterfall/test/monitor_try_job_pipeline_test.py » ('j') | appengine/findit/waterfall/try_job_error.py » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: appengine/findit/waterfall/monitor_try_job_pipeline.py

diff --git a/appengine/findit/waterfall/monitor_try_job_pipeline.py b/appengine/findit/waterfall/monitor_try_job_pipeline.py

index ffda4499036eea99a5335b5d0e8e7a6d3a8f526c..948cce50ceef96370aacce5591c2886c3bf4d9ca 100644

--- a/appengine/findit/waterfall/monitor_try_job_pipeline.py

+++ b/appengine/findit/waterfall/monitor_try_job_pipeline.py

@@ -3,6 +3,7 @@

# found in the LICENSE file.

from datetime import datetime

+import json

import time

from common.pipeline_wrapper import BasePipeline

@@ -12,6 +13,7 @@ from common.waterfall.buildbucket_client import BuildbucketBuild

from model import analysis_status

from model.wf_try_job import WfTryJob

from model.wf_try_job_data import WfTryJobData

+from waterfall import try_job_error

from waterfall import waterfall_config

from waterfall.try_job_type import TryJobType

@@ -23,7 +25,7 @@ class MonitorTryJobPipeline(BasePipeline):

which type of build failure we are running try job for.

"""

- TIMEOUT = 'TIMEOUT'

+ UNKNOWN = 'UNKNOWN'

@staticmethod

def _MicrosecondsToDatetime(microseconds):

@@ -33,27 +35,81 @@ class MonitorTryJobPipeline(BasePipeline):

return None

@staticmethod

- def _GetError(buildbucket_error, timed_out):

- # TODO(lijeffrey): Currently only timeouts (Findit abandoned monitoring the

- # try job after waiting too long for it to complete) and errors reported

- # directly in the buildbucket_client request are captured. Several other

- # failures can be derrived from the response in the build too which should

- # be determined here.

+ def _GetError(buildbucket_response, buildbucket_error, timed_out):

+ """Determines whether or not a try job error occurred.

+ Args:

+ buildbucket_response: A dict of the json response from buildbucket.

+ buildbucket_error: A BuildBucketError object returned from the call to

+ buildbucket_client.GetTryJobs()

+ timed_out: A bool whether or not Findit abandoned monitoring the try job.

+ Returns:

+ A tuple containing an error dict and number representing an error code, or

+ (None, None) if no error was determined to have occurred.

+ """

if buildbucket_error:

- return {

- 'message': buildbucket_error.message,

- 'reason': buildbucket_error.reason

- }

+ return (

+ {

+ 'message': buildbucket_error.message,

+ 'reason': buildbucket_error.reason

+ },

+ try_job_error.BUILDBUCKET_REQUEST_ERROR)

if timed_out:

- return {

- 'message': 'Try job monitoring was abandoned.',

- 'reason': MonitorTryJobPipeline.TIMEOUT

- }

+ return (

+ {

+ 'message': 'Try job monitoring was abandoned.',

+ 'reason': 'Timeout after %s hours' % (

+ waterfall_config.GetTryJobSettings().get('job_timeout_hours'))

+ },

+ try_job_error.TIMEOUT)

+ if buildbucket_response:

+ # If there is no explicit timeout or reason specified, check the last

+ # build response for errors.

+ result_details_json = json.loads(

+ buildbucket_response.get('result_details_json', '{}')) or {}

+ # Check result_details_json for any obvious errors.

+ error = result_details_json.get('error', {})

+ if error:

+ return (

+ {

+ 'message': 'Buildbucket reported an error.',

+ 'reason': error.get('message', MonitorTryJobPipeline.UNKNOWN)

+ },

+ try_job_error.BUILDBUCKET_REPORTED_ERROR)

stgao 2016/04/26 17:33:06 Maybe rename it to CI_REPORTED_ERROR. result_deta

lijeffrey 2016/04/26 19:07:25 Done.

+ # Check the report to see if anything went wrong.

+ report = result_details_json.get('properties', {}).get('report')

+ if report:

+ if report.get('metadata', {}).get('infra_failure'):

+ # Check for any infra issues caught by the recipe.

+ return (

+ {

+ 'message': ('Try job encountered an infra issue during '

+ 'execution.'),

+ 'reason': MonitorTryJobPipeline.UNKNOWN

+ },

+ try_job_error.INFRA_FAILURE)

+ else:

+ # A report should always be included as prt of the properties. If it is

stgao 2016/04/26 17:33:06 typo: prt -> part?

lijeffrey 2016/04/26 19:07:25 Oops, good catch.

+ # missing something else is wrong.

+ return (

+ {

+ 'message': 'No result report was found.',

+ 'reason': MonitorTryJobPipeline.UNKNOWN

+ },

+ try_job_error.UNKNOWN)

+ return None, None

@staticmethod

def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build,

buildbucket_error, timed_out):

+ buildbucket_response = {}

if buildbucket_build:

try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime(

buildbucket_build.request_time)

@@ -67,11 +123,14 @@ class MonitorTryJobPipeline(BasePipeline):

try_job_data.regression_range_size = buildbucket_build.report.get(

'metadata', {}).get('regression_range_size')

try_job_data.last_buildbucket_response = buildbucket_build.response

+ buildbucket_response = buildbucket_build.response

- error = MonitorTryJobPipeline._GetError(buildbucket_error, timed_out)

+ error_dict, error_code = MonitorTryJobPipeline._GetError(

+ buildbucket_response, buildbucket_error, timed_out)

- if error:

- try_job_data.error = error

+ if error_dict:

+ try_job_data.error = error_dict

+ try_job_data.error_code = error_code

try_job_data.put()