Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2306)

Unified Diff: appengine/findit/waterfall/monitor_try_job_pipeline.py

Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Addressing comments Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/findit/waterfall/monitor_try_job_pipeline.py
diff --git a/appengine/findit/waterfall/monitor_try_job_pipeline.py b/appengine/findit/waterfall/monitor_try_job_pipeline.py
index ffda4499036eea99a5335b5d0e8e7a6d3a8f526c..62c4093c4a89e974ee49f10b8733a23edcbe6c42 100644
--- a/appengine/findit/waterfall/monitor_try_job_pipeline.py
+++ b/appengine/findit/waterfall/monitor_try_job_pipeline.py
@@ -3,11 +3,13 @@
# found in the LICENSE file.
from datetime import datetime
+import json
import time
from common.pipeline_wrapper import BasePipeline
from common.pipeline_wrapper import pipeline
from common.waterfall import buildbucket_client
+from common.waterfall import try_job_error
from common.waterfall.buildbucket_client import BuildbucketBuild
from model import analysis_status
from model.wf_try_job import WfTryJob
@@ -23,7 +25,7 @@ class MonitorTryJobPipeline(BasePipeline):
which type of build failure we are running try job for.
"""
- TIMEOUT = 'TIMEOUT'
+ UNKNOWN = 'UNKNOWN'
@staticmethod
def _MicrosecondsToDatetime(microseconds):
@@ -33,27 +35,81 @@ class MonitorTryJobPipeline(BasePipeline):
return None
@staticmethod
- def _GetError(buildbucket_error, timed_out):
- # TODO(lijeffrey): Currently only timeouts (Findit abandoned monitoring the
- # try job after waiting too long for it to complete) and errors reported
- # directly in the buildbucket_client request are captured. Several other
- # failures can be derrived from the response in the build too which should
- # be determined here.
+ def _GetError(buildbucket_response, buildbucket_error, timed_out):
+ """Determines whether or not a try job error occurred.
+
+ Args:
+ buildbucket_response: A dict of the json response from buildbucket.
+ buildbucket_error: A BuildBucketError object returned from the call to
+ buildbucket_client.GetTryJobs()
+ timed_out: A bool whether or not Findit abandoned monitoring the try job.
+
+ Returns:
+ A tuple containing an error dict and number representing an error code, or
+ (None, None) if no error was determined to have occurred.
+ """
+
if buildbucket_error:
- return {
- 'message': buildbucket_error.message,
- 'reason': buildbucket_error.reason
- }
+ return (
+ {
+ 'message': buildbucket_error.message,
+ 'reason': buildbucket_error.reason
+ },
+ try_job_error.BUILDBUCKET_REQUEST_ERROR)
if timed_out:
- return {
- 'message': 'Try job monitoring was abandoned.',
- 'reason': MonitorTryJobPipeline.TIMEOUT
- }
+ return (
+ {
+ 'message': 'Try job monitoring was abandoned.',
+ 'reason': 'Timeout after %s hours' % (
+ waterfall_config.GetTryJobSettings().get('job_timeout_hours'))
+ },
+ try_job_error.TIMEOUT)
+
+ if buildbucket_response:
+ # If there is no explicit timeout or reason specified, check the last
+ # build response for errors.
+ result_details_json = json.loads(
+ buildbucket_response.get('result_details_json', '{}')) or {}
+
+ # Check result_details_json for any obvious errors.
+ error = result_details_json.get('error', {})
+ if error:
+ return (
+ {
+ 'message': 'Buildbucket reported an error.',
+ 'reason': error.get('message', MonitorTryJobPipeline.UNKNOWN)
+ },
+ try_job_error.CI_REPORTED_ERROR)
+
+ # Check the report to see if anything went wrong.
+ report = result_details_json.get('properties', {}).get('report')
+ if report:
+ if report.get('metadata', {}).get('infra_failure'):
+ # Check for any infra issues caught by the recipe.
+ return (
+ {
+ 'message': ('Try job encountered an infra issue during '
+ 'execution.'),
+ 'reason': MonitorTryJobPipeline.UNKNOWN
+ },
+ try_job_error.INFRA_FAILURE)
+ else:
+ # A report should always be included as part of 'properties'. If it is
+ # missing something else is wrong.
+ return (
+ {
+ 'message': 'No result report was found.',
+ 'reason': MonitorTryJobPipeline.UNKNOWN
+ },
+ try_job_error.UNKNOWN)
+
+ return None, None
@staticmethod
def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build,
buildbucket_error, timed_out):
+ buildbucket_response = {}
if buildbucket_build:
try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime(
buildbucket_build.request_time)
@@ -67,11 +123,14 @@ class MonitorTryJobPipeline(BasePipeline):
try_job_data.regression_range_size = buildbucket_build.report.get(
'metadata', {}).get('regression_range_size')
try_job_data.last_buildbucket_response = buildbucket_build.response
+ buildbucket_response = buildbucket_build.response
- error = MonitorTryJobPipeline._GetError(buildbucket_error, timed_out)
+ error_dict, error_code = MonitorTryJobPipeline._GetError(
+ buildbucket_response, buildbucket_error, timed_out)
- if error:
- try_job_data.error = error
+ if error_dict:
+ try_job_data.error = error_dict
+ try_job_data.error_code = error_code
try_job_data.put()
« no previous file with comments | « appengine/findit/model/wf_try_job_data.py ('k') | appengine/findit/waterfall/test/monitor_try_job_pipeline_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698