Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(366)

Unified Diff: appengine/findit/waterfall/monitor_try_job_pipeline.py

Issue 1921493002: [Findit] Adding improved error detection to MonitorTryJobPipeline (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/findit/waterfall/monitor_try_job_pipeline.py
diff --git a/appengine/findit/waterfall/monitor_try_job_pipeline.py b/appengine/findit/waterfall/monitor_try_job_pipeline.py
index ffda4499036eea99a5335b5d0e8e7a6d3a8f526c..6d36303dd32d6506f979cc755ea50e3d7d5300a9 100644
--- a/appengine/findit/waterfall/monitor_try_job_pipeline.py
+++ b/appengine/findit/waterfall/monitor_try_job_pipeline.py
@@ -3,6 +3,8 @@
# found in the LICENSE file.
from datetime import datetime
+import json
+import re
import time
from common.pipeline_wrapper import BasePipeline
@@ -10,6 +12,7 @@ from common.pipeline_wrapper import pipeline
from common.waterfall import buildbucket_client
from common.waterfall.buildbucket_client import BuildbucketBuild
from model import analysis_status
+from model import try_job_error
from model.wf_try_job import WfTryJob
from model.wf_try_job_data import WfTryJobData
from waterfall import waterfall_config
@@ -23,7 +26,7 @@ class MonitorTryJobPipeline(BasePipeline):
which type of build failure we are running try job for.
"""
- TIMEOUT = 'TIMEOUT'
+ UNKNOWN = 'UNKNOWN'
@staticmethod
def _MicrosecondsToDatetime(microseconds):
@@ -33,27 +36,80 @@ class MonitorTryJobPipeline(BasePipeline):
return None
@staticmethod
- def _GetError(buildbucket_error, timed_out):
- # TODO(lijeffrey): Currently only timeouts (Findit abandoned monitoring the
- # try job after waiting too long for it to complete) and errors reported
- # directly in the buildbucket_client request are captured. Several other
- # failures can be derived from the response in the build too which should
- # be determined here.
+ def _GetError(buildbucket_response, buildbucket_error, timed_out):
chanli 2016/04/25 20:51:28 Will this handle cases like: https://build.chromiu
lijeffrey 2016/04/25 22:54:16 This try job was triggered outside of Findit, but
+ """Determines whether or not a try job error occurred.
+
+ Args:
+ buildbucket_response: A dict of the json response from buildbucket.
+ buildbucket_error: A BuildBucketError object returned from the call to
+ buildbucket_client.GetTryJobs().
+ timed_out: A bool, True if Findit abandoned monitoring the try job.
+
+ Returns:
+ A tuple containing an error dict and a number representing an error code,
+ or (None, None) if no error was determined to have occurred.
+ """
+ error_dict = None
+ error_code = None
+
if buildbucket_error:
- return {
+ error_dict = {
'message': buildbucket_error.message,
'reason': buildbucket_error.reason
}
-
- if timed_out:
- return {
+ error_code = try_job_error.BUILDBUCKET_ERROR
+ elif timed_out:
+ error_dict = {
'message': 'Try job monitoring was abandoned.',
- 'reason': MonitorTryJobPipeline.TIMEOUT
+ 'reason': 'Timeout after %s hours' % (
+ waterfall_config.GetTryJobSettings().get('job_timeout_hours'))
}
+ error_code = try_job_error.TIMEOUT
+ elif buildbucket_response:
+ # If there is no explicit timeout or reason specified, check the last
+ # build response for errors.
+ result_details_json = json.loads(
+ buildbucket_response.get('result_details_json', '{}')) or {}
+
+ # Check result_details_json for any obvious errors.
+ error = result_details_json.get('error', {})
+ if error:
+ message = error.get('message')
+ if message:
+ error_dict = {
+ 'message': 'Try job could not be triggered.',
+ 'reason': message
+ }
chanli 2016/04/25 20:51:28 Will it be better if error_dict = { 'message'
lijeffrey 2016/04/25 22:54:16 Reason should be the root cause, not the observed
+ trybot_not_found_pattern = re.compile(r'Builder [^\s-]+ not found')
+ if trybot_not_found_pattern.match(message):
+ error_code = try_job_error.TRYBOT_NOT_FOUND
+ else:
+ error_code = try_job_error.UNKNOWN
+ else:
+ error_dict = {
+ 'message': 'Try job error was detected.',
+ 'reason': MonitorTryJobPipeline.UNKNOWN
+ }
+ error_code = try_job_error.UNKNOWN
+
+ # Check the report to see if anything went wrong.
+ report = result_details_json.get('report')
+ if report:
+ if ('infra_failed' in report.get('result', {}).itervalues() or
+ report.get('metadata', {}).get('infra_failure')):
+ # Check for any infra issues caught by the recipe.
+ error_dict = {
+ 'message': 'Try job encountered an infra issue during execution.',
+ 'reason': MonitorTryJobPipeline.UNKNOWN
+ }
+ error_code = try_job_error.INFRA_FAILURE
+
+ return error_dict, error_code
@staticmethod
def _UpdateTryJobMetadata(try_job_data, start_time, buildbucket_build,
buildbucket_error, timed_out):
+ buildbucket_response = {}
if buildbucket_build:
try_job_data.request_time = MonitorTryJobPipeline._MicrosecondsToDatetime(
buildbucket_build.request_time)
@@ -67,11 +123,14 @@ class MonitorTryJobPipeline(BasePipeline):
try_job_data.regression_range_size = buildbucket_build.report.get(
'metadata', {}).get('regression_range_size')
try_job_data.last_buildbucket_response = buildbucket_build.response
+ buildbucket_response = buildbucket_build.response
- error = MonitorTryJobPipeline._GetError(buildbucket_error, timed_out)
+ error_dict, error_code = MonitorTryJobPipeline._GetError(
+ buildbucket_response, buildbucket_error, timed_out)
- if error:
- try_job_data.error = error
+ if error_dict:
+ try_job_data.error = error_dict
+ try_job_data.error_code = error_code
try_job_data.put()
« no previous file with comments | « appengine/findit/model/wf_try_job_data.py ('k') | appengine/findit/waterfall/test/monitor_try_job_pipeline_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698