Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(285)

Side by Side Diff: appengine/findit/waterfall/swarming_util.py

Issue 2547713002: [Findit] Using ts_mon to track swarming/isolated server outages (Closed)
Patch Set: Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import base64 5 import base64
6 from collections import defaultdict 6 from collections import defaultdict
7 import json 7 import json
8 import logging 8 import logging
9 import time 9 import time
10 import urllib 10 import urllib
11 import zlib 11 import zlib
12 12
13 from google.appengine.api.urlfetch_errors import DeadlineExceededError 13 from google.appengine.api.urlfetch_errors import DeadlineExceededError
14 from google.appengine.api.urlfetch_errors import DownloadError 14 from google.appengine.api.urlfetch_errors import DownloadError
15 from google.appengine.api.urlfetch_errors import ConnectionClosedError 15 from google.appengine.api.urlfetch_errors import ConnectionClosedError
16 from google.appengine.ext import ndb 16 from google.appengine.ext import ndb
17 17
18 from common import auth_util 18 from common import auth_util
19 from model.wf_step import WfStep 19 from model.wf_step import WfStep
20 from waterfall import monitoring
20 from waterfall import waterfall_config 21 from waterfall import waterfall_config
21 from waterfall.swarming_task_request import SwarmingTaskRequest 22 from waterfall.swarming_task_request import SwarmingTaskRequest
22 23
23 24
24 # Swarming task states. 25 # Swarming task states.
25 STATES_RUNNING = ('RUNNING', 'PENDING') 26 STATES_RUNNING = ('RUNNING', 'PENDING')
26 STATE_COMPLETED = 'COMPLETED' 27 STATE_COMPLETED = 'COMPLETED'
27 STATES_NOT_RUNNING = ( 28 STATES_NOT_RUNNING = (
28 'BOT_DIED', 'CANCELED', 'COMPLETED', 'EXPIRED', 'TIMED_OUT') 29 'BOT_DIED', 'CANCELED', 'COMPLETED', 'EXPIRED', 'TIMED_OUT')
29 30
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
75 76
76 Params: 77 Params:
77 retry_backoff (int): The base backoff in seconds. 78 retry_backoff (int): The base backoff in seconds.
78 tries (int): Indicates how many tries have been done. 79 tries (int): Indicates how many tries have been done.
79 maximum_retry_interval (int): The upper limit in seconds of how long to wait 80 maximum_retry_interval (int): The upper limit in seconds of how long to wait
80 between retries. 81 between retries.
81 """ 82 """
82 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval) 83 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval)
83 84
84 85
86 def _OnConnectionFailed(url, exception):
87 swarming_settings = waterfall_config.GetSwarmingSettings()
88 swarming_server_host = swarming_settings.get('server_host')
89 isolated_server_host = swarming_settings.get('isolated_server')
90 exception_type_name = type(exception).__name__
stgao 2016/12/01 22:25:15 Why not pass over the exception name from code below…
lijeffrey 2016/12/01 22:54:34 Done.
91
92 if isolated_server_host in url:
stgao 2016/12/01 22:25:15 Extract the host from the url, and make it a field…
lijeffrey 2016/12/01 22:54:34 Done.
93 monitoring.isolated_server_failures.increment({'type': exception_type_name})
94 elif swarming_server_host in url:
95 monitoring.swarming_server_failures.increment({'type': exception_type_name})
96
97
85 def _SendRequestToServer(url, http_client, post_data=None): 98 def _SendRequestToServer(url, http_client, post_data=None):
86 """Sends GET/POST request to arbitrary url and returns response content. 99 """Sends GET/POST request to arbitrary url and returns response content.
87 100
88 Because the Swarming and Isolated servers that _SendRequestToServer tries to 101 Because the Swarming and Isolated servers that _SendRequestToServer tries to
89 contact are prone to outages, exceptions trying to reach them may occur thus 102 contact are prone to outages, exceptions trying to reach them may occur thus
90 this method should retry. We want to monitor and document these occurrences 103 this method should retry. We want to monitor and document these occurrences
91 even if the request eventually succeeds after retrying, with the last error 104 even if the request eventually succeeds after retrying, with the last error
92 encountered being the one that is reported. 105 encountered being the one that is reported.
93 106
94 Args: 107 Args:
95 url (str): The url to send the request to. 108 url (str): The url to send the request to.
96 http_client (HttpClient): The httpclient object with which to make the 109 http_client (HttpClient): The httpclient object with which to make the
97 server calls. 110 server calls.
98 post_data (dict): Data/params to send with the request, if any. 111 post_data (dict): Data/params to send with the request, if any.
99 swarming_task (WfSwarmingTask, FlakeSwarmingTask): An optional swarming
100 task with which to capture errors.
101 112
102 Returns: 113 Returns:
103 content (dict), error (dict): The content from the server and the last error 114 content (dict), error (dict): The content from the server and the last error
104 encountered trying to retrieve it. 115 encountered trying to retrieve it.
105 """ 116 """
106 headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()} 117 headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()}
107 swarming_settings = waterfall_config.GetSwarmingSettings() 118 swarming_settings = waterfall_config.GetSwarmingSettings()
108 should_retry = swarming_settings.get('should_retry_server') 119 should_retry = swarming_settings.get('should_retry_server')
109 timeout_seconds = ( 120 timeout_seconds = (
110 swarming_settings.get('server_retry_timeout_hours') * 60 * 60) 121 swarming_settings.get('server_retry_timeout_hours') * 60 * 60)
(...skipping 13 matching lines...) Expand all
124 try: 135 try:
125 if post_data: 136 if post_data:
126 status_code, content = http_client.Post(url, post_data, headers=headers) 137 status_code, content = http_client.Post(url, post_data, headers=headers)
127 else: 138 else:
128 status_code, content = http_client.Get(url, headers=headers) 139 status_code, content = http_client.Get(url, headers=headers)
129 except ConnectionClosedError as e: 140 except ConnectionClosedError as e:
130 error = { 141 error = {
131 'code': URLFETCH_CONNECTION_CLOSED_ERROR, 142 'code': URLFETCH_CONNECTION_CLOSED_ERROR,
132 'message': e.message 143 'message': e.message
133 } 144 }
145 _OnConnectionFailed(url, e)
134 except DeadlineExceededError as e: 146 except DeadlineExceededError as e:
135 error = { 147 error = {
136 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR, 148 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
137 'message': e.message 149 'message': e.message
138 } 150 }
151 _OnConnectionFailed(url, e)
139 except DownloadError as e: 152 except DownloadError as e:
140 error = { 153 error = {
141 'code': URLFETCH_DOWNLOAD_ERROR, 154 'code': URLFETCH_DOWNLOAD_ERROR,
142 'message': e.message 155 'message': e.message
143 } 156 }
157 _OnConnectionFailed(url, e)
144 except Exception as e: # pragma: no cover 158 except Exception as e: # pragma: no cover
145 error = { 159 error = {
146 'code': UNKNOWN, 160 'code': UNKNOWN,
147 'message': e.message 161 'message': e.message
148 } 162 }
163 _OnConnectionFailed(url, e)
149 164
150 if error or status_code != 200: 165 if error or status_code != 200:
151 # The retry upon 50x (501 excluded) is automatically handled in the 166 # The retry upon 50x (501 excluded) is automatically handled in the
152 # underlying http_client. 167 # underlying http_client.
153 # By default, it retries 5 times with exponential backoff. 168 # By default, it retries 5 times with exponential backoff.
154 error = error or { 169 error = error or {
155 'code': EXCEEDED_MAX_RETRIES_ERROR, 170 'code': EXCEEDED_MAX_RETRIES_ERROR,
156 'message': 'Max retries exceeded trying to reach %s' % url 171 'message': 'Max retries exceeded trying to reach %s' % url
157 } 172 }
158 logging.error(error['message']) 173 logging.error(error['message'])
(...skipping 373 matching lines...) Expand 10 before | Expand all | Expand 10 after
532 for isolated_data in list_isolated_data: 547 for isolated_data in list_isolated_data:
533 output_json, _ = _DownloadTestResults(isolated_data, http_client) 548 output_json, _ = _DownloadTestResults(isolated_data, http_client)
534 if not output_json: 549 if not output_json:
535 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. 550 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults.
536 return None 551 return None
537 shard_results.append(output_json) 552 shard_results.append(output_json)
538 553
539 if len(list_isolated_data) == 1: 554 if len(list_isolated_data) == 1:
540 return shard_results[0] 555 return shard_results[0]
541 return _MergeSwarmingTestShards(shard_results) 556 return _MergeSwarmingTestShards(shard_results)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698