Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(69)

Side by Side Diff: appengine/findit/waterfall/swarming_util.py

Issue 2547713002: [Findit] Using ts_mon to track swarming/isolated server outages (Closed)
Patch Set: Addressing comments Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import base64 5 import base64
6 from collections import defaultdict 6 from collections import defaultdict
7 import json 7 import json
8 import logging 8 import logging
9 import time 9 import time
10 import urllib 10 import urllib
11 from urlparse import urlparse
11 import zlib 12 import zlib
12 13
13 from google.appengine.api.urlfetch_errors import DeadlineExceededError 14 from google.appengine.api.urlfetch_errors import DeadlineExceededError
14 from google.appengine.api.urlfetch_errors import DownloadError 15 from google.appengine.api.urlfetch_errors import DownloadError
15 from google.appengine.api.urlfetch_errors import ConnectionClosedError 16 from google.appengine.api.urlfetch_errors import ConnectionClosedError
16 from google.appengine.ext import ndb 17 from google.appengine.ext import ndb
17 18
18 from common import auth_util 19 from common import auth_util
19 from model.wf_step import WfStep 20 from model.wf_step import WfStep
21 from waterfall import monitoring
20 from waterfall import waterfall_config 22 from waterfall import waterfall_config
21 from waterfall.swarming_task_request import SwarmingTaskRequest 23 from waterfall.swarming_task_request import SwarmingTaskRequest
22 24
23 25
24 # Swarming task states. 26 # Swarming task states.
25 STATES_RUNNING = ('RUNNING', 'PENDING') 27 STATES_RUNNING = ('RUNNING', 'PENDING')
26 STATE_COMPLETED = 'COMPLETED' 28 STATE_COMPLETED = 'COMPLETED'
27 STATES_NOT_RUNNING = ( 29 STATES_NOT_RUNNING = (
28 'BOT_DIED', 'CANCELED', 'COMPLETED', 'EXPIRED', 'TIMED_OUT') 30 'BOT_DIED', 'CANCELED', 'COMPLETED', 'EXPIRED', 'TIMED_OUT')
29 31
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
75 77
76 Params: 78 Params:
77 retry_backoff (int): The base backoff in seconds. 79 retry_backoff (int): The base backoff in seconds.
78 tries (int): Indicates how many tries have been done. 80 tries (int): Indicates how many tries have been done.
79 maximum_retry_interval (int): The upper limit in seconds of how long to wait 81 maximum_retry_interval (int): The upper limit in seconds of how long to wait
80 between retries. 82 between retries.
81 """ 83 """
82 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval) 84 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval)
83 85
84 86
87 def _OnConnectionFailed(url, exception_type):
88 host = urlparse(url).hostname
89 assert host
90
91 monitoring.http_errors.increment({'host': host, 'exception': exception_type})
92
93
85 def _SendRequestToServer(url, http_client, post_data=None): 94 def _SendRequestToServer(url, http_client, post_data=None):
86 """Sends GET/POST request to arbitrary url and returns response content. 95 """Sends GET/POST request to arbitrary url and returns response content.
87 96
88 Because the Swarming and Isolated servers that _SendRequestToServer tries to 97 Because the Swarming and Isolated servers that _SendRequestToServer tries to
89 contact are prone to outages, exceptions trying to reach them may occur thus 98 contact are prone to outages, exceptions trying to reach them may occur thus
90 this method should retry. We want to monitor and document these occurrences 99 this method should retry. We want to monitor and document these occurrences
91 even if the request eventually succeeds after retrying, with the last error 100 even if the request eventually succeeds after retrying, with the last error
92 encountered being the one that is reported. 101 encountered being the one that is reported.
93 102
94 Args: 103 Args:
95 url (str): The url to send the request to. 104 url (str): The url to send the request to.
96 http_client (HttpClient): The httpclient object with which to make the 105 http_client (HttpClient): The httpclient object with which to make the
97 server calls. 106 server calls.
98 post_data (dict): Data/params to send with the request, if any. 107 post_data (dict): Data/params to send with the request, if any.
99 swarming_task (WfSwarmingTask, FlakeSwarmingTask): An optional swarming
100 task with which to capture errors.
101 108
102 Returns: 109 Returns:
103 content (dict), error (dict): The content from the server and the last error 110 content (dict), error (dict): The content from the server and the last error
104 encountered trying to retrieve it. 111 encountered trying to retrieve it.
105 """ 112 """
106 headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()} 113 headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()}
107 swarming_settings = waterfall_config.GetSwarmingSettings() 114 swarming_settings = waterfall_config.GetSwarmingSettings()
108 should_retry = swarming_settings.get('should_retry_server') 115 should_retry = swarming_settings.get('should_retry_server')
109 timeout_seconds = ( 116 timeout_seconds = (
110 swarming_settings.get('server_retry_timeout_hours') * 60 * 60) 117 swarming_settings.get('server_retry_timeout_hours') * 60 * 60)
(...skipping 13 matching lines...) Expand all
124 try: 131 try:
125 if post_data: 132 if post_data:
126 status_code, content = http_client.Post(url, post_data, headers=headers) 133 status_code, content = http_client.Post(url, post_data, headers=headers)
127 else: 134 else:
128 status_code, content = http_client.Get(url, headers=headers) 135 status_code, content = http_client.Get(url, headers=headers)
129 except ConnectionClosedError as e: 136 except ConnectionClosedError as e:
130 error = { 137 error = {
131 'code': URLFETCH_CONNECTION_CLOSED_ERROR, 138 'code': URLFETCH_CONNECTION_CLOSED_ERROR,
132 'message': e.message 139 'message': e.message
133 } 140 }
141 _OnConnectionFailed(url, 'ConnectionClosedError')
134 except DeadlineExceededError as e: 142 except DeadlineExceededError as e:
135 error = { 143 error = {
136 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR, 144 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
137 'message': e.message 145 'message': e.message
138 } 146 }
147 _OnConnectionFailed(url, 'DeadlineExceededError')
139 except DownloadError as e: 148 except DownloadError as e:
140 error = { 149 error = {
141 'code': URLFETCH_DOWNLOAD_ERROR, 150 'code': URLFETCH_DOWNLOAD_ERROR,
142 'message': e.message 151 'message': e.message
143 } 152 }
153 _OnConnectionFailed(url, 'DownloadError')
144 except Exception as e: # pragma: no cover 154 except Exception as e: # pragma: no cover
155 logging.error(
156 'An unknown exception occurred that need to be monitored: %s',
157 e.message)
145 error = { 158 error = {
146 'code': UNKNOWN, 159 'code': UNKNOWN,
147 'message': e.message 160 'message': e.message
148 } 161 }
162 _OnConnectionFailed(url, 'Unknown Exception')
149 163
150 if error or status_code != 200: 164 if error or status_code != 200:
151 # The retry upon 50x (501 excluded) is automatically handled in the 165 # The retry upon 50x (501 excluded) is automatically handled in the
152 # underlying http_client. 166 # underlying http_client.
153 # By default, it retries 5 times with exponential backoff. 167 # By default, it retries 5 times with exponential backoff.
154 error = error or { 168 error = error or {
155 'code': EXCEEDED_MAX_RETRIES_ERROR, 169 'code': EXCEEDED_MAX_RETRIES_ERROR,
156 'message': 'Max retries exceeded trying to reach %s' % url 170 'message': 'Max retries exceeded trying to reach %s' % url
157 } 171 }
158 logging.error(error['message']) 172 logging.error(error['message'])
(...skipping 373 matching lines...) Expand 10 before | Expand all | Expand 10 after
532 for isolated_data in list_isolated_data: 546 for isolated_data in list_isolated_data:
533 output_json, _ = _DownloadTestResults(isolated_data, http_client) 547 output_json, _ = _DownloadTestResults(isolated_data, http_client)
534 if not output_json: 548 if not output_json:
535 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. 549 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults.
536 return None 550 return None
537 shard_results.append(output_json) 551 shard_results.append(output_json)
538 552
539 if len(list_isolated_data) == 1: 553 if len(list_isolated_data) == 1:
540 return shard_results[0] 554 return shard_results[0]
541 return _MergeSwarmingTestShards(shard_results) 555 return _MergeSwarmingTestShards(shard_results)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698