Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import base64 | 5 import base64 |
| 6 from collections import defaultdict | 6 from collections import defaultdict |
| 7 import json | 7 import json |
| 8 import logging | 8 import logging |
| 9 import time | 9 import time |
| 10 import urllib | 10 import urllib |
| 11 import zlib | 11 import zlib |
| 12 | 12 |
| 13 from google.appengine.api.urlfetch_errors import DeadlineExceededError | 13 from google.appengine.api.urlfetch_errors import DeadlineExceededError |
| 14 from google.appengine.api.urlfetch_errors import DownloadError | 14 from google.appengine.api.urlfetch_errors import DownloadError |
| 15 from google.appengine.api.urlfetch_errors import ConnectionClosedError | 15 from google.appengine.api.urlfetch_errors import ConnectionClosedError |
| 16 from google.appengine.ext import ndb | 16 from google.appengine.ext import ndb |
| 17 | 17 |
| 18 from common import auth_util | 18 from common import auth_util |
| 19 from model.wf_step import WfStep | 19 from model.wf_step import WfStep |
| 20 from waterfall import monitoring | |
| 20 from waterfall import waterfall_config | 21 from waterfall import waterfall_config |
| 21 from waterfall.swarming_task_request import SwarmingTaskRequest | 22 from waterfall.swarming_task_request import SwarmingTaskRequest |
| 22 | 23 |
| 23 | 24 |
| 24 # Swarming task states. | 25 # Swarming task states. |
| 25 STATES_RUNNING = ('RUNNING', 'PENDING') | 26 STATES_RUNNING = ('RUNNING', 'PENDING') |
| 26 STATE_COMPLETED = 'COMPLETED' | 27 STATE_COMPLETED = 'COMPLETED' |
| 27 STATES_NOT_RUNNING = ( | 28 STATES_NOT_RUNNING = ( |
| 28 'BOT_DIED', 'CANCELED', 'COMPLETED', 'EXPIRED', 'TIMED_OUT') | 29 'BOT_DIED', 'CANCELED', 'COMPLETED', 'EXPIRED', 'TIMED_OUT') |
| 29 | 30 |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 75 | 76 |
| 76 Params: | 77 Params: |
| 77 retry_backoff (int): The base backoff in seconds. | 78 retry_backoff (int): The base backoff in seconds. |
| 78 tries (int): Indicates how many tries have been done. | 79 tries (int): Indicates how many tries have been done. |
| 79 maximum_retry_interval (int): The upper limit in seconds of how long to wait | 80 maximum_retry_interval (int): The upper limit in seconds of how long to wait |
| 80 between retries. | 81 between retries. |
| 81 """ | 82 """ |
| 82 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval) | 83 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval) |
| 83 | 84 |
| 84 | 85 |
| 86 def _OnConnectionFailed(url, exception): | |
| 87 swarming_settings = waterfall_config.GetSwarmingSettings() | |
| 88 swarming_server_host = swarming_settings.get('server_host') | |
| 89 isolated_server_host = swarming_settings.get('isolated_server') | |
| 90 exception_type_name = type(exception).__name__ | |
|
stgao
2016/12/01 22:25:15
Why not pass over the exception name from code below…
lijeffrey
2016/12/01 22:54:34
Done.
| |
| 91 | |
| 92 if isolated_server_host in url: | |
|
stgao
2016/12/01 22:25:15
Extract the host from the url, and make it a field…
lijeffrey
2016/12/01 22:54:34
Done.
| |
| 93 monitoring.isolated_server_failures.increment({'type': exception_type_name}) | |
| 94 elif swarming_server_host in url: | |
| 95 monitoring.swarming_server_failures.increment({'type': exception_type_name}) | |
| 96 | |
| 97 | |
| 85 def _SendRequestToServer(url, http_client, post_data=None): | 98 def _SendRequestToServer(url, http_client, post_data=None): |
| 86 """Sends GET/POST request to arbitrary url and returns response content. | 99 """Sends GET/POST request to arbitrary url and returns response content. |
| 87 | 100 |
| 88 Because the Swarming and Isolated servers that _SendRequestToServer tries to | 101 Because the Swarming and Isolated servers that _SendRequestToServer tries to |
| 89 contact are prone to outages, exceptions trying to reach them may occur thus | 102 contact are prone to outages, exceptions trying to reach them may occur thus |
| 90 this method should retry. We want to monitor and document these occurrences | 103 this method should retry. We want to monitor and document these occurrences |
| 91 even if the request eventually succeeds after retrying, with the last error | 104 even if the request eventually succeeds after retrying, with the last error |
| 92 encountered being the one that is reported. | 105 encountered being the one that is reported. |
| 93 | 106 |
| 94 Args: | 107 Args: |
| 95 url (str): The url to send the request to. | 108 url (str): The url to send the request to. |
| 96 http_client (HttpClient): The httpclient object with which to make the | 109 http_client (HttpClient): The httpclient object with which to make the |
| 97 server calls. | 110 server calls. |
| 98 post_data (dict): Data/params to send with the request, if any. | 111 post_data (dict): Data/params to send with the request, if any. |
| 99 swarming_task (WfSwarmingTask, FlakeSwarmingTask): An optional swarming | |
| 100 task with which to capture errors. | |
| 101 | 112 |
| 102 Returns: | 113 Returns: |
| 103 content (dict), error (dict): The content from the server and the last error | 114 content (dict), error (dict): The content from the server and the last error |
| 104 encountered trying to retrieve it. | 115 encountered trying to retrieve it. |
| 105 """ | 116 """ |
| 106 headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()} | 117 headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()} |
| 107 swarming_settings = waterfall_config.GetSwarmingSettings() | 118 swarming_settings = waterfall_config.GetSwarmingSettings() |
| 108 should_retry = swarming_settings.get('should_retry_server') | 119 should_retry = swarming_settings.get('should_retry_server') |
| 109 timeout_seconds = ( | 120 timeout_seconds = ( |
| 110 swarming_settings.get('server_retry_timeout_hours') * 60 * 60) | 121 swarming_settings.get('server_retry_timeout_hours') * 60 * 60) |
| (...skipping 13 matching lines...) Expand all Loading... | |
| 124 try: | 135 try: |
| 125 if post_data: | 136 if post_data: |
| 126 status_code, content = http_client.Post(url, post_data, headers=headers) | 137 status_code, content = http_client.Post(url, post_data, headers=headers) |
| 127 else: | 138 else: |
| 128 status_code, content = http_client.Get(url, headers=headers) | 139 status_code, content = http_client.Get(url, headers=headers) |
| 129 except ConnectionClosedError as e: | 140 except ConnectionClosedError as e: |
| 130 error = { | 141 error = { |
| 131 'code': URLFETCH_CONNECTION_CLOSED_ERROR, | 142 'code': URLFETCH_CONNECTION_CLOSED_ERROR, |
| 132 'message': e.message | 143 'message': e.message |
| 133 } | 144 } |
| 145 _OnConnectionFailed(url, e) | |
| 134 except DeadlineExceededError as e: | 146 except DeadlineExceededError as e: |
| 135 error = { | 147 error = { |
| 136 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR, | 148 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR, |
| 137 'message': e.message | 149 'message': e.message |
| 138 } | 150 } |
| 151 _OnConnectionFailed(url, e) | |
| 139 except DownloadError as e: | 152 except DownloadError as e: |
| 140 error = { | 153 error = { |
| 141 'code': URLFETCH_DOWNLOAD_ERROR, | 154 'code': URLFETCH_DOWNLOAD_ERROR, |
| 142 'message': e.message | 155 'message': e.message |
| 143 } | 156 } |
| 157 _OnConnectionFailed(url, e) | |
| 144 except Exception as e: # pragma: no cover | 158 except Exception as e: # pragma: no cover |
| 145 error = { | 159 error = { |
| 146 'code': UNKNOWN, | 160 'code': UNKNOWN, |
| 147 'message': e.message | 161 'message': e.message |
| 148 } | 162 } |
| 163 _OnConnectionFailed(url, e) | |
| 149 | 164 |
| 150 if error or status_code != 200: | 165 if error or status_code != 200: |
| 151 # The retry upon 50x (501 excluded) is automatically handled in the | 166 # The retry upon 50x (501 excluded) is automatically handled in the |
| 152 # underlying http_client. | 167 # underlying http_client. |
| 153 # By default, it retries 5 times with exponential backoff. | 168 # By default, it retries 5 times with exponential backoff. |
| 154 error = error or { | 169 error = error or { |
| 155 'code': EXCEEDED_MAX_RETRIES_ERROR, | 170 'code': EXCEEDED_MAX_RETRIES_ERROR, |
| 156 'message': 'Max retries exceeded trying to reach %s' % url | 171 'message': 'Max retries exceeded trying to reach %s' % url |
| 157 } | 172 } |
| 158 logging.error(error['message']) | 173 logging.error(error['message']) |
| (...skipping 373 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 532 for isolated_data in list_isolated_data: | 547 for isolated_data in list_isolated_data: |
| 533 output_json, _ = _DownloadTestResults(isolated_data, http_client) | 548 output_json, _ = _DownloadTestResults(isolated_data, http_client) |
| 534 if not output_json: | 549 if not output_json: |
| 535 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. | 550 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. |
| 536 return None | 551 return None |
| 537 shard_results.append(output_json) | 552 shard_results.append(output_json) |
| 538 | 553 |
| 539 if len(list_isolated_data) == 1: | 554 if len(list_isolated_data) == 1: |
| 540 return shard_results[0] | 555 return shard_results[0] |
| 541 return _MergeSwarmingTestShards(shard_results) | 556 return _MergeSwarmingTestShards(shard_results) |
| OLD | NEW |