Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import base64 | 5 import base64 |
| 6 from collections import defaultdict | 6 from collections import defaultdict |
| 7 import json | 7 import json |
| 8 import logging | 8 import logging |
| 9 import time | |
| 9 import urllib | 10 import urllib |
| 10 import zlib | 11 import zlib |
| 11 | 12 |
| 12 from google.appengine.api.urlfetch_errors import DeadlineExceededError | 13 from google.appengine.api.urlfetch_errors import DeadlineExceededError |
| 13 from google.appengine.api.urlfetch_errors import DownloadError | 14 from google.appengine.api.urlfetch_errors import DownloadError |
| 14 from google.appengine.api.urlfetch_errors import ConnectionClosedError | 15 from google.appengine.api.urlfetch_errors import ConnectionClosedError |
| 15 from google.appengine.ext import ndb | 16 from google.appengine.ext import ndb |
| 16 | 17 |
| 17 from common import auth_util | 18 from common import auth_util |
| 18 from model.wf_step import WfStep | 19 from model.wf_step import WfStep |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 58 TASK_FAILED = 2 | 59 TASK_FAILED = 2 |
| 59 | 60 |
| 60 # Swarming task exit code descriptions. | 61 # Swarming task exit code descriptions. |
| 61 EXIT_CODE_DESCRIPTIONS = { | 62 EXIT_CODE_DESCRIPTIONS = { |
| 62 ALL_TESTS_PASSED: 'All tests passed', | 63 ALL_TESTS_PASSED: 'All tests passed', |
| 63 SOME_TESTS_FAILED: 'Some tests failed', | 64 SOME_TESTS_FAILED: 'Some tests failed', |
| 64 TASK_FAILED: 'Swarming task failed', | 65 TASK_FAILED: 'Swarming task failed', |
| 65 } | 66 } |
| 66 | 67 |
| 67 | 68 |
| 69 def _GetBackoffSeconds(retry_backoff, tries, maximum_retry_interval): | |
| 70 """Returns how many seconds to wait before next retry. | |
| 71 | |
| 72 Params: | |
| 73 retry_backoff (int): The base backoff in seconds. | |
| 74 tries (int): Indicates how many tries have been done. | |
| 75 maximum_retry_interval (int): The upper limit in seconds of how long to wait | |
| 76 between retries. | |
| 77 """ | |
| 78 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval) | |
| 79 | |
| 80 | |
def _SendRequestToServer(url, http_client, post_data=None):
  """Sends GET/POST request to arbitrary url and returns response content.

  Because the Swarming and Isolated servers that _SendRequestToServer tries to
  contact are prone to outages, exceptions trying to reach them may occur thus
  this method should retry. We want to monitor and document these occurrences
  even if the request eventually succeeds after retrying, with the last error
  encountered being the one that is reported.

  Args:
    url (str): The url to send the request to.
    http_client (HttpClient): The httpclient object with which to make the
      server calls.
    post_data (dict): Data/params to send with the request, if any.

  Returns:
    content (dict), error (dict): The content from the server and the last error
      encountered trying to retrieve it. On success, content is returned along
      with the last error (if any) hit while retrying; on final failure,
      content is None.
  """
  headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()}
  swarming_settings = waterfall_config.GetSwarmingSettings()
  should_retry = swarming_settings.get('should_retry_server')
  timeout_seconds = swarming_settings.get('server_retry_timeout') * 60 * 60
  maximum_retry_interval = swarming_settings.get(
      'maximum_server_contact_retry_interval_seconds')
  deadline = time.time() + timeout_seconds
  retry_backoff = 60
  tries = 1
  error = None

  if post_data:
    # Serialize the payload exactly once, outside the retry loop. Calling
    # json.dumps on the already-serialized string during a retry would wrap it
    # in quotes and corrupt the request body.
    post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':'))
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    headers['Content-Length'] = len(post_data)

  while True:
    # Track this attempt's outcome separately from the last error recorded so
    # that a successful retry is not mistaken for a failure just because an
    # earlier attempt errored.
    attempt_error = None
    status_code = None
    try:
      if post_data:
        status_code, content = http_client.Post(url, post_data, headers=headers)
      else:
        status_code, content = http_client.Get(url, headers=headers)
    except ConnectionClosedError as e:
      attempt_error = {
          'code': URLFETCH_CONNECTION_CLOSED_ERROR,
          'message': e.message
      }
    except DeadlineExceededError as e:
      attempt_error = {
          'code': URLFETCH_DEADLINE_EXCEEDED_ERROR,
          'message': e.message
      }
    except DownloadError as e:
      attempt_error = {
          'code': URLFETCH_DOWNLOAD_ERROR,
          'message': e.message
      }
    except Exception as e:  # pragma: no cover
      attempt_error = {
          'code': UNKNOWN,
          'message': e.message
      }

    if attempt_error is None and status_code == 200:
      # Even if the call is successful, still return the last error encountered.
      return content, error

    # The retry upon 50x (501 excluded) is automatically handled in the
    # underlying http_client.
    # By default, it retries 5 times with exponential backoff.
    error = attempt_error or {
        'code': EXCEEDED_MAX_RETRIES_ERROR,
        'message': 'Max retries exceeded trying to reach %s' % url
    }
    logging.error(error['message'])

    if should_retry and time.time() < deadline:  # pragma: no cover
      # Wait, then retry if applicable.
      wait_time = _GetBackoffSeconds(
          retry_backoff, tries, maximum_retry_interval)
      logging.info('Retrying connection to %s in %d seconds', url, wait_time)
      time.sleep(wait_time)
      tries += 1
    else:
      if should_retry:
        # Indicate in the error that the retry timeout was reached.
        error['retry_timeout'] = True
      break

  logging.error('Failed to get an adequate response from %s. No data could be '
                'retrieved', url)
  return None, error
| 117 | 172 |
| 118 | 173 |
| 119 def GetSwarmingTaskRequest(task_id, http_client): | 174 def GetSwarmingTaskRequest(task_id, http_client): |
| 120 """Returns an instance of SwarmingTaskRequest representing the given task.""" | 175 """Returns an instance of SwarmingTaskRequest representing the given task.""" |
| 121 swarming_server_host = waterfall_config.GetSwarmingSettings().get( | 176 swarming_server_host = waterfall_config.GetSwarmingSettings().get( |
| 122 'server_host') | 177 'server_host') |
| 123 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % ( | 178 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % ( |
| 124 swarming_server_host, task_id) | 179 swarming_server_host, task_id) |
| 125 content, error = _SendRequestToServer(url, http_client) | 180 content, error = _SendRequestToServer(url, http_client) |
| 126 | 181 |
| (...skipping 20 matching lines...) Expand all Loading... | |
| 147 request.priority = max(100, swarming_settings.get('default_request_priority')) | 202 request.priority = max(100, swarming_settings.get('default_request_priority')) |
| 148 request.expiration_secs = request_expiration_hours * 60 * 60 | 203 request.expiration_secs = request_expiration_hours * 60 * 60 |
| 149 | 204 |
| 150 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit']) | 205 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit']) |
| 151 | 206 |
| 152 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get( | 207 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get( |
| 153 'server_host') | 208 'server_host') |
| 154 response_data, error = _SendRequestToServer( | 209 response_data, error = _SendRequestToServer( |
| 155 url, http_client, request.Serialize()) | 210 url, http_client, request.Serialize()) |
| 156 | 211 |
| 157 # TODO(lijeffrey): Handle error in calling functions. | |
| 158 if not error: | 212 if not error: |
| 159 return json.loads(response_data)['task_id'], None | 213 return json.loads(response_data)['task_id'], None |
| 160 | 214 |
| 161 return None, error | 215 return None, error |
| 162 | 216 |
| 163 | 217 |
| 164 def ListSwarmingTasksDataByTags( | 218 def ListSwarmingTasksDataByTags( |
| 165 master_name, builder_name, build_number, http_client, | 219 master_name, builder_name, build_number, http_client, |
| 166 additional_tag_filters=None): | 220 additional_tag_filters=None): |
| 167 """Downloads tasks data from swarming server. | 221 """Downloads tasks data from swarming server. |
| (...skipping 303 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 471 for isolated_data in list_isolated_data: | 525 for isolated_data in list_isolated_data: |
| 472 output_json, _ = _DownloadTestResults(isolated_data, http_client) | 526 output_json, _ = _DownloadTestResults(isolated_data, http_client) |
| 473 if not output_json: | 527 if not output_json: |
| 474 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. | 528 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. |
| 475 return None | 529 return None |
| 476 shard_results.append(output_json) | 530 shard_results.append(output_json) |
| 477 | 531 |
| 478 if len(list_isolated_data) == 1: | 532 if len(list_isolated_data) == 1: |
| 479 return shard_results[0] | 533 return shard_results[0] |
| 480 return _MergeSwarmingTestShards(shard_results) | 534 return _MergeSwarmingTestShards(shard_results) |
| OLD | NEW |