Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import base64 | 5 import base64 |
| 6 from collections import defaultdict | 6 from collections import defaultdict |
| 7 import json | 7 import json |
| 8 import logging | 8 import logging |
| 9 import time | |
| 9 import urllib | 10 import urllib |
| 10 import zlib | 11 import zlib |
| 11 | 12 |
| 12 from google.appengine.api.urlfetch_errors import DeadlineExceededError | 13 from google.appengine.api.urlfetch_errors import DeadlineExceededError |
| 13 from google.appengine.api.urlfetch_errors import DownloadError | 14 from google.appengine.api.urlfetch_errors import DownloadError |
| 14 from google.appengine.api.urlfetch_errors import ConnectionClosedError | 15 from google.appengine.api.urlfetch_errors import ConnectionClosedError |
| 15 from google.appengine.ext import ndb | 16 from google.appengine.ext import ndb |
| 16 | 17 |
| 17 from common import auth_util | 18 from common import auth_util |
| 18 from model.wf_step import WfStep | 19 from model.wf_step import WfStep |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 62 TASK_FAILED = 2 | 63 TASK_FAILED = 2 |
| 63 | 64 |
| 64 # Swarming task exit code descriptions. | 65 # Swarming task exit code descriptions. |
| 65 EXIT_CODE_DESCRIPTIONS = { | 66 EXIT_CODE_DESCRIPTIONS = { |
| 66 ALL_TESTS_PASSED: 'All tests passed', | 67 ALL_TESTS_PASSED: 'All tests passed', |
| 67 SOME_TESTS_FAILED: 'Some tests failed', | 68 SOME_TESTS_FAILED: 'Some tests failed', |
| 68 TASK_FAILED: 'Swarming task failed', | 69 TASK_FAILED: 'Swarming task failed', |
| 69 } | 70 } |
| 70 | 71 |
| 71 | 72 |
| 73 def _GetBackoffSeconds(retry_backoff, tries, maximum_retry_interval): | |
| 74 """Returns how many seconds to wait before next retry. | |
| 75 | |
| 76 Params: | |
| 77 retry_backoff (int): The base backoff in seconds. | |
| 78 tries (int): Indicates how many tries have been done. | |
| 79 maximum_retry_interval (int): The upper limit in seconds of how long to wait | |
| 80 between retries. | |
| 81 """ | |
| 82 return min(retry_backoff * (2 ** (tries - 1)), maximum_retry_interval) | |
| 83 | |
| 84 | |
| 72 def _SendRequestToServer(url, http_client, post_data=None): | 85 def _SendRequestToServer(url, http_client, post_data=None): |
| 73 """Sends GET/POST request to arbitrary url and returns response content.""" | 86 """Sends GET/POST request to arbitrary url and returns response content. |
| 87 | |
| 88 Because the Swarming and Isolated servers that _SendRequestToServer tries to | |
| 89 contact are prone to outages, exceptions trying to reach them may occur thus | |
| 90 this method should retry. We want to monitor and document these occurrences | |
| 91 even if the request eventually succeeds after retrying, with the last error | |
| 92 encountered being the one that is reported. | |
| 93 | |
| 94 Args: | |
| 95 url (str): The url to send the request to. | |
| 96 http_client (HttpClient): The httpclient object with which to make the | |
| 97 server calls. | |
| 98 post_data (dict): Data/params to send with the request, if any. | |
| 99 swarming_task (WfSwarmingTask, FlakeSwarmingTask): An optional swarming | |
|
stgao
2016/12/01 20:33:46
This seems invalid now.
lijeffrey
2016/12/01 20:51:11
Good catch, originally I had designed this to cons
| |
| 100 task with which to capture errors. | |
| 101 | |
| 102 Returns: | |
| 103 content (dict), error (dict): The content from the server and the last error | |
| 104 encountered trying to retrieve it. | |
| 105 """ | |
| 74 headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()} | 106 headers = {'Authorization': 'Bearer ' + auth_util.GetAuthToken()} |
| 107 swarming_settings = waterfall_config.GetSwarmingSettings() | |
| 108 should_retry = swarming_settings.get('should_retry_server') | |
| 109 timeout_seconds = ( | |
| 110 swarming_settings.get('server_retry_timeout_hours') * 60 * 60) | |
| 111 maximum_retry_interval = swarming_settings.get( | |
| 112 'maximum_server_contact_retry_interval_seconds') | |
| 113 deadline = time.time() + timeout_seconds | |
| 114 retry_backoff = 60 | |
| 115 tries = 1 | |
| 75 error = None | 116 error = None |
| 76 | 117 |
| 77 try: | 118 if post_data: |
| 78 if post_data: | 119 post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':')) |
|
stgao
2016/12/01 20:33:46
Why we need to sort?
lijeffrey
2016/12/01 20:51:12
Chan may be more familiar with how post_data needs
| |
| 79 post_data = json.dumps(post_data, sort_keys=True, separators=(',', ':')) | 120 headers['Content-Type'] = 'application/json; charset=UTF-8' |
| 80 headers['Content-Type'] = 'application/json; charset=UTF-8' | 121 headers['Content-Length'] = len(post_data) |
| 81 headers['Content-Length'] = len(post_data) | 122 |
| 82 status_code, content = http_client.Post(url, post_data, headers=headers) | 123 while True: |
| 124 try: | |
| 125 if post_data: | |
| 126 status_code, content = http_client.Post(url, post_data, headers=headers) | |
| 127 else: | |
| 128 status_code, content = http_client.Get(url, headers=headers) | |
| 129 except ConnectionClosedError as e: | |
| 130 error = { | |
| 131 'code': URLFETCH_CONNECTION_CLOSED_ERROR, | |
| 132 'message': e.message | |
| 133 } | |
| 134 except DeadlineExceededError as e: | |
| 135 error = { | |
| 136 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR, | |
| 137 'message': e.message | |
| 138 } | |
| 139 except DownloadError as e: | |
| 140 error = { | |
| 141 'code': URLFETCH_DOWNLOAD_ERROR, | |
| 142 'message': e.message | |
| 143 } | |
| 144 except Exception as e: # pragma: no cover | |
| 145 error = { | |
| 146 'code': UNKNOWN, | |
| 147 'message': e.message | |
| 148 } | |
| 149 | |
| 150 if error or status_code != 200: | |
| 151 # The retry upon 50x (501 excluded) is automatically handled in the | |
| 152 # underlying http_client. | |
| 153 # By default, it retries 5 times with exponential backoff. | |
| 154 error = error or { | |
| 155 'code': EXCEEDED_MAX_RETRIES_ERROR, | |
| 156 'message': 'Max retries exceeded trying to reach %s' % url | |
| 157 } | |
| 158 logging.error(error['message']) | |
| 83 else: | 159 else: |
| 84 status_code, content = http_client.Get(url, headers=headers) | 160 # Even if the call is successful, still return the last error encountered. |
| 85 except ConnectionClosedError as e: | 161 return content, error |
| 86 error = { | |
| 87 'code': URLFETCH_CONNECTION_CLOSED_ERROR, | |
| 88 'message': e.message | |
| 89 } | |
| 90 except DeadlineExceededError as e: | |
| 91 error = { | |
| 92 'code': URLFETCH_DEADLINE_EXCEEDED_ERROR, | |
| 93 'message': e.message | |
| 94 } | |
| 95 except DownloadError as e: | |
| 96 error = { | |
| 97 'code': URLFETCH_DOWNLOAD_ERROR, | |
| 98 'message': e.message | |
| 99 } | |
| 100 except Exception as e: # pragma: no cover | |
| 101 error = { | |
| 102 'code': UNKNOWN, | |
| 103 'message': e.message | |
| 104 } | |
| 105 # Still raise an exception here for cases not encountered before in order | |
| 106 # to see what went wrong in the logs. | |
| 107 raise e | |
| 108 | 162 |
| 109 if error or status_code != 200: | 163 if should_retry and time.time() < deadline: # pragma: no cover |
| 110 # The retry upon 50x (501 excluded) is automatically handled in the | 164 # Wait, then retry if applicable. |
| 111 # underlying http_client. | 165 wait_time = _GetBackoffSeconds( |
| 112 # By default, it retries 5 times with exponential backoff. | 166 retry_backoff, tries, maximum_retry_interval) |
| 113 error = error or { | 167 logging.info('Retrying connection to %s in %d seconds', url, wait_time) |
| 114 'code': EXCEEDED_MAX_RETRIES_ERROR, | 168 time.sleep(wait_time) |
| 115 'message': 'Max retries exceeded trying to reach %s' % url | 169 tries += 1 |
| 116 } | 170 else: |
| 117 logging.error(error['message']) | 171 if should_retry: |
| 118 return None, error | 172 # Indicate in the error that the retry timeout was reached. |
| 173 error['retry_timeout'] = True | |
| 174 break | |
| 119 | 175 |
| 120 return content, None | 176 logging.error('Failed to get an adequate response from %s. No data could be ' |
| 177 'retrieved', url) | |
| 178 return None, error | |
| 121 | 179 |
| 122 | 180 |
| 123 def GetSwarmingTaskRequest(task_id, http_client): | 181 def GetSwarmingTaskRequest(task_id, http_client): |
| 124 """Returns an instance of SwarmingTaskRequest representing the given task.""" | 182 """Returns an instance of SwarmingTaskRequest representing the given task.""" |
| 125 swarming_server_host = waterfall_config.GetSwarmingSettings().get( | 183 swarming_server_host = waterfall_config.GetSwarmingSettings().get( |
| 126 'server_host') | 184 'server_host') |
| 127 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % ( | 185 url = ('https://%s/_ah/api/swarming/v1/task/%s/request') % ( |
| 128 swarming_server_host, task_id) | 186 swarming_server_host, task_id) |
| 129 content, error = _SendRequestToServer(url, http_client) | 187 content, error = _SendRequestToServer(url, http_client) |
| 130 | 188 |
| (...skipping 20 matching lines...) Expand all Loading... | |
| 151 request.priority = max(100, swarming_settings.get('default_request_priority')) | 209 request.priority = max(100, swarming_settings.get('default_request_priority')) |
| 152 request.expiration_secs = request_expiration_hours * 60 * 60 | 210 request.expiration_secs = request_expiration_hours * 60 * 60 |
| 153 | 211 |
| 154 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit']) | 212 request.tags.extend(['findit:1', 'project:Chromium', 'purpose:post-commit']) |
| 155 | 213 |
| 156 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get( | 214 url = 'https://%s/_ah/api/swarming/v1/tasks/new' % swarming_settings.get( |
| 157 'server_host') | 215 'server_host') |
| 158 response_data, error = _SendRequestToServer( | 216 response_data, error = _SendRequestToServer( |
| 159 url, http_client, request.Serialize()) | 217 url, http_client, request.Serialize()) |
| 160 | 218 |
| 161 # TODO(lijeffrey): Handle error in calling functions. | |
| 162 if not error: | 219 if not error: |
| 163 return json.loads(response_data)['task_id'], None | 220 return json.loads(response_data)['task_id'], None |
| 164 | 221 |
| 165 return None, error | 222 return None, error |
| 166 | 223 |
| 167 | 224 |
| 168 def ListSwarmingTasksDataByTags( | 225 def ListSwarmingTasksDataByTags( |
| 169 master_name, builder_name, build_number, http_client, | 226 master_name, builder_name, build_number, http_client, |
| 170 additional_tag_filters=None): | 227 additional_tag_filters=None): |
| 171 """Downloads tasks data from swarming server. | 228 """Downloads tasks data from swarming server. |
| (...skipping 303 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 475 for isolated_data in list_isolated_data: | 532 for isolated_data in list_isolated_data: |
| 476 output_json, _ = _DownloadTestResults(isolated_data, http_client) | 533 output_json, _ = _DownloadTestResults(isolated_data, http_client) |
| 477 if not output_json: | 534 if not output_json: |
| 478 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. | 535 # TODO(lijeffrey): Report/handle error returned from _DownloadTestResults. |
| 479 return None | 536 return None |
| 480 shard_results.append(output_json) | 537 shard_results.append(output_json) |
| 481 | 538 |
| 482 if len(list_isolated_data) == 1: | 539 if len(list_isolated_data) == 1: |
| 483 return shard_results[0] | 540 return shard_results[0] |
| 484 return _MergeSwarmingTestShards(shard_results) | 541 return _MergeSwarmingTestShards(shard_results) |
| OLD | NEW |