Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 # Copyright 2014 The LUCI Authors. All rights reserved. | 1 # Copyright 2014 The LUCI Authors. All rights reserved. |
| 2 # Use of this source code is governed under the Apache License, Version 2.0 | 2 # Use of this source code is governed under the Apache License, Version 2.0 |
| 3 # that can be found in the LICENSE file. | 3 # that can be found in the LICENSE file. |
| 4 | 4 |
| 5 """High level tasks execution scheduling API. | 5 """High level tasks execution scheduling API. |
| 6 | 6 |
| 7 This is the interface closest to the HTTP handlers. | 7 This is the interface closest to the HTTP handlers. |
| 8 """ | 8 """ |
| 9 | 9 |
| 10 import datetime | 10 import datetime |
| (...skipping 129 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 140 # try failed and the retry is being reaped by the same bot. Deny that, as | 140 # try failed and the retry is being reaped by the same bot. Deny that, as |
| 141 # the bot may be deeply broken and could be in a killing spree. | 141 # the bot may be deeply broken and could be in a killing spree. |
| 142 logging.warning( | 142 logging.warning( |
| 143 '%s can\'t retry its own internal failure task', | 143 '%s can\'t retry its own internal failure task', |
| 144 result_summary.task_id) | 144 result_summary.task_id) |
| 145 return None, None | 145 return None, None |
| 146 to_run.queue_number = None | 146 to_run.queue_number = None |
| 147 run_result = task_result.new_run_result( | 147 run_result = task_result.new_run_result( |
| 148 request, (result_summary.try_number or 0) + 1, bot_id, bot_version, | 148 request, (result_summary.try_number or 0) + 1, bot_id, bot_version, |
| 149 bot_dimensions) | 149 bot_dimensions) |
| 150 # Upon bot reap, both .started_ts and .modified_ts matches. They differ on | |
| 151 # the first ping. | |
| 152 run_result.started_ts = now | |
| 150 run_result.modified_ts = now | 153 run_result.modified_ts = now |
| 151 result_summary.set_from_run_result(run_result, request) | 154 result_summary.set_from_run_result(run_result, request) |
| 152 ndb.put_multi([to_run, run_result, result_summary]) | 155 ndb.put_multi([to_run, run_result, result_summary]) |
| 153 if result_summary.state != orig_summary_state: | 156 if result_summary.state != orig_summary_state: |
| 154 _maybe_pubsub_notify_via_tq(result_summary, request) | 157 _maybe_pubsub_notify_via_tq(result_summary, request) |
| 155 return run_result, secret_bytes | 158 return run_result, secret_bytes |
| 156 | 159 |
| 157 # The bot will reap the next available task in case of failure, no big deal. | 160 # The bot will reap the next available task in case of failure, no big deal. |
| 158 try: | 161 try: |
| 159 run_result, secret_bytes = datastore_utils.transaction(run, retries=0) | 162 run_result, secret_bytes = datastore_utils.transaction(run, retries=0) |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 192 run_result, result_summary, to_run = ndb.get_multi( | 195 run_result, result_summary, to_run = ndb.get_multi( |
| 193 (run_result_key, result_summary_key, to_run_key)) | 196 (run_result_key, result_summary_key, to_run_key)) |
| 194 if run_result.state != task_result.State.RUNNING: | 197 if run_result.state != task_result.State.RUNNING: |
| 195 # It was updated already or not updating last. Likely DB index was stale. | 198 # It was updated already or not updating last. Likely DB index was stale. |
| 196 return None, run_result.bot_id | 199 return None, run_result.bot_id |
| 197 if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE: | 200 if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE: |
| 198 # The query index IS stale. | 201 # The query index IS stale. |
| 199 return None, run_result.bot_id | 202 return None, run_result.bot_id |
| 200 | 203 |
| 201 run_result.signal_server_version(server_version) | 204 run_result.signal_server_version(server_version) |
| 205 old_modified = run_result.modified_ts | |
| 202 run_result.modified_ts = now | 206 run_result.modified_ts = now |
| 203 | 207 |
| 204 orig_summary_state = result_summary.state | 208 orig_summary_state = result_summary.state |
| 205 if result_summary.try_number != run_result.try_number: | 209 if result_summary.try_number != run_result.try_number: |
| 206 # Not updating correct run_result, cancel it without touching | 210 # Not updating correct run_result, cancel it without touching |
| 207 # result_summary. | 211 # result_summary. |
| 208 to_put = (run_result,) | 212 to_put = (run_result,) |
| 209 run_result.state = task_result.State.BOT_DIED | 213 run_result.state = task_result.State.BOT_DIED |
| 210 run_result.internal_failure = True | 214 run_result.internal_failure = True |
| 211 run_result.abandoned_ts = now | 215 run_result.abandoned_ts = now |
| 212 task_is_retried = None | 216 task_is_retried = None |
| 213 elif (result_summary.try_number == 1 and now < request.expiration_ts and | 217 elif (result_summary.try_number == 1 and now < request.expiration_ts and |
| 214 request.properties.idempotent): | 218 (request.properties.idempotent or |
| 215 # Retry it. | 219 run_result.started_ts == old_modified)): |
| 220 # Retry it. It fits: | |
| 221 # - first try | |
| 222 # - not yet expired | |
| 223 # - One of: | |
| 224 # - idempotent | |
| 225 # - task hadn't got any ping at all from task_runner.run_command() | |
|
Vadim Sh.
2017/06/08 22:06:01
What happens if the ping from first bot then comes
M-A Ruel
2017/06/12 17:16:21
Added test_bot_poll_http_500_but_bot_reapears_afte
| |
| 216 to_put = (run_result, result_summary, to_run) | 226 to_put = (run_result, result_summary, to_run) |
| 217 to_run.queue_number = task_to_run.gen_queue_number(request) | 227 to_run.queue_number = task_to_run.gen_queue_number(request) |
| 218 run_result.state = task_result.State.BOT_DIED | 228 run_result.state = task_result.State.BOT_DIED |
| 219 run_result.internal_failure = True | 229 run_result.internal_failure = True |
| 220 run_result.abandoned_ts = now | 230 run_result.abandoned_ts = now |
| 221 # Do not sync data from run_result to result_summary, since the task is | 231 # Do not sync data from run_result to result_summary, since the task is |
| 222 # being retried. | 232 # being retried. |
| 223 result_summary.reset_to_pending() | 233 result_summary.reset_to_pending() |
| 224 result_summary.modified_ts = now | 234 result_summary.modified_ts = now |
| 225 task_is_retried = True | 235 task_is_retried = True |
| (...skipping 710 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 936 ## Task queue tasks. | 946 ## Task queue tasks. |
| 937 | 947 |
| 938 | 948 |
| 939 def task_handle_pubsub_task(payload): | 949 def task_handle_pubsub_task(payload): |
| 940 """Handles task enqueued by _maybe_pubsub_notify_via_tq.""" | 950 """Handles task enqueued by _maybe_pubsub_notify_via_tq.""" |
| 941 # Do not catch errors to trigger task queue task retry. Errors should not | 951 # Do not catch errors to trigger task queue task retry. Errors should not |
| 942 # happen in normal case. | 952 # happen in normal case. |
| 943 _pubsub_notify( | 953 _pubsub_notify( |
| 944 payload['task_id'], payload['topic'], | 954 payload['task_id'], payload['topic'], |
| 945 payload['auth_token'], payload['userdata']) | 955 payload['auth_token'], payload['userdata']) |
| OLD | NEW |