Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(252)

Side by Side Diff: appengine/swarming/server/task_scheduler.py

Issue 2914803004: Fix non-idempotent task that /poll reaped but returned HTTP 500. (Closed)
Patch Set: Rebase on 2927053002 to reduce delta Created 3 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2014 The LUCI Authors. All rights reserved. 1 # Copyright 2014 The LUCI Authors. All rights reserved.
2 # Use of this source code is governed under the Apache License, Version 2.0 2 # Use of this source code is governed under the Apache License, Version 2.0
3 # that can be found in the LICENSE file. 3 # that can be found in the LICENSE file.
4 4
5 """High level tasks execution scheduling API. 5 """High level tasks execution scheduling API.
6 6
7 This is the interface closest to the HTTP handlers. 7 This is the interface closest to the HTTP handlers.
8 """ 8 """
9 9
10 import datetime 10 import datetime
(...skipping 129 matching lines...) Expand 10 before | Expand all | Expand 10 after
140 # try failed and the retry is being reaped by the same bot. Deny that, as 140 # try failed and the retry is being reaped by the same bot. Deny that, as
141 # the bot may be deeply broken and could be in a killing spree. 141 # the bot may be deeply broken and could be in a killing spree.
142 logging.warning( 142 logging.warning(
143 '%s can\'t retry its own internal failure task', 143 '%s can\'t retry its own internal failure task',
144 result_summary.task_id) 144 result_summary.task_id)
145 return None, None 145 return None, None
146 to_run.queue_number = None 146 to_run.queue_number = None
147 run_result = task_result.new_run_result( 147 run_result = task_result.new_run_result(
148 request, (result_summary.try_number or 0) + 1, bot_id, bot_version, 148 request, (result_summary.try_number or 0) + 1, bot_id, bot_version,
149 bot_dimensions) 149 bot_dimensions)
150 # Upon bot reap, both .started_ts and .modified_ts matches. They differ on
151 # the first ping.
152 run_result.started_ts = now
150 run_result.modified_ts = now 153 run_result.modified_ts = now
151 result_summary.set_from_run_result(run_result, request) 154 result_summary.set_from_run_result(run_result, request)
152 ndb.put_multi([to_run, run_result, result_summary]) 155 ndb.put_multi([to_run, run_result, result_summary])
153 if result_summary.state != orig_summary_state: 156 if result_summary.state != orig_summary_state:
154 _maybe_pubsub_notify_via_tq(result_summary, request) 157 _maybe_pubsub_notify_via_tq(result_summary, request)
155 return run_result, secret_bytes 158 return run_result, secret_bytes
156 159
157 # The bot will reap the next available task in case of failure, no big deal. 160 # The bot will reap the next available task in case of failure, no big deal.
158 try: 161 try:
159 run_result, secret_bytes = datastore_utils.transaction(run, retries=0) 162 run_result, secret_bytes = datastore_utils.transaction(run, retries=0)
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
192 run_result, result_summary, to_run = ndb.get_multi( 195 run_result, result_summary, to_run = ndb.get_multi(
193 (run_result_key, result_summary_key, to_run_key)) 196 (run_result_key, result_summary_key, to_run_key))
194 if run_result.state != task_result.State.RUNNING: 197 if run_result.state != task_result.State.RUNNING:
195 # It was updated already or not updating last. Likely DB index was stale. 198 # It was updated already or not updating last. Likely DB index was stale.
196 return None, run_result.bot_id 199 return None, run_result.bot_id
197 if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE: 200 if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:
198 # The query index IS stale. 201 # The query index IS stale.
199 return None, run_result.bot_id 202 return None, run_result.bot_id
200 203
201 run_result.signal_server_version(server_version) 204 run_result.signal_server_version(server_version)
205 old_modified = run_result.modified_ts
202 run_result.modified_ts = now 206 run_result.modified_ts = now
203 207
204 orig_summary_state = result_summary.state 208 orig_summary_state = result_summary.state
205 if result_summary.try_number != run_result.try_number: 209 if result_summary.try_number != run_result.try_number:
206 # Not updating correct run_result, cancel it without touching 210 # Not updating correct run_result, cancel it without touching
207 # result_summary. 211 # result_summary.
208 to_put = (run_result,) 212 to_put = (run_result,)
209 run_result.state = task_result.State.BOT_DIED 213 run_result.state = task_result.State.BOT_DIED
210 run_result.internal_failure = True 214 run_result.internal_failure = True
211 run_result.abandoned_ts = now 215 run_result.abandoned_ts = now
212 task_is_retried = None 216 task_is_retried = None
213 elif (result_summary.try_number == 1 and now < request.expiration_ts and 217 elif (result_summary.try_number == 1 and now < request.expiration_ts and
214 request.properties.idempotent): 218 (request.properties.idempotent or
215 # Retry it. 219 run_result.started_ts == old_modified)):
220 # Retry it. It fits:
221 # - first try
222 # - not yet expired
223 # - One of:
224 # - idempotent
225 # - task hadn't got any ping at all from task_runner.run_command()
Vadim Sh. 2017/06/08 22:06:01 What happens if the ping from first bot then comes
M-A Ruel 2017/06/12 17:16:21 Added test_bot_poll_http_500_but_bot_reapears_afte
216 to_put = (run_result, result_summary, to_run) 226 to_put = (run_result, result_summary, to_run)
217 to_run.queue_number = task_to_run.gen_queue_number(request) 227 to_run.queue_number = task_to_run.gen_queue_number(request)
218 run_result.state = task_result.State.BOT_DIED 228 run_result.state = task_result.State.BOT_DIED
219 run_result.internal_failure = True 229 run_result.internal_failure = True
220 run_result.abandoned_ts = now 230 run_result.abandoned_ts = now
221 # Do not sync data from run_result to result_summary, since the task is 231 # Do not sync data from run_result to result_summary, since the task is
222 # being retried. 232 # being retried.
223 result_summary.reset_to_pending() 233 result_summary.reset_to_pending()
224 result_summary.modified_ts = now 234 result_summary.modified_ts = now
225 task_is_retried = True 235 task_is_retried = True
(...skipping 710 matching lines...) Expand 10 before | Expand all | Expand 10 after
936 ## Task queue tasks. 946 ## Task queue tasks.
937 947
938 948
939 def task_handle_pubsub_task(payload): 949 def task_handle_pubsub_task(payload):
940 """Handles task enqueued by _maybe_pubsub_notify_via_tq.""" 950 """Handles task enqueued by _maybe_pubsub_notify_via_tq."""
941 # Do not catch errors to trigger task queue task retry. Errors should not 951 # Do not catch errors to trigger task queue task retry. Errors should not
942 # happen in normal case. 952 # happen in normal case.
943 _pubsub_notify( 953 _pubsub_notify(
944 payload['task_id'], payload['topic'], 954 payload['task_id'], payload['topic'],
945 payload['auth_token'], payload['userdata']) 955 payload['auth_token'], payload['userdata'])
OLDNEW
« no previous file with comments | « appengine/swarming/server/task_result_test.py ('k') | appengine/swarming/server/task_scheduler_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698