appengine/swarming/server/task_scheduler.py - Issue 2914803004: Fix non-idempotent task that /poll reaped but returned HTTP 500.

Side by Side Diff: appengine/swarming/server/task_scheduler.py

Issue 2914803004: Fix non-idempotent task that /poll reaped but returned HTTP 500. (Closed)

Patch Set: Rebase on 2927053002 to reduce delta Created 3 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 # Copyright 2014 The LUCI Authors. All rights reserved.	1 # Copyright 2014 The LUCI Authors. All rights reserved.

2 # Use of this source code is governed under the Apache License, Version 2.0	2 # Use of this source code is governed under the Apache License, Version 2.0

3 # that can be found in the LICENSE file.	3 # that can be found in the LICENSE file.

4	4

5 """High level tasks execution scheduling API.	5 """High level tasks execution scheduling API.

6	6

7 This is the interface closest to the HTTP handlers.	7 This is the interface closest to the HTTP handlers.

8 """	8 """

9	9

10 import datetime	10 import datetime

(...skipping 129 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
140 # try failed and the retry is being reaped by the same bot. Deny that, as	140 # try failed and the retry is being reaped by the same bot. Deny that, as

141 # the bot may be deeply broken and could be in a killing spree.	141 # the bot may be deeply broken and could be in a killing spree.

142 logging.warning(	142 logging.warning(

143 '%s can\'t retry its own internal failure task',	143 '%s can\'t retry its own internal failure task',

144 result_summary.task_id)	144 result_summary.task_id)

145 return None, None	145 return None, None

146 to_run.queue_number = None	146 to_run.queue_number = None

147 run_result = task_result.new_run_result(	147 run_result = task_result.new_run_result(

148 request, (result_summary.try_number or 0) + 1, bot_id, bot_version,	148 request, (result_summary.try_number or 0) + 1, bot_id, bot_version,

149 bot_dimensions)	149 bot_dimensions)

	150 # Upon bot reap, both .started_ts and .modified_ts matches. They differ on

	151 # the first ping.

	152 run_result.started_ts = now

150 run_result.modified_ts = now	153 run_result.modified_ts = now

151 result_summary.set_from_run_result(run_result, request)	154 result_summary.set_from_run_result(run_result, request)

152 ndb.put_multi([to_run, run_result, result_summary])	155 ndb.put_multi([to_run, run_result, result_summary])

153 if result_summary.state != orig_summary_state:	156 if result_summary.state != orig_summary_state:

154 _maybe_pubsub_notify_via_tq(result_summary, request)	157 _maybe_pubsub_notify_via_tq(result_summary, request)

155 return run_result, secret_bytes	158 return run_result, secret_bytes

156	159

157 # The bot will reap the next available task in case of failure, no big deal.	160 # The bot will reap the next available task in case of failure, no big deal.

158 try:	161 try:

159 run_result, secret_bytes = datastore_utils.transaction(run, retries=0)	162 run_result, secret_bytes = datastore_utils.transaction(run, retries=0)

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
192 run_result, result_summary, to_run = ndb.get_multi(	195 run_result, result_summary, to_run = ndb.get_multi(

193 (run_result_key, result_summary_key, to_run_key))	196 (run_result_key, result_summary_key, to_run_key))

194 if run_result.state != task_result.State.RUNNING:	197 if run_result.state != task_result.State.RUNNING:

195 # It was updated already or not updating last. Likely DB index was stale.	198 # It was updated already or not updating last. Likely DB index was stale.

196 return None, run_result.bot_id	199 return None, run_result.bot_id

197 if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:	200 if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:

198 # The query index IS stale.	201 # The query index IS stale.

199 return None, run_result.bot_id	202 return None, run_result.bot_id

200	203

201 run_result.signal_server_version(server_version)	204 run_result.signal_server_version(server_version)

	205 old_modified = run_result.modified_ts

202 run_result.modified_ts = now	206 run_result.modified_ts = now

203	207

204 orig_summary_state = result_summary.state	208 orig_summary_state = result_summary.state

205 if result_summary.try_number != run_result.try_number:	209 if result_summary.try_number != run_result.try_number:

206 # Not updating correct run_result, cancel it without touching	210 # Not updating correct run_result, cancel it without touching

207 # result_summary.	211 # result_summary.

208 to_put = (run_result,)	212 to_put = (run_result,)

209 run_result.state = task_result.State.BOT_DIED	213 run_result.state = task_result.State.BOT_DIED

210 run_result.internal_failure = True	214 run_result.internal_failure = True

211 run_result.abandoned_ts = now	215 run_result.abandoned_ts = now

212 task_is_retried = None	216 task_is_retried = None

213 elif (result_summary.try_number == 1 and now < request.expiration_ts and	217 elif (result_summary.try_number == 1 and now < request.expiration_ts and

214 request.properties.idempotent):	218 (request.properties.idempotent or

215 # Retry it.	219 run_result.started_ts == old_modified)):

	220 # Retry it. It fits:

	221 # - first try

	222 # - not yet expired

	223 # - One of:

	224 # - idempotent

	225 # - task hadn't got any ping at all from task_runner.run_command()
	Vadim Sh. 2017/06/08 22:06:01 What happens if the ping from first bot then comes What happens if the ping from first bot then comes later anyway, when the task is already given to another bot. Can you add a test for this specific case (if it doesn't exist already)? M-A Ruel 2017/06/12 17:16:21 Added test_bot_poll_http_500_but_bot_reapears_afte Show quoted text On 2017/06/08 22:06:01, Vadim Sh. wrote: > What happens if the ping from first bot then comes later anyway, when the task > is already given to another bot. > > Can you add a test for this specific case (if it doesn't exist already)? Added test_bot_poll_http_500_but_bot_reapears_after_BOT_PING_TOLERANCE to clearly test this condition.
216 to_put = (run_result, result_summary, to_run)	226 to_put = (run_result, result_summary, to_run)

217 to_run.queue_number = task_to_run.gen_queue_number(request)	227 to_run.queue_number = task_to_run.gen_queue_number(request)

218 run_result.state = task_result.State.BOT_DIED	228 run_result.state = task_result.State.BOT_DIED

219 run_result.internal_failure = True	229 run_result.internal_failure = True

220 run_result.abandoned_ts = now	230 run_result.abandoned_ts = now

221 # Do not sync data from run_result to result_summary, since the task is	231 # Do not sync data from run_result to result_summary, since the task is

222 # being retried.	232 # being retried.

223 result_summary.reset_to_pending()	233 result_summary.reset_to_pending()

224 result_summary.modified_ts = now	234 result_summary.modified_ts = now

225 task_is_retried = True	235 task_is_retried = True

(...skipping 710 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
936 ## Task queue tasks.	946 ## Task queue tasks.

937	947

938	948

939 def task_handle_pubsub_task(payload):	949 def task_handle_pubsub_task(payload):

940 """Handles task enqueued by _maybe_pubsub_notify_via_tq."""	950 """Handles task enqueued by _maybe_pubsub_notify_via_tq."""

941 # Do not catch errors to trigger task queue task retry. Errors should not	951 # Do not catch errors to trigger task queue task retry. Errors should not

942 # happen in normal case.	952 # happen in normal case.

943 _pubsub_notify(	953 _pubsub_notify(

944 payload['task_id'], payload['topic'],	954 payload['task_id'], payload['topic'],

945 payload['auth_token'], payload['userdata'])	955 payload['auth_token'], payload['userdata'])

OLD	NEW

« no previous file with comments | « appengine/swarming/server/task_result_test.py ('k') | appengine/swarming/server/task_scheduler_test.py » ('j') | no next file with comments »