| OLD | NEW |
| 1 # Copyright 2014 The LUCI Authors. All rights reserved. | 1 # Copyright 2014 The LUCI Authors. All rights reserved. |
| 2 # Use of this source code is governed under the Apache License, Version 2.0 | 2 # Use of this source code is governed under the Apache License, Version 2.0 |
| 3 # that can be found in the LICENSE file. | 3 # that can be found in the LICENSE file. |
| 4 | 4 |
| 5 """Swarming bot management, e.g. list of known bots and their state. | 5 """Swarming bot management, e.g. list of known bots and their state. |
| 6 | 6 |
| 7 +---------+ | 7 +---------+ |
| 8 |BotRoot | | 8 |BotRoot | |
| 9 |id=bot_id| | 9 |id=bot_id| |
| 10 +---------+ | 10 +---------+ |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 77 # dimensions) or via self-health check. | 77 # dimensions) or via self-health check. |
| 78 # - dimensions['id'] is not exactly one item. | 78 # - dimensions['id'] is not exactly one item. |
| 79 # - invalid HTTP POST request keys. | 79 # - invalid HTTP POST request keys. |
| 80 # - BotSettings.quarantined was set at that moment. | 80 # - BotSettings.quarantined was set at that moment. |
| 81 quarantined = ndb.BooleanProperty(default=False) | 81 quarantined = ndb.BooleanProperty(default=False) |
| 82 | 82 |
| 83 # Affected by event_type == 'request_task', 'task_canceled', 'task_completed', | 83 # Affected by event_type == 'request_task', 'task_canceled', 'task_completed', |
| 84 # 'task_error'. | 84 # 'task_error'. |
| 85 task_id = ndb.StringProperty(indexed=False) | 85 task_id = ndb.StringProperty(indexed=False) |
| 86 | 86 |
| 87 # Machine Provider lease ID, for bots acquired from Machine Provider. |
| 88 lease_id = ndb.StringProperty(indexed=False) |
| 89 |
| 90 # UTC datetime when the bot will be reclaimed by Machine Provider. |
| 91 lease_expiration_ts = ndb.DateTimeProperty(indexed=False) |
| 92 |
| 87 @property | 93 @property |
| 88 def dimensions(self): | 94 def dimensions(self): |
| 89 """Returns a dict representation of self.dimensions_flat.""" | 95 """Returns a dict representation of self.dimensions_flat.""" |
| 90 if self.dimensions_old: | 96 if self.dimensions_old: |
| 91 return self.dimensions_old | 97 return self.dimensions_old |
| 92 out = {} | 98 out = {} |
| 93 for i in self.dimensions_flat: | 99 for i in self.dimensions_flat: |
| 94 k, v = i.split(':', 1) | 100 k, v = i.split(':', 1) |
| 95 out.setdefault(k, []).append(v) | 101 out.setdefault(k, []).append(v) |
| 96 return out | 102 return out |
| (...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 168 | 174 |
| 169 class BotEvent(_BotCommon): | 175 class BotEvent(_BotCommon): |
| 170 """This entity is immutable. | 176 """This entity is immutable. |
| 171 | 177 |
| 172 Parent is BotRoot. Key id is monotonically decreasing with | 178 Parent is BotRoot. Key id is monotonically decreasing with |
| 173 datastore_utils.store_new_version(). | 179 datastore_utils.store_new_version(). |
| 174 | 180 |
| 175 This entity is created on each bot state transition. | 181 This entity is created on each bot state transition. |
| 176 """ | 182 """ |
| 177 ALLOWED_EVENTS = { | 183 ALLOWED_EVENTS = { |
| 178 'bot_connected', 'bot_error', 'bot_log', 'bot_rebooting', 'bot_shutdown', | 184 'bot_connected', 'bot_error', 'bot_leased', 'bot_log', 'bot_rebooting', |
| 179 'bot_terminate', | 185 'bot_shutdown', 'bot_terminate', |
| 180 'request_restart', 'request_update', 'request_sleep', 'request_task', | 186 'request_restart', 'request_update', 'request_sleep', 'request_task', |
| 181 'task_completed', 'task_error', 'task_update', | 187 'task_completed', 'task_error', 'task_update', |
| 182 } | 188 } |
| 183 # Dimensions are used for task selection. They are encoded as a list of | 189 # Dimensions are used for task selection. They are encoded as a list of |
| 184 # key:value. Keep in mind that the same key can be used multiple times. The | 190 # key:value. Keep in mind that the same key can be used multiple times. The |
| 185 # list must be sorted. | 191 # list must be sorted. |
| 186 # It is NOT indexed because this is not needed for events. | 192 # It is NOT indexed because this is not needed for events. |
| 187 dimensions_flat = ndb.StringProperty(repeated=True, indexed=False) | 193 dimensions_flat = ndb.StringProperty(repeated=True, indexed=False) |
| 188 | 194 |
| 189 # Common properties for all events (which includes everything in _BotCommon). | 195 # Common properties for all events (which includes everything in _BotCommon). |
| (...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 305 - dimensions: Bot's dimensions as self-reported. If not provided, keep | 311 - dimensions: Bot's dimensions as self-reported. If not provided, keep |
| 306 previous value. | 312 previous value. |
| 307 - state: ephemeral state of the bot. It is expected to change constantly. If | 313 - state: ephemeral state of the bot. It is expected to change constantly. If |
| 308 not provided, keep previous value. | 314 not provided, keep previous value. |
| 309 - version: swarming_bot.zip version as self-reported. Used to spot if a bot | 315 - version: swarming_bot.zip version as self-reported. Used to spot if a bot |
| 310 failed to update promptly. If not provided, keep previous value. | 316 failed to update promptly. If not provided, keep previous value. |
| 311 - quarantined: bool to determine if the bot was declared quarantined. | 317 - quarantined: bool to determine if the bot was declared quarantined. |
| 312 - task_id: packed task id if relevant. Set to '' to zap the stored value. | 318 - task_id: packed task id if relevant. Set to '' to zap the stored value. |
| 313 - task_name: task name if relevant. Zapped when task_id is zapped. | 319 - task_name: task name if relevant. Zapped when task_id is zapped. |
| 314 - kwargs: optional values to add to BotEvent relevant to event_type. | 320 - kwargs: optional values to add to BotEvent relevant to event_type. |
| 321 - lease_id (in kwargs): ID assigned by Machine Provider for this bot. |
| 322 - lease_expiration_ts (in kwargs): UTC datetime when the Machine |
| 323 Provider lease expires. |
| 315 """ | 324 """ |
| 316 if not bot_id: | 325 if not bot_id: |
| 317 return | 326 return |
| 318 | 327 |
| 319 # Retrieve the previous BotInfo and update it. | 328 # Retrieve the previous BotInfo and update it. |
| 320 info_key = get_info_key(bot_id) | 329 info_key = get_info_key(bot_id) |
| 321 bot_info = info_key.get() or BotInfo(key=info_key) | 330 bot_info = info_key.get() or BotInfo(key=info_key) |
| 322 bot_info.last_seen_ts = utils.utcnow() | 331 bot_info.last_seen_ts = utils.utcnow() |
| 323 bot_info.external_ip = external_ip | 332 bot_info.external_ip = external_ip |
| 324 bot_info.authenticated_as = authenticated_as | 333 bot_info.authenticated_as = authenticated_as |
| 325 if dimensions: | 334 if dimensions: |
| 326 bot_info.dimensions_flat = dimensions_to_flat(dimensions) | 335 bot_info.dimensions_flat = dimensions_to_flat(dimensions) |
| 327 if state: | 336 if state: |
| 328 bot_info.state = state | 337 bot_info.state = state |
| 329 if quarantined is not None: | 338 if quarantined is not None: |
| 330 bot_info.quarantined = quarantined | 339 bot_info.quarantined = quarantined |
| 331 if task_id is not None: | 340 if task_id is not None: |
| 332 bot_info.task_id = task_id | 341 bot_info.task_id = task_id |
| 333 if task_name: | 342 if task_name: |
| 334 bot_info.task_name = task_name | 343 bot_info.task_name = task_name |
| 335 if version is not None: | 344 if version is not None: |
| 336 bot_info.version = version | 345 bot_info.version = version |
| 346 if kwargs.get('lease_id') is not None: |
| 347 bot_info.lease_id = kwargs['lease_id'] |
| 348 if kwargs.get('lease_expiration_ts') is not None: |
| 349 bot_info.lease_expiration_ts = kwargs['lease_expiration_ts'] |
| 337 | 350 |
| 338 if event_type in ('request_sleep', 'task_update'): | 351 if event_type in ('request_sleep', 'task_update'): |
| 339 # Handle this specifically. It's not much of an event worth saving a BotEvent | 352 # Handle this specifically. It's not much of an event worth saving a BotEvent |
| 340 # for but it's worth updating BotInfo. The only reason BotInfo is GET is to | 353 # for but it's worth updating BotInfo. The only reason BotInfo is GET is to |
| 341 # keep first_seen_ts. It's not necessary to use a transaction here since no | 354 # keep first_seen_ts. It's not necessary to use a transaction here since no |
| 342 # BotEvent is being added, only last_seen_ts is really updated. | 355 # BotEvent is being added, only last_seen_ts is really updated. |
| 343 bot_info.put() | 356 bot_info.put() |
| 344 return | 357 return |
| 345 | 358 |
| 346 event = BotEvent( | 359 event = BotEvent( |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 393 Returns: | 406 Returns: |
| 394 Tuple (True to restart, text message explaining the reason). | 407 Tuple (True to restart, text message explaining the reason). |
| 395 """ | 408 """ |
| 396 # Periodically reboot bots to workaround OS level leaks (especially on Win). | 409 # Periodically reboot bots to workaround OS level leaks (especially on Win). |
| 397 running_time = state.get('running_time', 0) | 410 running_time = state.get('running_time', 0) |
| 398 assert isinstance(running_time, (int, float)) | 411 assert isinstance(running_time, (int, float)) |
| 399 period = get_bot_reboot_period(bot_id, state) | 412 period = get_bot_reboot_period(bot_id, state) |
| 400 if period and running_time > period: | 413 if period and running_time > period: |
| 401 return True, 'Periodic reboot: running longer than %ds' % period | 414 return True, 'Periodic reboot: running longer than %ds' % period |
| 402 return False, '' | 415 return False, '' |
| OLD | NEW |