Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(122)

Side by Side Diff: scripts/master/floating_builder.py

Issue 2250443002: Update floating builder logic, add to "chromiumos" (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Pylint fixes. Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2014 The Chromium Authors. All rights reserved. 1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 from datetime import datetime 5 from datetime import datetime
6 6
7 from twisted.python import log 7 from twisted.python import log
8 from twisted.internet import reactor 8 from twisted.internet import reactor
9 9
10
11 class FloatingSet(object):
12 """A set describing available primary/floating slaves."""
13 def __init__(self):
14 self._primary = set()
15 self._floating = set()
16
17 def AddPrimary(self, *s):
18 self._primary.update(s)
19
20 def AddFloating(self, *s):
21 self._floating.update(s)
22
23 def NextSlaveFunc(self, grace_period):
24 """Returns a NextSlaveFunc that uses the contents of this set."""
25 return _FloatingNextSlaveFunc(self, grace_period)
26
27 def Get(self):
28 return (sorted(self._primary), sorted(self._floating))
29
30 def __str__(self):
31 return '%s > %s' % (
32 ', '.join(sorted(self._primary)),
33 ', '.join(sorted(self._floating)))
34
35
10 class PokeBuilderTimer(object): 36 class PokeBuilderTimer(object):
11 def __init__(self, botmaster, buildername): 37 def __init__(self, botmaster, buildername):
12 self.botmaster = botmaster 38 self.botmaster = botmaster
13 self.buildername = buildername 39 self.buildername = buildername
14 self.delayed_call = None 40 self.delayed_call = None
15 41
16 def cancel(self): 42 def cancel(self):
17 if self.delayed_call is not None: 43 if self.delayed_call is not None:
18 self.delayed_call.cancel() 44 self.delayed_call.cancel()
19 self.delayed_call = None 45 self.delayed_call = None
20 46
21 def reset(self, delta): 47 def reset(self, delta):
22 if self.delayed_call is not None: 48 if self.delayed_call is not None:
23 current_delta = (datetime.fromtimestamp(self.delayed_call.getTime()) - 49 current_delta = (datetime.fromtimestamp(self.delayed_call.getTime()) -
24 datetime.datetime.now()) 50 _get_now())
25 if delta < current_delta: 51 if delta < current_delta:
26 self.delayed_call.reset(delta.total_seconds()) 52 self.delayed_call.reset(delta.total_seconds())
27 return 53 return
28 54
29 # Schedule a new call 55 # Schedule a new call
30 self.delayed_call = reactor.callLater( 56 self.delayed_call = reactor.callLater(
31 delta.total_seconds(), 57 delta.total_seconds(),
32 self._poke, 58 self._poke,
33 ) 59 )
34 60
35 def _poke(self): 61 def _poke(self):
36 self.delayed_call = None 62 self.delayed_call = None
37 log.msg("Poking builds for builder %r" % (self.buildername,)) 63 log.msg('Poking builds for builder [%s]' % (self.buildername,))
38 self.botmaster.maybeStartBuildsForBuilder(self.buildername) 64 self.botmaster.maybeStartBuildsForBuilder(self.buildername)
39 65
40 66
41 class FloatingNextSlaveFunc(object): 67 class _FloatingNextSlaveFunc(object):
42 """ 68 """
43 This object, when used as a Builder's 'nextSlave' function, allows a strata- 69 This object, when used as a Builder's 'nextSlave' function, allows a strata-
44 based preferential treatment to be assigned to a Builder's Slaves. 70 based preferential treatment to be assigned to a Builder's Slaves.
45 71
46 The 'nextSlave' function is called on a scheduled build when an associated 72 The 'nextSlave' function is called on a scheduled build when an associated
47 slave becomes available, either coming online or finishing an existing build. 73 slave becomes available, either coming online or finishing an existing build.
48 These events are used as stimulus to enable the primary builder(s) to pick 74 These events are used as stimulus to enable the primary builder(s) to pick
49 up builds when appropriate. 75 up builds when appropriate.
50 76
51 1) If a Primary is available, the build will be assigned to them. 77 1) If a Primary is available, the build will be assigned to them.
52 2) If a Primary builder is busy or is still within its grace period for 78 2) If a Primary builder is busy or is still within its grace period for
53 unavailability, no slave will be assigned in anticipation of the 79 unavailability, no slave will be assigned in anticipation of the
54 'nextSlave' being re-invoked once the builder returns (1). If the grace 80 'nextSlave' being re-invoked once the builder returns (1). If the grace
55 period expires, we "poke" the master to call 'nextSlave', at which point 81 period expires, we "poke" the master to call 'nextSlave', at which point
56 the build will fall through to a lower strata. 82 the build will fall through to a lower strata.
57 3) If a Primary slave is offline past its grace period, the build will be 83 3) If a Primary slave is offline past its grace period, the build will be
58 assigned to a Floating slave. 84 assigned to a Floating slave.
59 85
60 Args: 86 Args:
61 strata_property: (str) The name of the Builder property to use to identify 87 fs (FloatingSet): The set of available primary/floating slaves.
62 its strata. 88 grace_period: (timedelta) The amount of time that a slave can be offline
63 strata: (list) A list of strata values ordered by selection priority 89 before builds fall through to a lower strata.
64 grace_period: (None/timedelta) If not None, the amount of time that a slave
65 can be offline before builds fall through to a lower strata.
66 """ 90 """
67 91
68 def __init__(self, strata_property, strata, grace_period=None): 92 def __init__(self, fs, grace_period):
69 self._strata = tuple(strata) 93 self._primary, self._floating = fs.Get()
70 self._strata_property = strata_property 94 self._fs = fs
71 self._grace_period = grace_period 95 self._grace_period = grace_period
72 self._slave_strata_map = {}
73 self._slave_seen_times = {} 96 self._slave_seen_times = {}
74 self._poke_builder_timers = {} 97 self._poke_builder_timers = {}
75 self.verbose = False 98 self.verbose = False
76 99
77 def __repr__(self): 100 def __repr__(self):
78 return '%s(%s)' % (type(self).__name__, ' > '.join(self._strata)) 101 return '%s(%s)' % (type(self).__name__, self._fs)
79 102
80 def __call__(self, builder, slave_builders): 103 def __call__(self, builder, slave_builders):
81 """Main 'nextSlave' invocation point. 104 """Main 'nextSlave' invocation point.
82 105
83 When this is called, we are given the following information: 106 When this is called, we are given the following information:
84 - The Builder 107 - The Builder
85 - A set of 'SlaveBuilder' instances that are available and ready for 108 - A set of 'SlaveBuilder' instances that are available and ready for
86 assignment (slave_builders). 109 assignment (slave_builders).
87 - The total set of ONLINE 'SlaveBuilder' instances associated with 110 - The total set of ONLINE 'SlaveBuilder' instances associated with
88 'builder' (builder.slaves) 111 'builder' (builder.slaves)
89 - The set of all slaves configured for Builder (via 112 - The set of all slaves configured for Builder (via
90 '_get_all_slave_status') 113 '_get_all_slave_status')
91 114
92 We compile that into a stateful awareness and use it as a decision point. 115 We compile that into a stateful awareness and use it as a decision point.
93 Based on the slave availability and grace period, we will either: 116 Based on the slave availability and grace period, we will either:
94 (1) Return a slave immediately to claim this build 117 (1) Return a slave immediately to claim this build. We do this if:
95 (2) Return 'None' (delaying the build) in anticipation of a higher-strata 118 (1a) There was a "primary" build slave available, or
96 slave becoming available. 119 (1b) We are outside of all of the grace periods for the primary slaves,
120 and there is a floating builder available.
121 (2) Return 'None' (delaying the build) in anticipation of primary/floating
122 availability.
97 123
98 If we go with (2), we will schedule a 'poke' timer to stimulate a future 124 If we go with (2), we will schedule a 'poke' timer to stimulate a future
99 'nextSlave' call if the only higher-strata slave candidates are currently 125 'nextSlave' call, since BuildBot only checks for builds on explicit slave
100 offline. We do this because they could be permanently offline, so there's 126 availability edges. This covers the case where floating builders are
101 no guarentee that a 'nextSlave' will be naturally called in any time frame. 127 available, but aren't enlisted because we're within the grace period. In
128 this case, we need to re-evaluate slaves after the grace period expires,
129 but actual slave state won't have changed, so no new slave availability edge
130 will have occurred.
102 """ 131 """
103 self._debug("Calling %r with builder=[%s], slaves=[%s]", 132 self._debug("Calling [%s] with builder=[%s], slaves=[%s]",
104 self, builder, slave_builders) 133 self, builder, slave_builders)
105 self._cancel_builder_timer(builder) 134 self._cancel_builder_timer(builder)
106 135
107 # Get the set of all 'SlaveStatus' assigned to this Builder (idle, busy, 136 # Get the set of all 'SlaveStatus' assigned to this Builder (idle, busy,
108 # and offline). 137 # and offline).
109 slave_status_map = dict( 138 slave_status_map = dict(
110 (slave_status.name, slave_status) 139 (slave_status.name, slave_status)
111 for slave_status in self._get_all_slave_status(builder) 140 for slave_status in self._get_all_slave_status(builder)
112 ) 141 )
113 142
114 # Index proposed 'nextSlave' slaves by name 143 # Record the names of the slaves that were proposed.
115 proposed_slave_builder_map = {} 144 proposed_slave_builder_map = {}
116 for slave_builder in slave_builders: 145 for slave_builder in slave_builders:
117 proposed_slave_builder_map[slave_builder.slave.slavename] = slave_builder 146 proposed_slave_builder_map[slave_builder.slave.slavename] = slave_builder
118 147
119 # Calculate the oldest a slave can be before we assume something's wrong. 148 # Calculate the oldest a slave can be before we assume something's wrong.
120 grace_threshold = now = None 149 now = _get_now()
121 if self._grace_period is not None: 150 grace_threshold = (now - self._grace_period)
122 now = datetime.now()
123 grace_threshold = (now - self._grace_period)
124 151
125 # Index all builder slaves (even busy ones) by name. Also, record this 152 # Record the last time we've seen any of these slaves online.
126 # slave's strata so we can reference it even if the slave goes offline
127 # in the future.
128 online_slave_builders = set() 153 online_slave_builders = set()
129 for slave_builder in builder.slaves: 154 for slave_builder in builder.slaves:
130 build_slave = slave_builder.slave 155 build_slave = slave_builder.slave
131 if build_slave is None: 156 if build_slave is None:
132 continue 157 continue
133 self._record_strata(build_slave) 158 self._record_slave_seen_time(build_slave, now)
134 if now is not None:
135 self._record_slave_seen_time(build_slave, now)
136 online_slave_builders.add(build_slave.slavename) 159 online_slave_builders.add(build_slave.slavename)
137 160
138 # Check the strata, in order. 161 self._debug('Online proposed slaves: [%s]',
139 for stratum in self._strata: 162 slave_builders)
140 busy_slaves = []
141 offline_slaves = []
142 wait_delta = None
143 163
144 for slave_name in self._slave_strata_map.get(stratum, ()): 164 # Are there any primary slaves that are proposed? If so, use it
145 self._debug("Considering slave %r for stratum %r", slave_name, stratum) 165 within_grace_period = []
166 some_primary_were_busy = False
167 wait_delta = None
168 for slave_name in self._primary:
169 self._debug('Considering primary slave [%s]', slave_name)
146 170
147 # Get the 'SlaveStatus' object for this slave 171 # Was this slave proposed to 'nextSlave'?
148 slave_status = slave_status_map.get(slave_name) 172 slave_builder = proposed_slave_builder_map.get(slave_name)
149 if slave_status is None: 173 if slave_builder is not None:
150 continue 174 # Yes. Use it!
175 self._debug('Slave [%s] is available', slave_name)
176 return slave_builder
151 177
152 # Was this slave proposed by 'nextSlave'? 178 # Is this slave online? If so, we won't consider floating candiates.
179 if slave_name in online_slave_builders:
180 # The slave is online, but is not proposed (BUSY); add it to the
181 # desired slaves list.
182 self._debug('Slave [%s] is online but BUSY.', slave_name)
183 within_grace_period.append(slave_name)
184 some_primary_were_busy = True
185 continue
186
187 # Get the 'SlaveStatus' object for this slave
188 slave_status = slave_status_map.get(slave_name)
189 if slave_status is None:
190 continue
191
192 # The slave is offline. Is this slave within the grace period?
193 last_seen = self._get_latest_seen_time(slave_status)
194 if last_seen < grace_threshold:
195 # No, the slave is older than our grace period.
196 self._debug('Slave [%s] is OFFLINE and outside grace period '
197 '(%s < %s).', slave_name, last_seen, grace_threshold)
198 continue
199
200 # This slave is within its grace threshold. Add it to the list of
201 # desired slaves from this set and update our wait delta in case we
202 # have to poke.
203 #
204 # We track the longest grace period delta, since after this point if
205 # no slaves have taken the build we would otherwise hang.
206 self._debug('Slave %r is OFFLINE but within grace period '
207 '(%s >= %s).', slave_name, last_seen, grace_threshold)
208 within_grace_period.append(slave_name)
209 slave_wait_delta = (self._grace_period - (now - last_seen))
210 if (wait_delta is None) or (slave_wait_delta > wait_delta):
211 wait_delta = slave_wait_delta
212
213 # We've looped through all primary slaves, and none of them were available.
214 # Were some within the grace period?
215 if not within_grace_period:
216 # We're outside of our grace period. Are there floating slaves that we
217 # can use?
218 for slave_name in self._floating:
153 slave_builder = proposed_slave_builder_map.get(slave_name) 219 slave_builder = proposed_slave_builder_map.get(slave_name)
154 if slave_builder is not None: 220 if slave_builder is not None:
155 # Yes. Use it! 221 # Yes. Use it!
156 self._debug("Slave %r is available", slave_name) 222 self._debug('Slave [%s] is available', slave_name)
157 return slave_builder 223 return slave_builder
158 224
159 # Is this slave online? 225 self._debug('No slaves are available; returning None')
160 if slave_name in online_slave_builders: 226 return None
161 # The slave is online, but is not proposed (BUSY); add it to the
162 # desired slaves list.
163 self._debug("Slave %r is online but BUSY; marking preferred",
164 slave_name)
165 busy_slaves.append(slave_name)
166 continue
167 227
168 # The slave is offline; do we have a grace period? 228 # We're going to return 'None' to wait for a primary slave. If all of
169 if grace_threshold is None: 229 # the slaves that we're anticipating are offline, schedule a 'poke'
170 # No grace period, so this slave is not a candidate 230 # after the last candidate has exceeded its grace period to allow the
171 self._debug("Slave %r is OFFLINE with no grace period; ignoring", 231 # build to go to lower strata.
172 slave_name) 232 log.msg('Returning None in anticipation of unavailable primary slaves. '
173 continue 233 'Please disregard the following BuildBot `nextSlave` '
234 'error: %s' % (within_grace_period,))
174 235
175 # Yes; is this slave within the grace period? 236 if (not some_primary_were_busy) and (wait_delta is not None):
176 last_seen = self._get_latest_seen_time(slave_status) 237 self._debug('Scheduling ping for [%s] in [%s]',
177 if last_seen < grace_threshold: 238 builder.name, wait_delta)
178 # Not within grace period, so this slave is out. 239 self._schedule_builder_timer(builder, wait_delta)
179 self._debug("Slave %r is OFFLINE and outside of grace period "
180 "(%s < %s); ignoring",
181 slave_name, last_seen, grace_threshold)
182 continue
183
184 # This slave is within its grace threshold. Add it to the list of
185 # desired stratum slaves and update our wait delta in case we have to
186 # poke.
187 #
188 # We track the longest grace period delta, since after this point if
189 # no slaves have taken the build we would otherwise hang.
190 self._debug("Slave %r is OFFLINE but within grace period "
191 "(%s >= %s); marking preferred",
192 slave_name, last_seen, grace_threshold)
193 offline_slaves.append(slave_name)
194 slave_wait_delta = (self._grace_period - (now - last_seen))
195 if (wait_delta is None) or (slave_wait_delta > wait_delta):
196 wait_delta = slave_wait_delta
197
198 # We've looped through our stratum and found no proposed candidates. Are
199 # there any preferred ones?
200 if busy_slaves or offline_slaves:
201 log.msg("Returning 'None' in anticipation of unavailable slaves. "
202 "Please disregard the following BuildBot 'nextSlave' "
203 "error: %s" % (busy_slaves + offline_slaves,))
204
205 # We're going to return 'None' to wait for a preferred slave. If all of
206 # the slaves that we're anticipating are offline, schedule a 'poke'
207 # after the last candidate has exceeded its grace period to allow the
208 # build to go to lower strata.
209 if (not busy_slaves) and (wait_delta is not None):
210 self._debug("Scheduling 'ping' for %r in %s",
211 builder.name, wait_delta)
212 self._schedule_builder_timer(
213 builder,
214 wait_delta,
215 )
216 return None
217
218 self._debug("No slaves are available; returning 'None'")
219 return None 240 return None
220 241
221 def _debug(self, fmt, *args): 242 def _debug(self, fmt, *args):
222 if not self.verbose: 243 if not self.verbose:
223 return 244 return
224 log.msg(fmt % args) 245 log.msg(fmt % args)
225 246
226 @staticmethod 247 @staticmethod
227 def _get_all_slave_status(builder): 248 def _get_all_slave_status(builder):
228 # Try using the builder's BuilderStatus object to get a list of all slaves 249 # Try using the builder's BuilderStatus object to get a list of all slaves
(...skipping 16 matching lines...) Expand all
245 266
246 # Add the last time we've seen the slave in our 'nextSlave' function 267 # Add the last time we've seen the slave in our 'nextSlave' function
247 last_seen_time = self._slave_seen_times.get(slave_status.name) 268 last_seen_time = self._slave_seen_times.get(slave_status.name)
248 if last_seen_time is not None: 269 if last_seen_time is not None:
249 times.append(last_seen_time) 270 times.append(last_seen_time)
250 271
251 if not times: 272 if not times:
252 return None 273 return None
253 return max(times) 274 return max(times)
254 275
255 def _record_strata(self, build_slave):
256 stratum = build_slave.properties.getProperty(self._strata_property)
257 strata_set = self._slave_strata_map.get(stratum)
258 if strata_set is None:
259 strata_set = set()
260 self._slave_strata_map[stratum] = strata_set
261 strata_set.add(build_slave.slavename)
262
263 def _record_slave_seen_time(self, build_slave, now): 276 def _record_slave_seen_time(self, build_slave, now):
264 self._slave_seen_times[build_slave.slavename] = now 277 self._slave_seen_times[build_slave.slavename] = now
265 278
266 def _schedule_builder_timer(self, builder, delta): 279 def _schedule_builder_timer(self, builder, delta):
267 poke_builder_timer = self._poke_builder_timers.get(builder.name) 280 poke_builder_timer = self._poke_builder_timers.get(builder.name)
268 if poke_builder_timer is None: 281 if poke_builder_timer is None:
269 poke_builder_timer = PokeBuilderTimer( 282 poke_builder_timer = PokeBuilderTimer(
270 builder.botmaster, 283 builder.botmaster,
271 builder.name, 284 builder.name,
272 ) 285 )
273 self._poke_builder_timers[builder.name] = poke_builder_timer 286 self._poke_builder_timers[builder.name] = poke_builder_timer
274 poke_builder_timer.reset(delta) 287 poke_builder_timer.reset(delta)
275 288
276 def _cancel_builder_timer(self, builder): 289 def _cancel_builder_timer(self, builder):
277 poke_builder_timer = self._poke_builder_timers.get(builder.name) 290 poke_builder_timer = self._poke_builder_timers.get(builder.name)
278 if poke_builder_timer is None: 291 if poke_builder_timer is None:
279 return 292 return
280 poke_builder_timer.cancel() 293 poke_builder_timer.cancel()
294
295
296 def _get_now():
297 """Returns (datetime.datetime): The current time.
298
299 This exists so it can be overridden by mocks in unit tests.
300 """
301 return datetime.datetime.now()
302
303
OLDNEW
« no previous file with comments | « scripts/master/cros/builder_config.py ('k') | scripts/master/unittests/floating_builder_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698