Chromium Code Reviews

Side by Side Diff: verification/try_server.py

Issue 7108020: Add automatic retry mechanism and LKGR support. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/commit-queue
Patch Set: Created 9 years, 6 months ago
1 # coding=utf8 1 # coding=utf8
2 # Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 """Sends patches to the Try server and reads back results. 5 """Sends patches to the Try server and reads back results.
6 """ 6 """
7 7
8 import time 8 import time
9 import logging 9 import logging
10 import urllib2 10 import urllib2
11 11
12 import find_depot_tools # pylint: disable=W0611 12 import find_depot_tools # pylint: disable=W0611
13 import trychange 13 import trychange
14 14
15 import buildbot_json 15 import buildbot_json
16 import model 16 import model
17 from thirdparty.datastructures import SortedDict
18 from verification import base 17 from verification import base
19 18
20 19
21 # We don't want to have trychange use gcl so block it. 20 # We don't want to have trychange use gcl so block it.
22 trychange.gcl = None 21 trychange.gcl = None
23 # Hack out trychange logging.info() 22 # Hack out trychange logging.info()
24 trychange.logging = logging.getLogger('trychange') 23 trychange.logging = logging.getLogger('trychange')
25 trychange.logging.setLevel(logging.WARNING) 24 trychange.logging.setLevel(logging.WARNING)
26 25
27 26
(...skipping 45 matching lines...)
73 self.try_jobs = [] 72 self.try_jobs = []
74 73
75 def get_state(self): 74 def get_state(self):
76 if not self.try_jobs: 75 if not self.try_jobs:
77 return base.PROCESSING 76 return base.PROCESSING
78 states = set(i.get_state() for i in self.try_jobs) 77 states = set(i.get_state() for i in self.try_jobs)
79 assert states.issubset(base.VALID_STATES) 78 assert states.issubset(base.VALID_STATES)
80 return max(states) 79 return max(states)
81 80
82 81
82 def steps_quality(steps):
83 if not steps:
84 return None
85 return all(v in (True, None) for v in steps)
86
87
83 class StepDb(object): 88 class StepDb(object):
84 """Lists all steps for each revision known to have passed at least *once*.""" 89 """Keeps statistics about all steps for each revisions."""
85 max_cache = 200 90 max_cache = 200
86 91
87 def __init__(self, builders): 92 def __init__(self, builders, buildbot):
88 self._need_full = True 93 self._need_full = True
89 # Builds with a patch. 94 self.builders = builders
90 self.patched_builds = dict((b, SortedDict()) for b in builders) 95 self.buildbot = buildbot
91 # Builds without a patch (with or without a clobber).
92 self.clean_builds = dict((b, SortedDict()) for b in builders)
93 96
94 def need_full(self): 97 def need_full(self):
95 result = self._need_full 98 result = self._need_full
96 self._need_full = False 99 self._need_full = False
97 return result 100 return result
98 101
99 def step_quality(self, builder, revision, step): 102 def revision_quality_builder_steps(self, builder, revision):
100 """Returns if a step is known to have passed at least one time, in the 103 steps = None
101 closed revision. 104 nb_builds = 0
105 for build in self.buildbot.builders[builder].builds.cached_children:
106 if build.revision != revision:
107 continue
108 nb_builds += 1
109 assert not steps or len(steps) == len(build.steps)
110 if not steps or len(steps) != len(build.steps):
111 # If the number of steps changed after a master restart, we need to
112 # ditch the previous steps.
113 # One workaround is to key by name.
114 steps = [None] * len(build.steps)
115 for step in build.steps:
116 steps[step.number] = or_3_way(
117 steps[step.number], step.simplified_result)
118 return steps, nb_builds
102 119
103 Warning: A step index is not comparable across builders since each builder 120 def last_good_revision_builder(self, builder):
104 has different steps in a different order. 121 """Returns LKGR for this builder."""
105 """ 122 state = {}
106 if (revision not in self.patched_builds[builder] and 123 for build in self.buildbot.builders[builder].builds.cached_children:
107 revision not in self.clean_builds[builder]): 124 state.setdefault(build.revision, [None] * len(build.steps))
125 for step in build.steps:
126 state[build.revision][step.number] = or_3_way(
127 state[build.revision][step.number],
128 step.simplified_result)
129
130 revisions = [
131 revision for revision in sorted(state)
132 if all(v in (True, None) for v in state[revision])
133 ]
134 if not revisions:
108 return None 135 return None
109 return ( 136 return revisions[-1]
110 (revision in self.patched_builds[builder] and
111 self.patched_builds[builder][revision][step]) or
112 (revision in self.clean_builds[builder] and
113 self.clean_builds[builder][revision][step]))
114
115 def revision_quality(self, revision):
116 """Returns True if a revision succeeded at least one time on at least one
117 builder.
118 """
119 return reduce(or_3_way,
120 (self.revision_quality_builder(b, revision)
121 for b in self.patched_builds))
122
123 def revision_quality_builder(self, builder, revision):
124 """Returns if a revision succeeded at least one time.
125
126 Warning: A step index is not comparable across builders since each builder
127 has different steps in a different order.
128 """
129 if (revision not in self.patched_builds[builder] and
130 revision not in self.clean_builds[builder]):
131 return None
132 bad_steps = []
133 for i, value in enumerate(self.patched_builds[builder].get(revision, [])):
134 if value is False:
135 bad_steps.append(i)
136 for i, value in enumerate(self.clean_builds[builder].get(revision, [])):
137 if value is False:
138 bad_steps.append(i)
139 if value and i in bad_steps:
140 bad_steps.remove(i)
141 return not bad_steps
142
143 def seen_revisions(self):
144 """Returns all revisions that returned some status."""
145 revisions = set()
146 for builder in self.patched_builds:
147 revisions |= set(self.patched_builds[builder].keys())
148 revisions |= set(self.clean_builds[builder].keys())
149 return sorted(revisions)
150
151 def good_revisions(self):
152 """Returns all revisions that succeeded on all builders."""
153 for revision in self.seen_revisions():
154 if self.revision_quality(revision):
155 yield revision
156
157 def bad_revisions(self):
158 """Returns all revisions that never succeeded on any builder."""
159 for revision in self.seen_revisions():
160 if self.revision_quality(revision) is False:
161 yield revision
162
163 def update(self, buildbot):
164 """Updates the internal db."""
165 for builder in self.clean_builds:
166 # Only access builds already cached.
167 for build in buildbot.builders[builder].builds.cached_children:
168 if build.data['sourceStamp'].get('hasPatch', False):
169 b = self.patched_builds[builder]
170 else:
171 b = self.clean_builds[builder]
172 new_values = [s.simplified_result for s in build.steps]
173 if build.revision not in b:
174 b[build.revision] = new_values
175 else:
176 len_b = len(b[build.revision])
177 len_n = len(new_values)
178 new_length = max(len_b, len_n)
179 b[build.revision].extend([None] * (new_length - len_b))
180 new_values.extend([None] * (new_length - len_n))
181 b[build.revision] = [
182 or_3_way(old_value, new_values[i])
183 for i, old_value in enumerate(b[build.revision])
184 ]
185
186 for builds in self.patched_builds.itervalues():
187 while len(builds) > self.max_cache:
188 builds.popitem(builds.keyOrders[0])
189 for builds in self.clean_builds.itervalues():
190 while len(builds) > self.max_cache:
191 builds.popitem(builds.keyOrders[0])
192 137
193 138
194 class TryRunner(base.Verifier): 139 class TryRunner(base.Verifier):
195 """Stateless communication with a try server. 140 """Stateless communication with a try server.
196 141
197 Sends try jobs and reads try job status. 142 Sends try jobs and reads try job status.
198 143
199 Analysis goes as follows: 144 Analysis goes as follows:
200 - compile step is not flaky. compile.py already takes care of most flakiness 145 - compile step is not flaky. compile.py already takes care of most flakiness
201 and clobber build is done by default. 146 and clobber build is done by default.
(...skipping 10 matching lines...)
212 """ 157 """
213 name = 'try server' 158 name = 'try server'
214 159
215 # A try job sent this long ago and that hasn't started yet is deemed to be 160 # A try job sent this long ago and that hasn't started yet is deemed to be
216 # lost. 161 # lost.
217 lost_try_job_delay = 15*60 162 lost_try_job_delay = 15*60
218 163
219 # Only updates a job status once every 60 seconds. 164 # Only updates a job status once every 60 seconds.
220 update_latency = 60 165 update_latency = 60
221 166
222 def __init__(self, try_server_url, commit_user, builders, tests, extra_flags): 167 def __init__(self, try_server_url, commit_user, builders, tests, extra_flags,
168 lkgr):
223 super(TryRunner, self).__init__() 169 super(TryRunner, self).__init__()
224 self.commit_user = commit_user 170 self.commit_user = commit_user
225 self.try_server_url = try_server_url 171 self.try_server_url = try_server_url
226 self.builders = builders 172 self.builders = builders
227 self.tests = tests 173 self.tests = tests
228 self.extra_flags = extra_flags or [] 174 self.extra_flags = extra_flags or []
229 self.status = buildbot_json.Buildbot(self.try_server_url) 175 self.status = buildbot_json.Buildbot(self.try_server_url)
230 self.step_db = StepDb(builders) 176 self.step_db = StepDb(self.builders, self.status)
231 self.last_update = time.time() - self.update_latency 177 self.last_update = time.time() - self.update_latency
178 self.lkgr = lkgr
232 179
233 def verify(self, pending, revision): 180 def verify(self, pending, revision):
234 """Sends a try job to the try server and returns a TryJob list.""" 181 """Sends a try job to the try server and returns a TryJob list."""
235 jobs = pending.verifications.setdefault(self.name, TryJobs()) 182 jobs = pending.verifications.setdefault(self.name, TryJobs())
236 jobs.try_jobs = jobs.try_jobs or [] 183 jobs.try_jobs = jobs.try_jobs or []
237 assert not jobs.try_jobs 184 assert not jobs.try_jobs
238 new_jobs = [ 185 new_jobs = [
239 TryJob(builder, str(revision), False) for builder in self.builders] 186 TryJob(builder, str(revision), False) for builder in self.builders]
240 jobs.try_jobs.extend(new_jobs) 187 jobs.try_jobs.extend(new_jobs)
241 self._send_jobs(pending, new_jobs) 188 self._send_jobs(pending, new_jobs)
242 # Slightly postpone next check. 189 # Slightly postpone next check.
243 self.last_update = min( 190 self.last_update = min(
244 time.time(), self.last_update + (self.update_latency / 4)) 191 time.time(), self.last_update + (self.update_latency / 4))
245 192
246 def update_status(self, queue): 193 def update_status(self, queue):
247 """Grabs the current status of all try jobs and update self.queue. 194 """Grabs the current status of all try jobs and update self.queue.
248 195
249 Note: it would be more efficient to be event based. 196 Note: it would be more efficient to be event based.
250 """ 197 """
251 if not queue: 198 if not queue:
252 logging.debug('The list is empty, nothing to do') 199 logging.debug('The list is empty, nothing to do')
253 return 200 return
254 201
255 if time.time() - self.last_update < self.update_latency: 202 if time.time() - self.last_update < self.update_latency:
256 logging.debug('Throttling updates') 203 logging.debug('Throttling updates')
257 return 204 return
258 self.last_update = time.time() 205 self.last_update = time.time()
259 206
260 self._reset_cache(queue) 207 self._reset_cache(queue)
261 self.step_db.update(self.status)
262 208
263 # Do the actual processing to update the TryJob status. 209 # Do the actual processing to update the TryJob status.
264 for pending, jobs in self.loop(queue, TryJobs, True): 210 for pending, jobs in self.loop(queue, TryJobs, True):
265 for job in jobs.try_jobs: 211 for job in jobs.try_jobs:
266 if job.get_state() != base.PROCESSING: 212 if job.get_state() != base.PROCESSING:
267 continue 213 continue
268 # There's one try job per builder. 214 # There's one try job per builder.
269 # TODO(maruel): There should be differentiation when there's multiple 215 # TODO(maruel): There should be differentiation when there's multiple
270 # jobs for a single builder. 216 # jobs for a single builder.
271 build = None 217 build = None
(...skipping 49 matching lines...)
321 self._upgrade(queue) 267 self._upgrade(queue)
322 268
323 jobs_to_update = [] 269 jobs_to_update = []
324 for _, jobs in self.loop(queue, TryJobs, True): 270 for _, jobs in self.loop(queue, TryJobs, True):
325 jobs_to_update.extend( 271 jobs_to_update.extend(
326 job for job in jobs.try_jobs if job.get_state() == base.PROCESSING) 272 job for job in jobs.try_jobs if job.get_state() == base.PROCESSING)
327 273
328 # First determine what data is needed. 274 # First determine what data is needed.
329 builds_to_cache = {} 275 builds_to_cache = {}
330 if self.step_db.need_full(): 276 if self.step_db.need_full():
331 logging.info('Fetching all try jobs status because of good_revisions') 277 logging.info('Fetching all try jobs status to fetch good revisions')
332 builders_to_cache = self.builders 278 builders_to_cache = self.builders
333 else: 279 else:
334 builders_to_cache = set() 280 builders_to_cache = set()
335 for job in jobs_to_update: 281 for job in jobs_to_update:
336 if job.build is None: 282 if job.build is None:
337 builders_to_cache.add(job.builder) 283 builders_to_cache.add(job.builder)
338 else: 284 else:
339 if job.get_state() == base.PROCESSING: 285 if job.get_state() == base.PROCESSING:
340 builds_to_cache.setdefault(job.builder, []).append(job.build) 286 builds_to_cache.setdefault(job.builder, []).append(job.build)
341 287
(...skipping 12 matching lines...)
354 del builds_to_cache[builder] 300 del builds_to_cache[builder]
355 301
356 # Cache remaining builds. Sort to make testing simpler. 302 # Cache remaining builds. Sort to make testing simpler.
357 for builder, builds in sorted( 303 for builder, builds in sorted(
358 builds_to_cache.iteritems(), key=lambda x: x[0]): 304 builds_to_cache.iteritems(), key=lambda x: x[0]):
359 self.status.builders[builder].builds.cache_partial(builds) 305 self.status.builders[builder].builds.cache_partial(builds)
360 306
361 def _send_job(self, pending, revision, clobber, builders, tests=None): 307 def _send_job(self, pending, revision, clobber, builders, tests=None):
362 """Sends a try job.""" 308 """Sends a try job."""
363 # TODO(maruel): If revision is in self.bad_revisions[builder], choose 309 # TODO(maruel): If revision is in self.bad_revisions[builder], choose
364 # max(self.good_revisions[builder]) ? That can't easily be done since the 310 # self.last_good_revision_builder(builder) ? That can't easily be done since
365 # patch was already applied. 311 # the patch was already applied.
366 builders = builders or self.builders 312 builders = builders or self.builders
367 tests = tests or self.tests 313 tests = tests or self.tests
368 cmd = [ 314 cmd = [
369 '--no_gclient', 315 '--no_gclient',
370 '--bot', ','.join(builders), 316 '--bot', ','.join(builders),
371 '--revision', str(revision), 317 '--revision', str(revision),
372 '--name', pending.pending_name(), 318 '--name', pending.pending_name(),
373 '--user', self.commit_user.split('@', 1)[0], 319 '--user', self.commit_user.split('@', 1)[0],
374 '--email', ','.join((self.commit_user, pending.owner)), 320 '--email', ','.join((self.commit_user, pending.owner)),
375 '--rietveld_url', pending.patch_url(), 321 '--rietveld_url', pending.patch_url(),
(...skipping 47 matching lines...)
423 job.build = build.number 369 job.build = build.number
424 return build 370 return build
425 return None 371 return None
426 372
427 def _build_status_url(self, job): 373 def _build_status_url(self, job):
428 """Html url for this try job.""" 374 """Html url for this try job."""
429 assert job.build is not None, str(job) 375 assert job.build is not None, str(job)
430 return '%s/buildstatus?builder=%s&number=%s' % ( 376 return '%s/buildstatus?builder=%s&number=%s' % (
431 self.try_server_url.rstrip('/'), job.builder, job.build) 377 self.try_server_url.rstrip('/'), job.builder, job.build)
432 378
379 def _error_msg(self, name, job, failed_steps):
380 """Constructs the error message."""
381 def steps_to_str(steps):
382 if len(steps) > 1:
383 return 'steps "%s"' % ', '.join(steps)
384 else:
385 return 'step "%s"' % steps[0]
386
387 msg = u'Try job failure for %s on %s for %s' % (
388 name, job.builder, steps_to_str(failed_steps))
389 if job.clobber:
390 msg += ' (clobber build)'
391 msg += '.'
392 if job.failed_steps:
393 msg += u'\nIt\'s a second try, previously, %s failed.' % (
394 steps_to_str(job.failed_steps))
395 msg += '\n%s' % self._build_status_url(job)
396 logging.info(msg)
397 return msg
398
399 def get_lkgr(self, builder):
400 """Caches builds for a builder so lkgr is more useful."""
401 return max(self.step_db.last_good_revision_builder(builder), self.lkgr())
402
433 def _handle_try_job(self, pending, jobs, job, build): 403 def _handle_try_job(self, pending, jobs, job, build):
434 """Determines if the try job is a good signal to commit the patch.""" 404 """Determines if the try job is a good signal to commit the patch."""
435 if build.simplified_result is None: 405 if build.simplified_result is None:
436 # The build hasn't completed yet. 406 # The build hasn't completed yet.
437 return 407 return
438 assert job.result is None 408 assert job.result is None
439 assert job.build is not None 409 assert job.build is not None
440 job.result = build.result 410 job.result = build.result
411 # Warning: This code assumes that steps do not abort the build on failure.
441 failed_steps = [ 412 failed_steps = [
442 step.name for step in build.steps if step.simplified_result is False] 413 step.name for step in build.steps if step.simplified_result is False]
443 if job.get_state() != base.FAILED: 414 if job.get_state() != base.FAILED:
444 assert not failed_steps 415 assert not failed_steps
445 logging.info(u'Try job status for %s on %s: %s\n%s' % ( 416 logging.info(u'Try job status for %s on %s: %s\n%s' % (
446 pending.pending_name(), 417 pending.pending_name(),
447 job.builder, 418 job.builder,
448 job.result, 419 job.result,
449 self._build_status_url(job))) 420 self._build_status_url(job)))
450 return 421 return
451 422
452 assert failed_steps 423 msg = self._error_msg(pending.pending_name(), job, failed_steps)
453 msg = (u'Try job failure for %s on %s for step%s %s:\n%s' % ( 424 steps, _ = self.step_db.revision_quality_builder_steps(
454 pending.pending_name(), 425 job.builder, int(job.revision))
455 job.builder, 426 quality = steps_quality(steps)
456 's' if len(failed_steps) > 1 else '',
457 ', '.join(failed_steps),
458 self._build_status_url(job)))
459 logging.info(msg)
460 427
461 # Special case update and compile. 428 def retry(msg2, **kwargs):
462 if 'update' in job.failed_steps: 429 """Retry a try job. Will use LKGR if quality is bad."""
463 logging.debug('update is always a major failure') 430 if not quality:
464 jobs.error_message = msg 431 lkgr = self.get_lkgr(job.builder)
432 if lkgr is None:
433 logging.error('lkgr should never be None.')
434 fail('Couldn\'t find a good revision, aborting.')
435 return
436 job.revision = lkgr
437 logging.info(
438 'Retrying %s on %s, %s; rev=%s; %s' %
439 (pending.pending_name(), job.builder, kwargs, job.revision, msg2))
465 job.failed_steps = failed_steps 440 job.failed_steps = failed_steps
466 return 441 self._send_jobs(pending, [job], **kwargs)
467 if failed_steps == ['compile'] and not job.clobber: 442
468 logging.info('Trying again with clobber') 443 def fail(msg2):
469 # Note this would reset flaky if tests if there has been. This is fine 444 jobs.error_message = msg + msg2
470 # since a slave could be broken. 445 logging.info(jobs.error_message)
471 job.failed_steps = failed_steps 446 job.failed_steps = failed_steps
472 job.clobber = True
473 self._send_jobs(pending, [job])
474 return
475 447
476 # Look at the quality of the revision on this builder. 448 if 'update' in failed_steps:
477 # TODO(maruel): We should record the number of builds that were done on this 449 # Look at update quality specifically since it's a special step.
478 # revision? One or 2 builds don't give much signal. 450 # Do not take into account nb_builds == 1.
479 quality = self.step_db.revision_quality_builder( 451 if not quality and not steps[build.steps['update'].number]:
480 job.builder, int(job.revision)) 452 # 'update' never passed.
453 return retry('update has no quality')
454
455 return fail(
456 '\n\nStep "update" is always a major failure.\n'
457 'Look at the try server FAQ for more details.')
458
459 if 'compile' in failed_steps:
460 if not job.clobber:
461 # Note: this resets any test failure recorded on the previous try. This
462 # is fine since a slave could be broken.
463 job.clobber = True
464 return retry('retry compile with clobber')
465
466 return fail('')
481 467
482 if quality: 468 if quality:
483 if job.failed_steps: 469 if job.failed_steps:
484 logging.info('It\'s a second retry for %s on %s, abort' % ( 470 # The job had already failed.
485 pending.pending_name(), job.builder)) 471 return fail('')
486 jobs.error_message = msg
487 job.failed_steps = failed_steps
488 else:
489 logging.info(
490 'Retrying %s on %s' % (pending.pending_name(), job.builder))
491 job.failed_steps = failed_steps
492 self._send_jobs(pending, [job], tests=job.failed_steps)
493 return
494 472
495 # TODO(maruel): Implement better auto-retry. 473 return retry('Quality but first try', tests=failed_steps)
496 jobs.error_message = msg 474
497 job.failed_steps = failed_steps 475 # TODO(maruel): It would make sense to do a clobber build to see if the
476 # revision is indeed broken, since this algorithm assumes that the try
477 # server is continuously used for recent revisions!
478 # The revision looks like it's broken; retry with lkgr instead.
479 return retry('No quality, no idea', tests=failed_steps)
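
A minimal, self-contained sketch of the three-valued step aggregation this change relies on. or_3_way() is defined elsewhere in try_server.py, outside the hunks shown above, so the version below is only an assumption inferred from its call sites and from the old docstring 'known to have passed at least *once*'; steps_quality() mirrors the helper added in this patch set. Illustrative only, not code from the CL.

# Assumed semantics for or_3_way() (not shown in this diff view): True means
# the step passed at least once, False means it has only failed so far, and
# None means no data yet. Inferred from call sites, not the real
# implementation.
def or_3_way(old, new):
  if old is True or new is True:
    return True
  if old is False or new is False:
    return False
  return None


# Mirrors the steps_quality() helper added in this patch set: a revision is
# "good" on a builder when no aggregated step is known to have failed without
# ever passing.
def steps_quality(steps):
  if not steps:
    return None
  return all(v in (True, None) for v in steps)


if __name__ == '__main__':
  # Two builds of the same revision: step 1 failed in the first build but
  # passed in the second, so the revision still has quality on this builder.
  build_a = [True, False, None]
  build_b = [True, True, None]
  aggregated = [or_3_way(a, b) for a, b in zip(build_a, build_b)]
  assert aggregated == [True, True, None]
  assert steps_quality(aggregated) is True

Under this reading, last_good_revision_builder() simply returns the newest revision whose aggregated steps all pass this check, and get_lkgr() takes the newer of that value and the global LKGR.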
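
The retry policy in _handle_try_job() is easier to see in isolation. The sketch below is an illustrative condensation under simplifying assumptions (it drops the extra check on whether the 'update' step ever passed for the revision, plus the bookkeeping of job.failed_steps, job.revision and the error message); it is not the CL's actual method.

# Illustrative condensation of the new retry policy; not code from the CL.
def decide(failed_steps, quality, already_clobbered, already_failed_once):
  """Returns (action, use_lkgr).

  use_lkgr mirrors retry() swapping the job's revision to LKGR whenever the
  revision has no known-good signal.
  """
  if 'update' in failed_steps:
    # A failed 'update' is normally fatal; it is only retried when the
    # revision itself has no known-good signal.
    return ('retry', True) if not quality else ('fail', False)
  if 'compile' in failed_steps:
    # compile.py already absorbs most flakiness: one clobber retry, then fail.
    if not already_clobbered:
      return ('retry-clobber', not quality)
    return ('fail', False)
  if quality:
    # The revision is known good, so the failure is blamed on the patch after
    # a single retry of the failed tests.
    if already_failed_once:
      return ('fail', False)
    return ('retry-tests', False)
  # No signal about the revision itself: retry the failed tests against LKGR.
  return ('retry-tests', True)


if __name__ == '__main__':
  # First compile failure on a revision with no known-good signal: retry with
  # a clobber build, falling back to LKGR.
  assert decide(['compile'], None, False, False) == ('retry-clobber', True)

Either way, a permanent failure goes through fail(), which records the message built by _error_msg() on jobs.error_message and marks job.failed_steps.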