| OLD | NEW |
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 from collections import defaultdict |
| 5 from datetime import datetime | 6 from datetime import datetime |
| 6 from datetime import timedelta | 7 from datetime import timedelta |
| 7 import logging | 8 import logging |
| 8 | 9 |
| 9 from google.appengine.ext import ndb | 10 from google.appengine.ext import ndb |
| 10 | 11 |
| 11 from common import appengine_util | 12 from common import appengine_util |
| 12 from common import constants | 13 from common import constants |
| 13 from common.waterfall import failure_type | 14 from common.waterfall import failure_type |
| 14 from model import analysis_status | 15 from model import analysis_status |
| 15 from model.wf_analysis import WfAnalysis | 16 from model.wf_analysis import WfAnalysis |
| 16 from model.wf_build import WfBuild | 17 from model.wf_build import WfBuild |
| 17 from model.wf_failure_group import WfFailureGroup | 18 from model.wf_failure_group import WfFailureGroup |
| 18 from model.wf_try_job import WfTryJob | 19 from model.wf_try_job import WfTryJob |
| 19 from waterfall import swarming_tasks_to_try_job_pipeline | |
| 20 from waterfall import waterfall_config | 20 from waterfall import waterfall_config |
| 21 from waterfall.try_job_type import TryJobType | |
| 22 | 21 |
| 23 # TODO(lijeffrey): Move this to config. | 22 # TODO(lijeffrey): Move this to config. |
| 24 MATCHING_GROUPS_SECONDS_AGO = 24 * 60 * 60 # 24 hours. | 23 MATCHING_GROUPS_SECONDS_AGO = 24 * 60 * 60 # 24 hours. |
| 25 | 24 |
| 26 | 25 |
| 27 def _CheckFailureForTryJobKey( | 26 def _ShouldBailOutForOutdatedBuild(build): |
| 28 master_name, builder_name, build_number, | 27 return (datetime.utcnow() - build.start_time).days > 0 |
| 29 failure_result_map, failed_step_or_test, failure): | |
| 30 """Compares the current_failure and first_failure for each failed_step/test. | |
| 31 | |
| 32 If equal, a new try_job needs to start; | |
| 33 If not, apply the key of the first_failure's try_job to this failure. | |
| 34 """ | |
| 35 # TODO(chanli): Need to compare failures across builders | |
| 36 # after the grouping of failures is implemented. | |
| 37 # TODO(chanli): Need to handle cases where first failure is actually | |
| 38 # more than 20 builds back. The implementation should not be here, | |
| 39 # but need to be taken care of. | |
| 40 if not failure.get('last_pass'): | |
| 41 # Bail out since cannot figure out the good_revision. | |
| 42 return False, None | |
| 43 | |
| 44 if failure['current_failure'] == failure['first_failure']: | |
| 45 failure_result_map[failed_step_or_test] = '%s/%s/%s' % ( | |
| 46 master_name, builder_name, build_number) | |
| 47 return True, failure['last_pass'] # A new try_job is needed. | |
| 48 else: | |
| 49 failure_result_map[failed_step_or_test] = '%s/%s/%s' % ( | |
| 50 master_name, builder_name, failure['first_failure']) | |
| 51 return False, None | |
| 52 | 28 |
| 53 | 29 |
| 54 def _CheckIfNeedNewTryJobForTestFailure( | 30 def _CurrentBuildKey(master_name, builder_name, build_number): |
| 55 failure_level, master_name, builder_name, build_number, | 31 return '%s/%s/%d' % (master_name, builder_name, build_number) |
| 56 failure_result_map, failures): | |
| 57 """Traverses failed steps or tests to check if a new try job is needed.""" | |
| 58 need_new_try_job = False | |
| 59 last_pass = build_number | |
| 60 targeted_tests = {} if failure_level == 'step' else [] | |
| 61 | |
| 62 for failure_name, failure in failures.iteritems(): | |
| 63 if 'tests' in failure: | |
| 64 failure_result_map[failure_name] = {} | |
| 65 failure_targeted_tests, failure_need_try_job, failure_last_pass = ( | |
| 66 _CheckIfNeedNewTryJobForTestFailure( | |
| 67 'test', master_name, builder_name, build_number, | |
| 68 failure_result_map[failure_name], failure['tests'])) | |
| 69 if failure_need_try_job: | |
| 70 targeted_tests[failure_name] = failure_targeted_tests | |
| 71 else: | |
| 72 failure_need_try_job, failure_last_pass = _CheckFailureForTryJobKey( | |
| 73 master_name, builder_name, build_number, | |
| 74 failure_result_map, failure_name, failure) | |
| 75 if failure_need_try_job: | |
| 76 if failure_level == 'step': | |
| 77 targeted_tests[failure_name] = [] | |
| 78 else: | |
| 79 targeted_tests.append(failure.get('base_test_name', failure_name)) | |
| 80 | |
| 81 need_new_try_job = need_new_try_job or failure_need_try_job | |
| 82 last_pass = (failure_last_pass if failure_last_pass and | |
| 83 failure_last_pass < last_pass else last_pass) | |
| 84 | |
| 85 return targeted_tests, need_new_try_job, last_pass | |
| 86 | 32 |
| 87 | 33 |
| 88 def _BlameListsIntersection(blame_list_1, blame_list_2): | 34 def _BlameListsIntersection(blame_list_1, blame_list_2): |
| 89 return set(blame_list_1) & set(blame_list_2) | 35 return set(blame_list_1) & set(blame_list_2) |
| 90 | 36 |
| 91 | 37 |
| 92 def _GetStepsAndTests(failed_steps): | 38 def _GetStepsAndTests(failed_steps): |
| 93 """Extracts failed steps and tests from failed_steps data structure. | 39 """Extracts failed steps and tests from failed_steps data structure. |
| 94 | 40 |
| 95 Args: | 41 Args: |
| (...skipping 205 matching lines...) | (...skipping 205 matching lines...) |
| 301 try_job.put() | 247 try_job.put() |
| 302 else: | 248 else: |
| 303 try_job_entity_revived_or_created = False | 249 try_job_entity_revived_or_created = False |
| 304 else: | 250 else: |
| 305 try_job = WfTryJob.Create(master_name, builder_name, build_number) | 251 try_job = WfTryJob.Create(master_name, builder_name, build_number) |
| 306 try_job.put() | 252 try_job.put() |
| 307 | 253 |
| 308 return try_job_entity_revived_or_created | 254 return try_job_entity_revived_or_created |
| 309 | 255 |
| 310 | 256 |
| 311 def _NeedANewTryJob( | 257 def _NeedANewCompileTryJob( |
| 312 master_name, builder_name, build_number, build_failure_type, failed_steps, | 258 master_name, builder_name, build_number, failure_info): |
| 313 failure_result_map, builds, signals, heuristic_result, force_try_job=False): | |
| 314 """Checks if a new try_job is needed.""" | |
| 315 need_new_try_job = False | |
| 316 last_pass = build_number | |
| 317 | 259 |
| 318 if 'compile' in failed_steps: | 260 compile_failure = failure_info['failed_steps'].get('compile', {}) |
| 319 try_job_type = TryJobType.COMPILE | 261 if compile_failure: |
| 320 targeted_tests = None | 262 analysis = WfAnalysis.Get(master_name, builder_name, build_number) |
| 321 need_new_try_job, last_pass = _CheckFailureForTryJobKey( | 263 analysis.failure_result_map['compile'] = '%s/%s/%d' % ( |
| 322 master_name, builder_name, build_number, | 264 master_name, builder_name, compile_failure['first_failure']) |
| 323 failure_result_map, TryJobType.COMPILE, failed_steps['compile']) | 265 analysis.put() |
| 324 else: | |
| 325 try_job_type = TryJobType.TEST | |
| 326 targeted_tests, need_new_try_job, last_pass = ( | |
| 327 _CheckIfNeedNewTryJobForTestFailure( | |
| 328 'step', master_name, builder_name, build_number, failure_result_map, | |
| 329 failed_steps)) | |
| 330 | 266 |
| 331 need_new_try_job = ( | 267 if compile_failure['first_failure'] == compile_failure['current_failure']: |
| 332 need_new_try_job and ReviveOrCreateTryJobEntity( | 268 return True |
| 333 master_name, builder_name, build_number, force_try_job) and | |
| 334 _IsBuildFailureUniqueAcrossPlatforms( | |
| 335 master_name, builder_name, build_number, build_failure_type, | |
| 336 builds[str(build_number)]['blame_list'], failed_steps, signals, | |
| 337 heuristic_result)) | |
| 338 | 269 |
| 339 # TODO(josiahk): Integrate _IsBuildFailureUniqueAcrossPlatforms() into | 270 return False |
| 340 # need_new_try_job boolean | |
| 341 if need_new_try_job: | |
| 342 _IsBuildFailureUniqueAcrossPlatforms( | |
| 343 master_name, builder_name, build_number, build_failure_type, | |
| 344 builds[str(build_number)]['blame_list'], failed_steps, signals, | |
| 345 heuristic_result) | |
| 346 | |
| 347 return need_new_try_job, last_pass, try_job_type, targeted_tests | |
| 348 | 271 |
| 349 | 272 |
| 350 def _GetFailedTargetsFromSignals(signals, master_name, builder_name): | 273 def _CurrentBuildKeyInFailureResultMap(master_name, builder_name, build_number): |
| 351 compile_targets = [] | 274 analysis = WfAnalysis.Get(master_name, builder_name, build_number) |
| 352 | 275 failure_result_map = analysis.failure_result_map |
| 353 if not signals or 'compile' not in signals: | 276 current_build_key = _CurrentBuildKey(master_name, builder_name, build_number) |
| 354 return compile_targets | 277 for step_keys in failure_result_map.itervalues(): |
| 355 | 278 for test_key in step_keys.itervalues(): |
| 356 if signals['compile'].get('failed_output_nodes'): | 279 if test_key == current_build_key: |
| 357 return signals['compile'].get('failed_output_nodes') | 280 return True |
| 358 | 281 return False |
| 359 strict_regex = waterfall_config.EnableStrictRegexForCompileLinkFailures( | |
| 360 master_name, builder_name) | |
| 361 for source_target in signals['compile'].get('failed_targets', []): | |
| 362 # For link failures, we pass the executable targets directly to try-job, and | |
| 363 # there is no 'source' for link failures. | |
| 364 # For compile failures, only pass the object files as the compile targets | |
| 365 # for the bots that we use strict regex to extract such information. | |
| 366 if not source_target.get('source') or strict_regex: | |
| 367 compile_targets.append(source_target.get('target')) | |
| 368 | |
| 369 return compile_targets | |
| 370 | 282 |
| 371 | 283 |
| 372 def _GetSuspectsFromHeuristicResult(heuristic_result): | 284 def _NeedANewTestTryJob( |
| 373 suspected_revisions = set() | 285 master_name, builder_name, build_number, failure_info, force_try_job): |
| 374 if not heuristic_result: | 286 |
| 375 return list(suspected_revisions) | 287 if failure_info['failure_type'] != failure_type.TEST: |
| 376 for failure in heuristic_result.get('failures', []): | 288 return False |
| 377 for cl in failure['suspected_cls']: | 289 |
| 378 suspected_revisions.add(cl['revision']) | 290 if (not force_try_job and |
| 379 return list(suspected_revisions) | 291 waterfall_config.ShouldSkipTestTryJobs(master_name, builder_name)): |
| 292 logging.info('Test try jobs on %s, %s are not supported yet.', |
| 293 master_name, builder_name) |
| 294 return False |
| 295 |
| 296 return _CurrentBuildKeyInFailureResultMap( |
| 297 master_name, builder_name, build_number) |
| 380 | 298 |
| 381 | 299 |
| 382 def _ShouldBailOutForOutdatedBuild(build): | 300 def NeedANewTryJob( |
| 383 return (datetime.utcnow() - build.start_time).days > 0 | 301 master_name, builder_name, build_number, failure_info, signals, |
| 384 | 302 heuristic_result, force_try_job=False): |
| 385 | |
| 386 def ScheduleTryJobIfNeeded(failure_info, signals, heuristic_result, | |
| 387 force_try_job=False): | |
| 388 master_name = failure_info['master_name'] | |
| 389 builder_name = failure_info['builder_name'] | |
| 390 build_number = failure_info['build_number'] | |
| 391 failed_steps = failure_info.get('failed_steps', []) | |
| 392 builds = failure_info.get('builds', {}) | |
| 393 | 303 |
| 394 tryserver_mastername, tryserver_buildername = ( | 304 tryserver_mastername, tryserver_buildername = ( |
| 395 waterfall_config.GetTrybotForWaterfallBuilder(master_name, builder_name)) | 305 waterfall_config.GetTrybotForWaterfallBuilder(master_name, builder_name)) |
| 396 | 306 |
| 307 try_job_type = failure_info['failure_type'] |
| 397 if not tryserver_mastername or not tryserver_buildername: | 308 if not tryserver_mastername or not tryserver_buildername: |
| 398 logging.info('%s, %s is not supported yet.', master_name, builder_name) | 309 logging.info('%s, %s is not supported yet.', master_name, builder_name) |
| 399 return {} | 310 return False |
| 400 | 311 |
| 401 if not force_try_job: | 312 if not force_try_job: |
| 402 build = WfBuild.Get(master_name, builder_name, build_number) | 313 build = WfBuild.Get(master_name, builder_name, build_number) |
| 403 | 314 |
| 404 if _ShouldBailOutForOutdatedBuild(build): | 315 if _ShouldBailOutForOutdatedBuild(build): |
| 405 logging.error('Build time %s is more than 24 hours old. ' | 316 logging.error('Build time %s is more than 24 hours old. ' |
| 406 'Try job will not be triggered.' % build.start_time) | 317 'Try job will not be triggered.' % build.start_time) |
| 407 return {} | 318 return False |
| 408 | 319 |
| 409 if (failure_info['failure_type'] == failure_type.TEST and | 320 need_new_try_job = (_NeedANewCompileTryJob( |
| 410 waterfall_config.ShouldSkipTestTryJobs(master_name, builder_name)): | 321 master_name, builder_name, build_number, failure_info) |
| 411 logging.info('Test try jobs on %s, %s are not supported yet.', | 322 if try_job_type == failure_type.COMPILE else |
| 412 master_name, builder_name) | 323 _NeedANewTestTryJob( |
| 413 return {} | 324 master_name, builder_name, build_number, failure_info, force_try_job)) |
| 414 | |
| 415 failure_result_map = {} | |
| 416 need_new_try_job, last_pass, try_job_type, targeted_tests = ( | |
| 417 _NeedANewTryJob(master_name, builder_name, build_number, | |
| 418 failure_info['failure_type'], failed_steps, | |
| 419 failure_result_map, builds, signals, heuristic_result, | |
| 420 force_try_job)) | |
| 421 | 325 |
| 422 if need_new_try_job: | 326 if need_new_try_job: |
| 423 compile_targets = (_GetFailedTargetsFromSignals( | 327 # TODO(josiahk): Integrate this into need_new_try_job boolean |
| 424 signals, master_name, builder_name) | 328 _IsBuildFailureUniqueAcrossPlatforms( |
| 425 if try_job_type == TryJobType.COMPILE else None) | 329 master_name, builder_name, build_number, try_job_type, |
| 426 suspected_revisions = _GetSuspectsFromHeuristicResult(heuristic_result) | 330 failure_info['builds'][str(build_number)]['blame_list'], |
| 331 failure_info['failed_steps'], signals, heuristic_result) |
| 427 | 332 |
| 428 pipeline = ( | 333 need_new_try_job = need_new_try_job and ReviveOrCreateTryJobEntity( |
| 429 swarming_tasks_to_try_job_pipeline.SwarmingTasksToTryJobPipeline( | 334 master_name, builder_name, build_number, force_try_job) |
| 430 master_name, builder_name, build_number, | 335 return need_new_try_job |
| 431 builds[str(last_pass)]['chromium_revision'], | |
| 432 builds[str(build_number)]['chromium_revision'], | |
| 433 builds[str(build_number)]['blame_list'], | |
| 434 try_job_type, compile_targets, targeted_tests, suspected_revisions, | |
| 435 force_try_job)) | |
| 436 | 336 |
| 437 pipeline.target = appengine_util.GetTargetNameForModule( | |
| 438 constants.WATERFALL_BACKEND) | |
| 439 pipeline.start(queue_name=constants.WATERFALL_TRY_JOB_QUEUE) | |
| 440 | 337 |
| 441 if try_job_type == TryJobType.TEST: # pragma: no cover | 338 def GetFailedTargetsFromSignals(signals, master_name, builder_name): |
| 442 logging_str = ( | 339 compile_targets = [] |
| 443 'Trying to schedule swarming task(s) for build %s, %s, %s: %s' | |
| 444 ' because of %s failure. A try job may be triggered if some reliable' | |
| 445 ' failure is detected in task(s).') % ( | |
| 446 master_name, builder_name, build_number, | |
| 447 pipeline.pipeline_status_path, try_job_type) | |
| 448 else: # pragma: no cover | |
| 449 logging_str = ( | |
| 450 'Try job was scheduled for build %s, %s, %s: %s because of %s ' | |
| 451 'failure.') % ( | |
| 452 master_name, builder_name, build_number, | |
| 453 pipeline.pipeline_status_path, try_job_type) | |
| 454 logging.info(logging_str) | |
| 455 | 340 |
| 456 return failure_result_map | 341 if not signals or 'compile' not in signals: |
| 342 return compile_targets |
| 343 |
| 344 if signals['compile'].get('failed_output_nodes'): |
| 345 return signals['compile'].get('failed_output_nodes') |
| 346 |
| 347 strict_regex = waterfall_config.EnableStrictRegexForCompileLinkFailures( |
| 348 master_name, builder_name) |
| 349 for source_target in signals['compile'].get('failed_targets', []): |
| 350 # For link failures, we pass the executable targets directly to try-job, and |
| 351 # there is no 'source' for link failures. |
| 352 # For compile failures, only pass the object files as the compile targets |
| 353 # for the bots that we use strict regex to extract such information. |
| 354 if not source_target.get('source') or strict_regex: |
| 355 compile_targets.append(source_target.get('target')) |
| 356 |
| 357 return compile_targets |
| OLD | NEW |