Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | |
| 2 # Use of this source code is governed by a BSD-style license that can be | |
| 3 # found in the LICENSE file. | |
| 4 | |
| 5 import logging | |
| 6 | |
| 7 from google.appengine.ext import ndb | |
| 8 | |
| 9 from gae_libs.http.http_client_appengine import HttpClientAppengine | |
| 10 from gae_libs.gitiles.cached_gitiles_repository import CachedGitilesRepository | |
| 11 from libs import time_util | |
| 12 | |
| 13 from common import appengine_util | |
| 14 from common import constants | |
| 15 from common.pipeline_wrapper import BasePipeline | |
| 16 from common.pipeline_wrapper import pipeline | |
| 17 from common.waterfall import failure_type | |
| 18 from model import analysis_status | |
| 19 from model import result_status | |
| 20 from model.flake.flake_culprit import FlakeCulprit | |
| 21 from model.flake.flake_try_job import FlakeTryJob | |
| 22 from waterfall.flake.process_flake_try_job_result_pipeline import ( | |
| 23 ProcessFlakeTryJobResultPipeline) | |
| 24 from waterfall.monitor_try_job_pipeline import MonitorTryJobPipeline | |
| 25 from waterfall.flake.schedule_flake_try_job_pipeline import ( | |
| 26 ScheduleFlakeTryJobPipeline) | |
| 27 | |
| 28 | |
# TODO(lijeffrey): The lookback algorithms in RecursiveFlakePipeline and
# RecursiveFlakeTryJob are intended to be identical. Refactor both files to
# use a shared base algorithm.
| 32 | |
| 33 | |
| 34 _GIT_REPO = CachedGitilesRepository( | |
| 35 HttpClientAppengine(), | |
| 36 'https://chromium.googlesource.com/chromium/src.git') | |
| 37 | |
| 38 | |
def _CreateCulprit(revision, commit_position, repo_name='chromium'):
  """Creates a FlakeCulprit for the given revision.

  Args:
    revision (str): The git revision suspected to have introduced flakiness.
    commit_position (int): The commit position corresponding to |revision|.
    repo_name (str): The repository name, defaulting to 'chromium'.

  Returns:
    An unsaved FlakeCulprit entity. Its url is the code review url (preferred)
    or commit url of the revision's change log, or None if the change log
    could not be fetched.
  """
  change_log = _GIT_REPO.GetChangeLog(revision)

  url = None
  if change_log:
    url = change_log.code_review_url or change_log.commit_url
  else:
    logging.error('Unable to retrieve change logs for %s', revision)

  return FlakeCulprit.Create(repo_name, revision, commit_position, url)
| 52 | |
| 53 | |
def _UpdateAnalysisTryJobStatusUponCompletion(
    flake_analysis, culprit, status, error):
  """Records the final try-job outcome on the analysis entity and saves it.

  Args:
    flake_analysis (MasterFlakeAnalysis): The analysis entity to update.
    culprit (FlakeCulprit): The identified culprit, or None if none was found.
    status (int): The final analysis_status value for the try-job phase.
    error (dict): Error details if the try jobs failed, else None.
  """
  flake_analysis.end_time = time_util.GetUTCNow()
  flake_analysis.try_job_status = status

  if error:
    flake_analysis.error = error
  else:
    if culprit:
      flake_analysis.culprit = culprit
      flake_analysis.result_status = result_status.FOUND_UNTRIAGED
    else:
      # NOTE(review): Reaching this branch means a suspected flake build was
      # identified, but try jobs found no culprit — i.e. the suspected flake
      # build was a false positive.
      flake_analysis.result_status = result_status.NOT_FOUND_UNTRIAGED

  flake_analysis.put()
| 68 | |
| 69 | |
class RecursiveFlakeTryJobPipeline(BasePipeline):
  """Starts a series of flake try jobs to identify the exact culprit."""

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, urlsafe_flake_analysis_key, commit_position, revision):
    """Runs a try job at a revision to determine its flakiness.

    Args:
      urlsafe_flake_analysis_key (str): The urlsafe-key of the flake analysis
          for which the try jobs are to analyze.
      commit_position (int): The commit position corresponding to |revision| to
          analyze.
      revision (str): The revision to run the try job against corresponding to
          |commit_position|.
    """
    flake_analysis = ndb.Key(urlsafe=urlsafe_flake_analysis_key).get()
    assert flake_analysis

    if (flake_analysis.error or
        flake_analysis.status != analysis_status.COMPLETED):
      # Don't start try jobs if the flake swarming tasks had an error or are
      # not done yet.
      return

    try_job = FlakeTryJob.Get(
        flake_analysis.master_name, flake_analysis.builder_name,
        flake_analysis.step_name, flake_analysis.test_name, revision)

    if try_job:  # pragma: no cover
      if try_job.failed:
        # A previously-failed try job is reset to PENDING so it can be rerun
        # by this analysis.
        try_job.status = analysis_status.PENDING
        try_job.put()
    else:
      try_job = FlakeTryJob.Create(
          flake_analysis.master_name, flake_analysis.builder_name,
          flake_analysis.step_name, flake_analysis.test_name, revision)
      try_job.put()

    if flake_analysis.try_job_status is None:  # pragma: no branch
      # First try job of this analysis: mark the try-job phase as running.
      flake_analysis.try_job_status = analysis_status.RUNNING
      flake_analysis.put()

    with pipeline.InOrder():
      try_job_id = yield ScheduleFlakeTryJobPipeline(
          flake_analysis.master_name, flake_analysis.builder_name,
          flake_analysis.step_name, flake_analysis.test_name, revision)

      try_job_result = yield MonitorTryJobPipeline(
          try_job.key.urlsafe(), failure_type.FLAKY_TEST, try_job_id)

      yield ProcessFlakeTryJobResultPipeline(
          revision, commit_position, try_job_result, try_job.key.urlsafe(),
          urlsafe_flake_analysis_key)

      # Bug fix: NextCommitPositionPipeline.run() takes only the analysis key
      # and the try job key; the extra |commit_position| argument previously
      # passed here did not match that signature.
      yield NextCommitPositionPipeline(
          urlsafe_flake_analysis_key, try_job.key.urlsafe())
| 127 | |
| 128 | |
| 129 def _IsStable(pass_rate, lower_flake_threshold, upper_flake_threshold): | |
| 130 return ( | |
| 131 pass_rate < lower_flake_threshold or pass_rate > upper_flake_threshold) | |
| 132 | |
| 133 | |
def _GetNextCommitPosition(data_points, flake_settings,
                           lower_boundary_commit_position):
  """Finds the next commit_position to analyze, or gets final result.

  Args:
    data_points (list): Already-completed data points. Assumed to be sorted by
        commit_position in descending order (as produced by
        _GetTryJobDataPoints) — TODO confirm all callers guarantee this.
    flake_settings (dict): Parameters for flakiness algorithm. Expected keys:
        lower_flake_threshold, upper_flake_threshold, max_stable_in_a_row,
        max_flake_in_a_row, max_dive_in_a_row, dive_rate_threshold.
    lower_boundary_commit_position (int): The commit position not to pass when
        looking back.

  Returns:
    (next_commit_position, suspected_commit_position): The commit position of
        the next revision to check and suspected commit position that that the
        flakiness was introduced in. If next_commit_position needs to be
        checked, suspected_commit_position will be None. If
        suspected_commit_position is found, next_commit_position will be
        None. If no findings eventually, both will be None.
  """
  lower_flake_threshold = flake_settings.get('lower_flake_threshold')
  upper_flake_threshold = flake_settings.get('upper_flake_threshold')
  max_stable_in_a_row = flake_settings.get('max_stable_in_a_row')
  max_flake_in_a_row = flake_settings.get('max_flake_in_a_row')
  max_dive_in_a_row = flake_settings.get('max_dive_in_a_row')
  dive_rate_threshold = flake_settings.get('dive_rate_threshold')

  # Running state while scanning the (descending) data points.
  stables_in_a_row = 0
  flakes_in_a_row = 0
  dives_in_a_row = 0
  stables_happened = False
  flakes_first = 0
  flaked_out = False
  # NOTE(review): If |data_points| is empty, this stays None and the final
  # comparison against |lower_boundary_commit_position| relies on Python 2's
  # None-vs-int ordering — verify callers never pass an empty list.
  next_commit_position = None

  total_data_points = len(data_points)

  # Python 2 codebase (xrange); data points are walked from the newest commit
  # position toward the oldest.
  for i in xrange(total_data_points):
    pass_rate = data_points[i].pass_rate
    commit_position = data_points[i].commit_position

    if pass_rate < 0:  # Test doesn't exist at this revision.
      if flaked_out or flakes_first:
        # A flaky region was already seen; treat the nonexistent test as
        # stable and narrow to just after the last stable point.
        stables_in_a_row += 1
        lower_boundary = data_points[i - stables_in_a_row + 1].commit_position
        return lower_boundary + 1, None
      else:
        # Test never existed and no flakiness observed: nothing to find.
        return None, None
    elif _IsStable(pass_rate, lower_flake_threshold, upper_flake_threshold):
      stables_in_a_row += 1
      flakes_in_a_row = 0
      dives_in_a_row = 0
      stables_happened = True

      # These cases are not needed for try jobs.
      if stables_in_a_row <= max_stable_in_a_row:  # pragma: no cover.
        # No stable region yet, keep searching.
        next_commit_position = commit_position - 1
        continue
      # Stable region found.
      if not flaked_out and not flakes_first:  # pragma: no cover.
        # Already stabled_out but no flake region yet, no findings.
        return None, None

      # Flake region is also found, ready for sequential search.
      lower_boundary_index = i - stables_in_a_row + 1
      lower_boundary = data_points[lower_boundary_index].commit_position
      previous_commit_position = data_points[
          lower_boundary_index - 1].commit_position

      if previous_commit_position == lower_boundary + 1:
        # Sequential search is Done: the flaky point directly follows the
        # stable boundary, so it is the suspect.
        return None, previous_commit_position
      # Continue sequential search.
      return lower_boundary + 1, None

    else:  # Flaky result.
      flakes_in_a_row += 1
      stables_in_a_row = 0

      if flakes_in_a_row > max_flake_in_a_row:  # Identified a flaky region.
        flaked_out = True

      if not stables_happened:  # pragma: no branch
        # No stables yet.
        flakes_first += 1

      if commit_position == lower_boundary_commit_position:  # pragma: no branch
        # The earliest commit_position to look back is already flaky. This is
        # the culprit.
        return None, commit_position

      # Check the pass_rate of previous run, if this is the first data_point,
      # consider the virtual previous run is stable (pass rate 0).
      previous_pass_rate = data_points[i - 1].pass_rate if i > 0 else 0
      if _IsStable(
          previous_pass_rate, lower_flake_threshold, upper_flake_threshold):
        # Exponential-style lookback: step back by the flake streak length.
        next_commit_position = commit_position - flakes_in_a_row
        continue

      # Checks for dives. A dive is a sudden drop in pass rate.
      if pass_rate - previous_pass_rate > dive_rate_threshold:
        # Possibly a dive just happened.
        # Set dives_in_a_row to one since this is the first sign of diving.
        # For cases where we have pass rates like 0.1, 0.51, 0.92, we will use
        # the earliest dive.
        dives_in_a_row = 1
      elif previous_pass_rate - pass_rate > dive_rate_threshold:
        # A rise just happened, sets dives_in_a_row back to 0.
        dives_in_a_row = 0
      else:
        # Two last results are close, increases dives_in_a_row if not 0.
        dives_in_a_row = dives_in_a_row + 1 if dives_in_a_row else 0

      if dives_in_a_row <= max_dive_in_a_row:
        # While diving, step back one commit at a time; otherwise keep the
        # larger flake-streak step.
        step_size = 1 if dives_in_a_row else flakes_in_a_row
        next_commit_position = commit_position - step_size
        continue

      # Dived out.
      # Flake region must have been found, ready for sequential search.
      lower_boundary_index = i - dives_in_a_row + 1
      lower_boundary = data_points[lower_boundary_index].commit_position
      commit_after_lower_boundary = (
          data_points[lower_boundary_index - 1].commit_position)

      if commit_after_lower_boundary == lower_boundary + 1:
        # Sequential search is Done.
        return None, commit_after_lower_boundary
      # Sequential search.
      return lower_boundary + 1, None

  if next_commit_position < lower_boundary_commit_position:
    # Do not run past the bounds of the blame list.
    return lower_boundary_commit_position, None

  return next_commit_position, None
| 269 | |
| 270 | |
| 271 def _GetTryJobDataPoints(analysis): | |
| 272 """Gets which data points should be used to determine the next revision. | |
| 273 | |
| 274 Args: | |
| 275 all_data_points (list): A list of data points already analyzed and stored | |
| 276 in a MasterFlakeAnalysis entity. | |
| 277 | |
| 278 Returns: | |
| 279 A list of data points used to analyze and determine what try job to trigger | |
| 280 next. | |
| 281 """ | |
| 282 all_data_points = analysis.data_points | |
| 283 | |
| 284 # Include the suspected build itself first, which already has a result. | |
| 285 data_points = [analysis.GetDataPointOfSuspectedBuild()] | |
| 286 | |
| 287 for i in range(0, len(all_data_points)): | |
| 288 if all_data_points[i].try_job_id is not None: | |
| 289 data_points.append(all_data_points[i]) | |
| 290 | |
| 291 return sorted(data_points, key=lambda k: k.commit_position, reverse=True) | |
| 292 | |
| 293 | |
class NextCommitPositionPipeline(BasePipeline):
  """Returns the next index in the blame list to run a try job on."""

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, urlsafe_flake_analysis_key, urlsafe_try_job_key,
          commit_position=None):
    """Determines the next commit position to run a try job on.

    Args:
      urlsafe_flake_analysis_key (str): The url-safe key to the corresponding
          flake analysis that triggered this pipeline.
      urlsafe_try_job_key (str): The url-safe key to the try job that was just
          run.
      commit_position (int): The commit position the just-completed try job
          ran against. Unused, but accepted so callers that pass it (e.g.
          RecursiveFlakeTryJobPipeline) do not break on the signature.
    """
    flake_analysis = ndb.Key(urlsafe=urlsafe_flake_analysis_key).get()
    try_job = ndb.Key(urlsafe=urlsafe_try_job_key).get()
    assert flake_analysis
    assert try_job

    # Don't call another pipeline if the previous try job failed.
    if try_job.status == analysis_status.ERROR:
      error = try_job.error or {
          'error': 'Try job %s failed' % try_job.try_job_id,
          'message': 'The last try job did not complete as expected'
      }
      _UpdateAnalysisTryJobStatusUponCompletion(
          flake_analysis, None, analysis_status.ERROR, error)
      return

    # TODO(lijeffrey) Move parameters to config.
    flake_settings = {
        'lower_flake_threshold': 0.02,
        'upper_flake_threshold': 0.98,
        'max_flake_in_a_row': 1,
        'max_stable_in_a_row': 0,
        'max_dive_in_a_row': 4,
        'dive_rate_threshold': 0.4,
    }

    suspected_build_data_point = flake_analysis.GetDataPointOfSuspectedBuild()
    # Never look back past the blame list of the suspected build.
    lower_boundary_commit_position = (
        suspected_build_data_point.previous_build_commit_position + 1)

    # Because |suspected_build_data_point| already sets hard lower and upper
    # bounds, only the data points involved in try jobs should be considered
    # when determining the next commit position to test.
    try_job_data_points = _GetTryJobDataPoints(flake_analysis)

    # Figure out what commit position to trigger the next try job on, if any.
    next_commit_position, suspected_commit_position = _GetNextCommitPosition(
        try_job_data_points, flake_settings, lower_boundary_commit_position)

    if (next_commit_position is None or
        next_commit_position >= suspected_build_data_point.commit_position):
      # Finished.
      if next_commit_position == suspected_build_data_point.commit_position:
        suspected_commit_position = next_commit_position

      # Bug fix: only create a culprit when a suspected commit position was
      # actually identified. When there are no findings, pass None so that
      # _UpdateAnalysisTryJobStatusUponCompletion records
      # NOT_FOUND_UNTRIAGED rather than a bogus culprit at a None revision.
      culprit = None
      if suspected_commit_position is not None:
        culprit_revision = (
            suspected_build_data_point.GetRevisionAtCommitPosition(
                suspected_commit_position))
        culprit = _CreateCulprit(culprit_revision, suspected_commit_position)

      _UpdateAnalysisTryJobStatusUponCompletion(
          flake_analysis, culprit, analysis_status.COMPLETED, None)
      return

    next_revision = suspected_build_data_point.GetRevisionAtCommitPosition(
        next_commit_position)

    # Recurse on the next revision, running on the waterfall backend module.
    pipeline_job = RecursiveFlakeTryJobPipeline(
        urlsafe_flake_analysis_key, next_commit_position, next_revision)
    pipeline_job.target = appengine_util.GetTargetNameForModule(
        constants.WATERFALL_BACKEND)
    pipeline_job.start()
| OLD | NEW |