Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | |
| 2 # Use of this source code is governed by a BSD-style license that can be | |
| 3 # found in the LICENSE file. | |
| 4 | |
| 5 import logging | |
| 6 | |
| 7 from google.appengine.ext import ndb | |
| 8 | |
| 9 from gae_libs.http.http_client_appengine import HttpClientAppengine | |
| 10 from gae_libs.gitiles.cached_gitiles_repository import CachedGitilesRepository | |
| 11 from libs import time_util | |
| 12 | |
| 13 from common import appengine_util | |
| 14 from common import constants | |
| 15 from common.pipeline_wrapper import BasePipeline | |
| 16 from common.pipeline_wrapper import pipeline | |
| 17 from common.waterfall import failure_type | |
| 18 from model import analysis_status | |
| 19 from model import result_status | |
| 20 from model.flake.flake_culprit import FlakeCulprit | |
| 21 from model.flake.flake_try_job import FlakeTryJob | |
| 22 from waterfall.flake.process_flake_try_job_result_pipeline import ( | |
| 23 ProcessFlakeTryJobResultPipeline) | |
| 24 from waterfall.monitor_try_job_pipeline import MonitorTryJobPipeline | |
| 25 from waterfall.flake.schedule_flake_try_job_pipeline import ( | |
| 26 ScheduleFlakeTryJobPipeline) | |
| 27 | |
| 28 | |
# TODO(lijeffrey): The lookback algorithms in RecursiveFlakePipeline and
# RecursiveFlakeTryJob are intended to be identical. Refactor both files to
# use a shared base algorithm.
| 32 | |
| 33 | |
| 34 _GIT_REPO = CachedGitilesRepository( | |
| 35 HttpClientAppengine(), | |
| 36 'https://chromium.googlesource.com/chromium/src.git') | |
| 37 | |
| 38 | |
def _CreateCulprit(revision, commit_position, repo_name='chromium'):
  """Creates a FlakeCulprit for the given revision.

  Args:
    revision (str): The git revision suspected to have introduced flakiness.
    commit_position (int): The commit position corresponding to |revision|.
    repo_name (str): The repository name, defaulting to 'chromium'.

  Returns:
    An unsaved FlakeCulprit entity. Its url is the code review url (preferred)
    or commit url of the revision's change log, or None if the change log
    could not be fetched.
  """
  change_log = _GIT_REPO.GetChangeLog(revision)

  url = None
  if change_log:
    url = change_log.code_review_url or change_log.commit_url
  else:
    logging.error('Unable to retrieve change logs for %s', revision)

  return FlakeCulprit.Create(repo_name, revision, commit_position, url)
| 52 | |
| 53 | |
def _UpdateAnalysisTryJobStatusUponCompletion(
    flake_analysis, culprit, status, error):
  """Records the final try-job outcome on the analysis entity and saves it.

  Args:
    flake_analysis (MasterFlakeAnalysis): The analysis entity to update.
    culprit (FlakeCulprit): The identified culprit, or None if none was found.
    status (int): The final analysis_status value for the try-job phase.
    error (dict): Error details if the try jobs failed, else None.
  """
  flake_analysis.end_time = time_util.GetUTCNow()
  flake_analysis.try_job_status = status

  if error:
    flake_analysis.error = error
  else:
    if culprit:
      flake_analysis.culprit = culprit
      flake_analysis.result_status = result_status.FOUND_UNTRIAGED
    else:
      # NOTE(review): Reaching this branch means a suspected flake build was
      # identified, but try jobs found no culprit — i.e. the suspected flake
      # build was a false positive.
      flake_analysis.result_status = result_status.NOT_FOUND_UNTRIAGED

  flake_analysis.put()
| 68 | |
| 69 | |
class RecursiveFlakeTryJobPipeline(BasePipeline):
  """Starts a series of flake try jobs to identify the exact culprit."""

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, urlsafe_flake_analysis_key, commit_position, revision):
    """Runs a try job at a revision to determine its flakiness.

    Args:
      urlsafe_flake_analysis_key (str): The urlsafe-key of the flake analysis
          for which the try jobs are to analyze.
      commit_position (int): The commit position corresponding to |revision| to
          analyze.
      revision (str): The revision to run the try job against corresponding to
          |commit_position|.
    """
    flake_analysis = ndb.Key(urlsafe=urlsafe_flake_analysis_key).get()
    assert flake_analysis

    if (flake_analysis.error or
        flake_analysis.status != analysis_status.COMPLETED):
      # Don't start try jobs if the flake swarming tasks had an error or are
      # not done yet.
      return

    try_job = FlakeTryJob.Get(
        flake_analysis.master_name, flake_analysis.builder_name,
        flake_analysis.step_name, flake_analysis.test_name, revision)

    if try_job:  # pragma: no cover
      if try_job.failed:
        # A previously-failed try job is reset to PENDING so it can be rerun
        # by this analysis.
        try_job.status = analysis_status.PENDING
        try_job.put()
    else:
      try_job = FlakeTryJob.Create(
          flake_analysis.master_name, flake_analysis.builder_name,
          flake_analysis.step_name, flake_analysis.test_name, revision)
      try_job.put()

    if flake_analysis.try_job_status is None:  # pragma: no branch
      # First try job of this analysis: mark the try-job phase as running.
      flake_analysis.try_job_status = analysis_status.RUNNING
      flake_analysis.put()

    with pipeline.InOrder():
      try_job_id = yield ScheduleFlakeTryJobPipeline(
          flake_analysis.master_name, flake_analysis.builder_name,
          flake_analysis.step_name, flake_analysis.test_name, revision)

      try_job_result = yield MonitorTryJobPipeline(
          try_job.key.urlsafe(), failure_type.FLAKY_TEST, try_job_id)

      yield ProcessFlakeTryJobResultPipeline(
          revision, commit_position, try_job_result, try_job.key.urlsafe(),
          urlsafe_flake_analysis_key)

      # Bug fix: NextCommitPositionPipeline.run() takes only the analysis key
      # and the try job key; the extra |commit_position| argument previously
      # passed here did not match that signature.
      yield NextCommitPositionPipeline(
          urlsafe_flake_analysis_key, try_job.key.urlsafe())
| 127 | |
| 128 | |
| 129 def _IsStable(pass_rate, lower_flake_threshold, upper_flake_threshold): | |
| 130 return ( | |
| 131 pass_rate < lower_flake_threshold or pass_rate > upper_flake_threshold) | |
| 132 | |
| 133 | |
def _GetNextCommitPosition(data_points, flake_settings,
                           lower_boundary_commit_position):
  """Finds the next commit_position to analyze, or gets final result.

  Args:
    data_points (list): Already-completed data points. Assumed to be sorted by
        commit_position in descending order (as produced by
        _GetTryJobDataPoints) — TODO confirm all callers guarantee this.
    flake_settings (dict): Parameters for flakiness algorithm. Expected keys:
        lower_flake_threshold, upper_flake_threshold, max_stable_in_a_row,
        max_flake_in_a_row, max_dive_in_a_row, dive_rate_threshold.
    lower_boundary_commit_position (int): The commit position not to pass when
        looking back.

  Returns:
    (next_commit_position, suspected_commit_position): The commit position of
        the next revision to check and suspected commit position that that the
        flakiness was introduced in. If next_commit_position needs to be
        checked, suspected_commit_position will be None. If
        suspected_commit_position is found, next_commit_position will be
        None. If no findings eventually, both will be None.
  """
  lower_flake_threshold = flake_settings.get('lower_flake_threshold')
  upper_flake_threshold = flake_settings.get('upper_flake_threshold')
  max_stable_in_a_row = flake_settings.get('max_stable_in_a_row')
  max_flake_in_a_row = flake_settings.get('max_flake_in_a_row')
  max_dive_in_a_row = flake_settings.get('max_dive_in_a_row')
  dive_rate_threshold = flake_settings.get('dive_rate_threshold')

  # Running state while scanning the (descending) data points.
  stables_in_a_row = 0
  flakes_in_a_row = 0
  dives_in_a_row = 0
  stables_happened = False
  flakes_first = 0
  flaked_out = False
  # NOTE(review): If |data_points| is empty, this stays None and the final
  # comparison against |lower_boundary_commit_position| relies on Python 2's
  # None-vs-int ordering — verify callers never pass an empty list.
  next_commit_position = None

  total_data_points = len(data_points)

  # Python 2 codebase (xrange); data points are walked from the newest commit
  # position toward the oldest.
  for i in xrange(total_data_points):
    pass_rate = data_points[i].pass_rate
    commit_position = data_points[i].commit_position

    if pass_rate < 0:  # Test doesn't exist at this revision.
      if flaked_out or flakes_first:
        # A flaky region was already seen; treat the nonexistent test as
        # stable and narrow to just after the last stable point.
        stables_in_a_row += 1
        lower_boundary = data_points[i - stables_in_a_row + 1].commit_position
        return lower_boundary + 1, None
      else:
        # Test never existed and no flakiness observed: nothing to find.
        return None, None
    elif _IsStable(pass_rate, lower_flake_threshold, upper_flake_threshold):
      stables_in_a_row += 1
      flakes_in_a_row = 0
      dives_in_a_row = 0
      stables_happened = True

      # These cases are not needed for try jobs.
      if stables_in_a_row <= max_stable_in_a_row:  # pragma: no cover.
        # No stable region yet, keep searching.
        next_commit_position = commit_position - 1
        continue
      # Stable region found.
      if not flaked_out and not flakes_first:  # pragma: no cover.
        # Already stabled_out but no flake region yet, no findings.
        return None, None

      # Flake region is also found, ready for sequential search.
      lower_boundary_index = i - stables_in_a_row + 1
      lower_boundary = data_points[lower_boundary_index].commit_position
      previous_commit_position = data_points[
          lower_boundary_index - 1].commit_position

      if previous_commit_position == lower_boundary + 1:
        # Sequential search is Done: the flaky point directly follows the
        # stable boundary, so it is the suspect.
        return None, previous_commit_position
      # Continue sequential search.
      return lower_boundary + 1, None

    else:  # Flaky result.
      flakes_in_a_row += 1
      stables_in_a_row = 0

      if flakes_in_a_row > max_flake_in_a_row:  # Identified a flaky region.
        flaked_out = True

      if not stables_happened:  # pragma: no branch
        # No stables yet.
        flakes_first += 1

      if commit_position == lower_boundary_commit_position:  # pragma: no branch
        # The earliest commit_position to look back is already flaky. This is
        # the culprit.
        return None, commit_position

      # Check the pass_rate of previous run, if this is the first data_point,
      # consider the virtual previous run is stable (pass rate 0).
      previous_pass_rate = data_points[i - 1].pass_rate if i > 0 else 0
      if _IsStable(
          previous_pass_rate, lower_flake_threshold, upper_flake_threshold):
        # Exponential-style lookback: step back by the flake streak length.
        next_commit_position = commit_position - flakes_in_a_row
        continue

      # Checks for dives. A dive is a sudden drop in pass rate.
      if pass_rate - previous_pass_rate > dive_rate_threshold:
        # Possibly a dive just happened.
        # Set dives_in_a_row to one since this is the first sign of diving.
        # For cases where we have pass rates like 0.1, 0.51, 0.92, we will use
        # the earliest dive.
        dives_in_a_row = 1
      elif previous_pass_rate - pass_rate > dive_rate_threshold:
        # A rise just happened, sets dives_in_a_row back to 0.
        dives_in_a_row = 0
      else:
        # Two last results are close, increases dives_in_a_row if not 0.
        dives_in_a_row = dives_in_a_row + 1 if dives_in_a_row else 0

      if dives_in_a_row <= max_dive_in_a_row:
        # While diving, step back one commit at a time; otherwise keep the
        # larger flake-streak step.
        step_size = 1 if dives_in_a_row else flakes_in_a_row
        next_commit_position = commit_position - step_size
        continue

      # Dived out.
      # Flake region must have been found, ready for sequential search.
      lower_boundary_index = i - dives_in_a_row + 1
      lower_boundary = data_points[lower_boundary_index].commit_position
      commit_after_lower_boundary = (
          data_points[lower_boundary_index - 1].commit_position)

      if commit_after_lower_boundary == lower_boundary + 1:
        # Sequential search is Done.
        return None, commit_after_lower_boundary
      # Sequential search.
      return lower_boundary + 1, None

  if next_commit_position < lower_boundary_commit_position:
    # Do not run past the bounds of the blame list.
    return lower_boundary_commit_position, None

  return next_commit_position, None
| 269 | |
| 270 | |
| 271 def _GetTryJobDataPoints(analysis): | |
| 272 """Gets which data points should be used to determine the next revision. | |
| 273 | |
| 274 Args: | |
| 275 all_data_points (list): A list of data points already analyzed and stored | |
| 276 in a MasterFlakeAnalysis entity. | |
| 277 | |
| 278 Returns: | |
| 279 A list of data points used to analyze and determine what try job to trigger | |
| 280 next. | |
| 281 """ | |
| 282 all_data_points = analysis.data_points | |
| 283 | |
| 284 # Include the suspected build itself first, which already has a result. | |
| 285 data_points = [analysis.GetDataPointOfSuspectedBuild()] | |
| 286 | |
| 287 for i in range(0, len(all_data_points)): | |
| 288 if all_data_points[i].try_job_id is not None: | |
| 289 data_points.append(all_data_points[i]) | |
| 290 | |
| 291 return sorted(data_points, key=lambda k: k.commit_position, reverse=True) | |
| 292 | |
| 293 | |
class NextCommitPositionPipeline(BasePipeline):
  """Returns the next index in the blame list to run a try job on."""

  # Arguments number differs from overridden method - pylint: disable=W0221
  def run(self, urlsafe_flake_analysis_key, urlsafe_try_job_key,
          commit_position=None):
    """Determines the next commit position to run a try job on.

    Args:
      urlsafe_flake_analysis_key (str): The url-safe key to the corresponding
          flake analysis that triggered this pipeline.
      urlsafe_try_job_key (str): The url-safe key to the try job that was just
          run.
      commit_position (int): The commit position the just-completed try job
          ran against. Unused, but accepted so callers that pass it (e.g.
          RecursiveFlakeTryJobPipeline) do not break on the signature.
    """
    flake_analysis = ndb.Key(urlsafe=urlsafe_flake_analysis_key).get()
    try_job = ndb.Key(urlsafe=urlsafe_try_job_key).get()
    assert flake_analysis
    assert try_job

    # Don't call another pipeline if the previous try job failed.
    if try_job.status == analysis_status.ERROR:
      error = try_job.error or {
          'error': 'Try job %s failed' % try_job.try_job_id,
          'message': 'The last try job did not complete as expected'
      }
      _UpdateAnalysisTryJobStatusUponCompletion(
          flake_analysis, None, analysis_status.ERROR, error)
      return

    # TODO(lijeffrey) Move parameters to config.
    flake_settings = {
        'lower_flake_threshold': 0.02,
        'upper_flake_threshold': 0.98,
        'max_flake_in_a_row': 1,
        'max_stable_in_a_row': 0,
        'max_dive_in_a_row': 4,
        'dive_rate_threshold': 0.4,
    }

    suspected_build_data_point = flake_analysis.GetDataPointOfSuspectedBuild()
    # Never look back past the blame list of the suspected build.
    lower_boundary_commit_position = (
        suspected_build_data_point.previous_build_commit_position + 1)

    # Because |suspected_build_data_point| already sets hard lower and upper
    # bounds, only the data points involved in try jobs should be considered
    # when determining the next commit position to test.
    try_job_data_points = _GetTryJobDataPoints(flake_analysis)

    # Figure out what commit position to trigger the next try job on, if any.
    next_commit_position, suspected_commit_position = _GetNextCommitPosition(
        try_job_data_points, flake_settings, lower_boundary_commit_position)

    if (next_commit_position is None or
        next_commit_position >= suspected_build_data_point.commit_position):
      # Finished.
      if next_commit_position == suspected_build_data_point.commit_position:
        suspected_commit_position = next_commit_position

      # Bug fix: only create a culprit when a suspected commit position was
      # actually identified. When there are no findings, pass None so that
      # _UpdateAnalysisTryJobStatusUponCompletion records
      # NOT_FOUND_UNTRIAGED rather than a bogus culprit at a None revision.
      culprit = None
      if suspected_commit_position is not None:
        culprit_revision = (
            suspected_build_data_point.GetRevisionAtCommitPosition(
                suspected_commit_position))
        culprit = _CreateCulprit(culprit_revision, suspected_commit_position)

      _UpdateAnalysisTryJobStatusUponCompletion(
          flake_analysis, culprit, analysis_status.COMPLETED, None)
      return

    next_revision = suspected_build_data_point.GetRevisionAtCommitPosition(
        next_commit_position)

    # Recurse on the next revision, running on the waterfall backend module.
    pipeline_job = RecursiveFlakeTryJobPipeline(
        urlsafe_flake_analysis_key, next_commit_position, next_revision)
    pipeline_job.target = appengine_util.GetTargetNameForModule(
        constants.WATERFALL_BACKEND)
    pipeline_job.start()
| OLD | NEW |