| OLD | NEW |
| 1 # Copyright 2014 The Chromium Authors. All rights reserved. | 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import datetime | 5 import datetime |
| 6 import dateutil.parser | 6 import dateutil.parser |
| 7 import logging | 7 import logging |
| 8 import json | 8 import json |
| 9 import time | 9 import time |
| 10 import urllib2 | 10 import urllib2 |
| 11 | 11 |
| 12 from google.appengine.api import taskqueue | 12 from google.appengine.api import taskqueue |
| 13 from google.appengine.api import urlfetch | 13 from google.appengine.api import urlfetch |
| 14 from google.appengine.ext import deferred | 14 from google.appengine.ext import deferred |
| 15 from google.appengine.ext import ndb | 15 from google.appengine.ext import ndb |
| 16 from google.appengine.runtime import DeadlineExceededError | 16 from google.appengine.runtime import DeadlineExceededError |
| 17 from handlers.flake_issues import MIN_REQUIRED_FLAKY_RUNS | 17 from handlers.flake_issues import MIN_REQUIRED_FLAKY_RUNS |
| 18 from model.build_run import BuildRun | 18 from model.build_run import BuildRun |
| 19 from model.build_run import PatchsetBuilderRuns | 19 from model.build_run import PatchsetBuilderRuns |
| 20 from model.fetch_status import FetchStatus | 20 from model.fetch_status import FetchStatus |
| 21 from model.flake import Flake | 21 from model.flake import Flake |
| 22 from model.flake import FlakeOccurrence | 22 from model.flake import FlakeOccurrence |
| 23 from model.flake import FlakyRun | 23 from model.flake import FlakyRun |
| 24 from status import build_result | 24 from status import build_result, util |
| 25 import time_functions.timestamp | 25 import time_functions.timestamp |
| 26 | 26 |
| 27 | 27 |
| 28 @ndb.transactional | 28 @ndb.transactional |
| 29 def get_patchset_builder_runs(issue, patchset, master, builder): | 29 def get_patchset_builder_runs(issue, patchset, master, builder): |
| 30 patchset_builder_runs_id = PatchsetBuilderRuns.getId(issue, patchset, master, | 30 patchset_builder_runs_id = PatchsetBuilderRuns.getId(issue, patchset, master, |
| 31 builder) | 31 builder) |
| 32 | 32 |
| 33 patchset_builder_runs = PatchsetBuilderRuns.get_by_id( | 33 patchset_builder_runs = PatchsetBuilderRuns.get_by_id( |
| 34 patchset_builder_runs_id) | 34 patchset_builder_runs_id) |
| 35 if not patchset_builder_runs: | 35 if not patchset_builder_runs: |
| 36 patchset_builder_runs = PatchsetBuilderRuns(issue=issue, | 36 patchset_builder_runs = PatchsetBuilderRuns(issue=issue, |
| 37 patchset=patchset, | 37 patchset=patchset, |
| 38 master=master, | 38 master=master, |
| 39 builder=builder, | 39 builder=builder, |
| 40 id=patchset_builder_runs_id) | 40 id=patchset_builder_runs_id) |
| 41 patchset_builder_runs.put() | 41 patchset_builder_runs.put() |
| 42 | 42 |
| 43 return patchset_builder_runs | 43 return patchset_builder_runs |
| 44 | 44 |
| 45 | 45 |
| 46 def is_last_hour(date): | |
| 47 return (datetime.datetime.utcnow() - date) < datetime.timedelta(hours=1) | |
| 48 | |
| 49 | |
| 50 def is_last_day(date): | |
| 51 return (datetime.datetime.utcnow() - date) < datetime.timedelta(days=1) | |
| 52 | |
| 53 | |
| 54 def is_last_week(date): | |
| 55 return (datetime.datetime.utcnow() - date) < datetime.timedelta(weeks=1) | |
| 56 | |
| 57 | |
| 58 def is_last_month(date): | |
| 59 return (datetime.datetime.utcnow() - date) < datetime.timedelta(days=31) | |
| 60 | |
| 61 | |
| 62 # Updates a Flake object, which spans all the instances of one flake, with the | 46 # Updates a Flake object, which spans all the instances of one flake, with the |
| 63 # time of an occurrence of that flake. | 47 # time of an occurrence of that flake. |
| 64 def add_occurrence_time_to_flake(flake, occurrence_time): # pragma: no cover | |
| 65 if occurrence_time > flake.last_time_seen: | |
| 66 flake.last_time_seen = occurrence_time | |
| 67 if is_last_hour(occurrence_time): | |
| 68 flake.count_hour += 1 | |
| 69 flake.last_hour = True | |
| 70 if is_last_day(occurrence_time): | |
| 71 flake.count_day += 1 | |
| 72 flake.last_day = True | |
| 73 if is_last_week(occurrence_time): | |
| 74 flake.count_week += 1 | |
| 75 flake.last_week = True | |
| 76 if is_last_month(occurrence_time): | |
| 77 flake.count_month += 1 | |
| 78 flake.last_month = True | |
| 79 flake.count_all += 1 | |
| 80 | |
| 81 | |
| 82 # Calculate the counters for a Flake object. | 48 # Calculate the counters for a Flake object. |
| 83 def update_flake_counters(flake): # pragma: no cover | 49 def update_flake_counters(flake): # pragma: no cover |
| 84 occurrences = ndb.get_multi(flake.occurrences) | 50 occurrences = ndb.get_multi(flake.occurrences) |
| 85 flake.count_hour = 0 | 51 flake.count_hour = 0 |
| 86 flake.count_day = 0 | 52 flake.count_day = 0 |
| 87 flake.count_week = 0 | 53 flake.count_week = 0 |
| 88 flake.count_month = 0 | 54 flake.count_month = 0 |
| 89 flake.count_all = 0 | 55 flake.count_all = 0 |
| 90 flake.last_hour = False | 56 flake.last_hour = False |
| 91 flake.last_day = False | 57 flake.last_day = False |
| 92 flake.last_week = False | 58 flake.last_week = False |
| 93 flake.last_month = False | 59 flake.last_month = False |
| 94 flake.last_time_seen = datetime.datetime.min | 60 flake.last_time_seen = datetime.datetime.min |
| 95 for o in occurrences: | 61 for o in occurrences: |
| 96 add_occurrence_time_to_flake(flake, o.failure_run_time_finished) | 62 util.add_occurrence_time_to_flake(flake, o.failure_run_time_finished) |
| 97 flake.put() | 63 flake.put() |
| 98 | 64 |
| 99 | 65 |
| 100 # The following four functions are cron jobs which update the counters for | 66 # The following four functions are cron jobs which update the counters for |
| 101 # flakes. To speed things up, we don't update last month/week/day as often as we | 67 # flakes. To speed things up, we don't update last month/week/day as often as we |
| 102 # update hourly counters. | 68 # update hourly counters. |
| 103 def update_flake_hour_counter(): # pragma: no cover | 69 def update_flake_hour_counter(): # pragma: no cover |
| 104 query = Flake.query().filter(Flake.last_hour == True) | 70 query = Flake.query().filter(Flake.last_hour == True) |
| 105 for flake in query: | 71 for flake in query: |
| 106 update_flake_counters(flake) | 72 update_flake_counters(flake) |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 139 url='/issues/process/%s' % flake.key.urlsafe()) | 105 url='/issues/process/%s' % flake.key.urlsafe()) |
| 140 | 106 |
| 141 | 107 |
| 142 def update_stale_issues(): | 108 def update_stale_issues(): |
| 143 for flake in Flake.query(Flake.issue_id > 0, projection=[Flake.issue_id], | 109 for flake in Flake.query(Flake.issue_id > 0, projection=[Flake.issue_id], |
| 144 distinct=True): | 110 distinct=True): |
| 145 taskqueue.add(queue_name='issue-updates', | 111 taskqueue.add(queue_name='issue-updates', |
| 146 url='/issues/update-if-stale/%s' % flake.issue_id) | 112 url='/issues/update-if-stale/%s' % flake.issue_id) |
| 147 | 113 |
| 148 | 114 |
| 149 @ndb.transactional(xg=True) # pylint: disable=no-value-for-parameter | |
| 150 def add_failure_to_flake(name, flaky_run): | |
| 151 flake = Flake.get_by_id(name) | |
| 152 if not flake: | |
| 153 flake = Flake(name=name, id=name, last_time_seen=datetime.datetime.min) | |
| 154 flake.put() | |
| 155 | |
| 156 flake.occurrences.append(flaky_run.key) | |
| 157 | |
| 158 flaky_run_time = flaky_run.failure_run.get().time_finished | |
| 159 add_occurrence_time_to_flake(flake, flaky_run_time) | |
| 160 | |
| 161 flake.put() | |
| 162 | |
| 163 # see examples: | |
| 164 # compile http://build.chromium.org/p/tryserver.chromium.mac/json/builders/ | |
| 165 # mac_chromium_compile_dbg/builds/11167?as_text=1 | |
| 166 # gtest http://build.chromium.org/p/tryserver.chromium.win/json/builders/ | |
| 167 # win_chromium_x64_rel_swarming/builds/4357?as_text=1 | |
| 168 # TODO(jam): get specific problem with compile so we can use that as name | |
| 169 # TODO(jam): It's unfortunate to have to parse this html. Can we get it from | |
| 170 # another place instead of the tryserver's json? | |
| 171 def get_flakes(step): | |
| 172 combined = ' '.join(step['text']) | |
| 173 | |
| 174 # If test results were invalid, report whole step as flaky. | |
| 175 if 'TEST RESULTS WERE INVALID' in combined: | |
| 176 return [combined] | |
| 177 | |
| 178 #gtest | |
| 179 gtest_search_str = 'failures:<br/>' | |
| 180 gtest_search_index = combined.find(gtest_search_str) | |
| 181 if gtest_search_index != -1: | |
| 182 failures = combined[gtest_search_index + len(gtest_search_str):] | |
| 183 failures = failures.split('<br/>') | |
| 184 results = [] | |
| 185 for failure in failures: | |
| 186 if not failure: | |
| 187 continue | |
| 188 if failure == 'ignored:': | |
| 189 break # layout test output | |
| 190 results.append(failure) | |
| 191 return results | |
| 192 | |
| 193 #gpu | |
| 194 gpu_search_str = '&tests=' | |
| 195 gpu_search_index = combined.find(gpu_search_str) | |
| 196 if gpu_search_index != -1: | |
| 197 failures = combined[gpu_search_index + len(gpu_search_str):] | |
| 198 end_index = failures.find('">') | |
| 199 failures = failures[:end_index ] | |
| 200 failures = failures.split(',') | |
| 201 results = [] | |
| 202 for failure in failures: | |
| 203 if not failure: | |
| 204 continue | |
| 205 results.append(failure) | |
| 206 return results | |
| 207 | |
| 208 return [combined] | |
| 209 | |
| 210 | |
| 211 # A queued task which polls the tryserver to get more information about why a | |
| 212 # run failed. | |
| 213 def get_flaky_run_reason(flaky_run_key): | |
| 214 flaky_run = flaky_run_key.get() | |
| 215 failure_run = flaky_run.failure_run.get() | |
| 216 success_time = flaky_run.success_run.get().time_finished | |
| 217 failure_time = flaky_run.failure_run_time_finished | |
| 218 patchset_builder_runs = failure_run.key.parent().get() | |
| 219 url = ('http://build.chromium.org/p/' + patchset_builder_runs.master + | |
| 220 '/json/builders/' + patchset_builder_runs.builder +'/builds/' + | |
| 221 str(failure_run.buildnumber)) | |
| 222 urlfetch.set_default_fetch_deadline(60) | |
| 223 logging.info('get_flaky_run_reason ' + url) | |
| 224 result = urlfetch.fetch(url).content | |
| 225 try: | |
| 226 json_result = json.loads(result) | |
| 227 except ValueError: | |
| 228 logging.exception('couldnt decode json for %s', url) | |
| 229 return | |
| 230 steps = json_result['steps'] | |
| 231 | |
| 232 failed_steps = [] | |
| 233 passed_steps = [] | |
| 234 for step in steps: | |
| 235 result = step['results'][0] | |
| 236 if build_result.isResultSuccess(result): | |
| 237 passed_steps.append(step) | |
| 238 continue | |
| 239 if not build_result.isResultFailure(result): | |
| 240 continue | |
| 241 step_name = step['name'] | |
| 242 step_text = ' '.join(step['text']) | |
| 243 # The following step failures are ignored: | |
| 244 # - steps: always red when any other step is red (not actual failure) | |
| 245 # - [swarming] ...: summary step would also be red (do not double count) | |
| 246 # - presubmit: typically red due to missing OWNERs LGTM, not a flake | |
| 247 # - recipe failure reason: always red when build fails (not actual failure) | |
| 248 # - Patch failure: if success run was before failure run, it is | |
| 249 # likely a legitimate failure. For example it often happens that | |
| 250 # developers use CQ dry run and then wait for a review. Once getting LGTM | |
| 251 # they check CQ checkbox, but the patch does not cleanly apply anymore. | |
| 252 # - bot_update PATCH FAILED: Corresponds to 'Patch failure' step. | |
| 253 # - test results: always red when another step is red (not actual failure) | |
| 254 # - Uncaught Exception: summary step referring to an exception in another | |
| 255 # step (e.g. bot_update) | |
| 256 if (step_name == 'steps' or step_name.startswith('[swarming]') or | |
| 257 step_name == 'presubmit' or step_name == 'recipe failure reason' or | |
| 258 (step_name == 'Patch failure' and success_time < failure_time) or | |
| 259 (step_name == 'bot_update' and 'PATCH FAILED' in step_text) or | |
| 260 step_name == 'test results' or step_name == 'Uncaught Exception'): | |
| 261 continue | |
| 262 failed_steps.append(step) | |
| 263 | |
| 264 steps_to_ignore = [] | |
| 265 for step in failed_steps: | |
| 266 step_name = step['name'] | |
| 267 if ' (with patch)' in step_name: | |
| 268 # Android instrumentation tests add a prefix before the step name, which | |
| 269 # doesn't appear on the summary step (without suffixes). To make sure we | |
| 270 # correctly ignore duplicate failures, we remove the prefix. | |
| 271 step_name = step_name.replace('Instrumentation test ', '') | |
| 272 | |
| 273 step_name_with_no_modifier = step_name.replace(' (with patch)', '') | |
| 274 for other_step in failed_steps: | |
| 275 # A step which fails, and then is retried and also fails, will have its | |
| 276 # name without the ' (with patch)' again. Don't double count. | |
| 277 if other_step['name'] == step_name_with_no_modifier: | |
| 278 steps_to_ignore.append(other_step['name']) | |
| 279 | |
| 280 # If a step fails without the patch, then the tree is busted. Don't count | |
| 281 # as flake. | |
| 282 step_name_without_patch = step_name_with_no_modifier + ' (without patch)' | |
| 283 for other_step in failed_steps: | |
| 284 if other_step['name'] == step_name_without_patch: | |
| 285 steps_to_ignore.append(step['name']) | |
| 286 steps_to_ignore.append(other_step['name']) | |
| 287 | |
| 288 for step in failed_steps: | |
| 289 step_name = step['name'] | |
| 290 if step_name in steps_to_ignore: | |
| 291 continue | |
| 292 flakes = get_flakes(step) | |
| 293 if not flakes: | |
| 294 continue | |
| 295 for flake in flakes: | |
| 296 flake_occurrence = FlakeOccurrence(name=step_name, failure=flake) | |
| 297 flaky_run.flakes.append(flake_occurrence) | |
| 298 | |
| 299 add_failure_to_flake(flake, flaky_run) | |
| 300 flaky_run.put() | |
| 301 | |
| 302 | |
| 303 def get_int_value(properties, key): | 115 def get_int_value(properties, key): |
| 304 if not key in properties: | 116 if not key in properties: |
| 305 raise ValueError('key not found') | 117 raise ValueError('key not found') |
| 306 value = properties[key] | 118 value = properties[key] |
| 307 if type(value) == type(list()): | 119 if type(value) == type(list()): |
| 308 value = value[0] | 120 value = value[0] |
| 309 return int(value) | 121 return int(value) |
| 310 | 122 |
| 311 | 123 |
| 312 # Parses the json which we get from chromium-cq-status. | 124 # Parses the json which we get from chromium-cq-status. |
| (...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 393 if duplicate: | 205 if duplicate: |
| 394 continue | 206 continue |
| 395 | 207 |
| 396 build_run.put() | 208 build_run.put() |
| 397 | 209 |
| 398 for previous_run in previous_runs: | 210 for previous_run in previous_runs: |
| 399 if previous_run.is_success == build_run.is_success: | 211 if previous_run.is_success == build_run.is_success: |
| 400 continue | 212 continue |
| 401 if success: | 213 if success: |
| 402 # We saw the flake and then the pass. | 214 # We saw the flake and then the pass. |
| 403 flaky_run = FlakyRun( | 215 failure_run = previous_run |
| 404 failure_run=previous_run.key, | 216 success_run = build_run |
| 405 failure_run_time_started=previous_run.time_started, | |
| 406 failure_run_time_finished=previous_run.time_finished, | |
| 407 success_run=build_run.key) | |
| 408 flaky_run.put() | |
| 409 logging_output.append(previous_run.key.parent().get().builder + | |
| 410 str(previous_run.buildnumber)) | |
| 411 else: | 217 else: |
| 412 # We saw the pass and then the failure. Could happen when fetching | 218 # We saw the pass and then the failure. Could happen when fetching |
| 413 # historical data, or for the bot_update step (patch can't be | 219 # historical data, or for the bot_update step (patch can't be |
| 414 # applied cleanly anymore). | 220 # applied cleanly anymore). |
| 415 flaky_run = FlakyRun( | 221 failure_run = build_run |
| 416 failure_run=build_run.key, | 222 success_run = previous_run |
| 417 failure_run_time_started=build_run.time_started, | |
| 418 failure_run_time_finished=build_run.time_finished, | |
| 419 success_run=previous_run.key) | |
| 420 flaky_run.put() | |
| 421 logging_output.append(build_run.key.parent().get().builder + | |
| 422 str(build_run.buildnumber)) | |
| 423 | 223 |
| 424 # Queue a task to fetch the error of this failure. | 224 logging_output.append(failure_run.key.parent().get().builder + |
| 425 deferred.defer(get_flaky_run_reason, flaky_run.key) | 225 str(failure_run.buildnumber)) |
| 226 |
| 227 # Queue a task to fetch the error of this failure and create FlakyRun. |
| 228 taskqueue.add( |
| 229 queue_name='issue-updates', |
| 230 url='/issues/create_flaky_run', |
| 231 params={'failure_run_key': failure_run.key.urlsafe(), |
| 232 'success_run_key': success_run.key.urlsafe()}) |
| 426 | 233 |
| 427 return logging_output | 234 return logging_output |
| 428 | 235 |
| 429 | 236 |
| 430 def fetch_cq_status(): | 237 def fetch_cq_status(): |
| 431 """Fetches data from chromium-cq-status app and saves new data. | 238 """Fetches data from chromium-cq-status app and saves new data. |
| 432 | 239 |
| 433 Remembers old cursor and fetches new data. | 240 Remembers old cursor and fetches new data. |
| 434 """ | 241 """ |
| 435 | 242 |
| (...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 540 else: | 347 else: |
| 541 fetch_status.begin = begin | 348 fetch_status.begin = begin |
| 542 fetch_status.end = end | 349 fetch_status.end = end |
| 543 fetch_status.cursor = cursor | 350 fetch_status.cursor = cursor |
| 544 fetch_status.put() | 351 fetch_status.put() |
| 545 | 352 |
| 546 if not more: | 353 if not more: |
| 547 return # finish the cron job and wait for next iteration | 354 return # finish the cron job and wait for next iteration |
| 548 except urllib2.URLError, e: | 355 except urllib2.URLError, e: |
| 549 logging.warning('Failed to fetch CQ status: %s', e.reason) | 356 logging.warning('Failed to fetch CQ status: %s', e.reason) |
| OLD | NEW |