appengine/chromium_try_flakes/handlers/flake_issues.py - Issue 1660043002: Move flaky run processing into a taskqueue

Side by Side Diff: appengine/chromium_try_flakes/handlers/flake_issues.py

Issue 1660043002: Move flaky run processing into a taskqueue (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Added new files Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 """Task queue endpoints for creating and updating issues on issue tracker."""	5 """Task queue endpoints for creating and updating issues on issue tracker."""

6	6

7 import datetime	7 import datetime

	8 import json

8 import logging	9 import logging

9 import webapp2	10 import webapp2

10	11

11 from google.appengine.api import app_identity	12 from google.appengine.api import app_identity

12 from google.appengine.api import taskqueue	13 from google.appengine.api import taskqueue

	14 from google.appengine.api import urlfetch

13 from google.appengine.ext import ndb	15 from google.appengine.ext import ndb

14	16

	17 from infra_libs import ts_mon

15 from issue_tracker import issue_tracker_api, issue	18 from issue_tracker import issue_tracker_api, issue

16 from model.flake import FlakeUpdateSingleton, FlakeUpdate, Flake	19 from model.flake import (

17 from infra_libs import ts_mon	20 Flake, FlakeOccurrence, FlakeUpdate, FlakeUpdateSingleton, FlakyRun)

	21 from status import build_result, util

18	22

19	23

20 MAX_UPDATED_ISSUES_PER_DAY = 50	24 MAX_UPDATED_ISSUES_PER_DAY = 50

21 MAX_TIME_DIFFERENCE_SECONDS = 12 * 60 * 60	25 MAX_TIME_DIFFERENCE_SECONDS = 12 * 60 * 60

22 MIN_REQUIRED_FLAKY_RUNS = 5	26 MIN_REQUIRED_FLAKY_RUNS = 5

23 DAYS_TILL_STALE = 3	27 DAYS_TILL_STALE = 3

24 USE_MONORAIL = False	28 USE_MONORAIL = False

25 DAYS_TO_REOPEN_ISSUE = 3	29 DAYS_TO_REOPEN_ISSUE = 3

26 FLAKY_RUNS_TEMPLATE = (	30 FLAKY_RUNS_TEMPLATE = (

27 'Detected %(new_flakes_count)d new flakes for test/step "%(name)s". To see '	31 'Detected %(new_flakes_count)d new flakes for test/step "%(name)s". To see '

(...skipping 299 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
327 api.update(flake_issue, comment=BACK_TO_SHERIFF_MESSAGE)	331 api.update(flake_issue, comment=BACK_TO_SHERIFF_MESSAGE)

328 return	332 return

329	333

330 # Report to stale-flakes-reports@ if the issue has no updates for 7 days.	334 # Report to stale-flakes-reports@ if the issue has no updates for 7 days.

331 week_ago = now - datetime.timedelta(days=7)	335 week_ago = now - datetime.timedelta(days=7)

332 if (last_third_party_update < week_ago and	336 if (last_third_party_update < week_ago and

333 STALE_FLAKES_ML not in flake_issue.cc):	337 STALE_FLAKES_ML not in flake_issue.cc):

334 flake_issue.cc.append(STALE_FLAKES_ML)	338 flake_issue.cc.append(STALE_FLAKES_ML)

335 logging.info('Reporting issue %s to %s', flake_issue.id, STALE_FLAKES_ML)	339 logging.info('Reporting issue %s to %s', flake_issue.id, STALE_FLAKES_ML)

336 api.update(flake_issue, comment=VERY_STALE_FLAKES_MESSAGE)	340 api.update(flake_issue, comment=VERY_STALE_FLAKES_MESSAGE)

	341

	342

	343 class CreateFlakyRun(webapp2.RequestHandler):

	344 # We execute the method below in an independent transaction since otherwise we

	345 # would exceed the maximum number of entities allowed within a single

	346 # transaction.
	tandrii(chromium) 2016/02/02 21:54:18: I do not fully understand why, though I believe you. I'd certainly recommend that you get someone with more AE experience to review this bit.
	Paweł Hajdan Jr. 2016/02/03 09:09:51 (in reply to tandrii): Yes, AE has limits on transactions, not all of which even used to be documented. This looks fine to me so far.
	@staticmethod
	# pylint: disable=E1120
	@ndb.transactional(xg=True, propagation=ndb.TransactionOptions.INDEPENDENT)
	def add_failure_to_flake(name, flaky_run_key, failure_time):
	  """Attach a flaky run to the Flake entity called |name|.

	  The decorator runs this in its own independent cross-group transaction,
	  so each flake is committed separately from the caller's transaction.

	  Args:
	    name: Flake name; also used as the entity id.
	    flaky_run_key: Key of the stored FlakyRun to add to the occurrences.
	    failure_time: Time of the failure, handed to
	        util.add_occurrence_time_to_flake together with the flake.
	  """
	  existing = Flake.get_by_id(name)
	  if existing:
	    record = existing
	  else:
	    # First time this flake is seen: create and store it straight away.
	    record = Flake(name=name, id=name, last_time_seen=datetime.datetime.min)
	    record.put()

	  record.occurrences.append(flaky_run_key)
	  util.add_occurrence_time_to_flake(record, failure_time)
	  record.put()

	359

	360 # see examples:

	361 # compile http://build.chromium.org/p/tryserver.chromium.mac/json/builders/

	362 # mac_chromium_compile_dbg/builds/11167?as_text=1

	363 # gtest http://build.chromium.org/p/tryserver.chromium.win/json/builders/

	364 # win_chromium_x64_rel_swarming/builds/4357?as_text=1

	365 # TODO(jam): get specific problem with compile so we can use that as name

	366 # TODO(jam): It's unfortunate to have to parse this html. Can we get it from
	Paweł Hajdan Jr. 2016/02/03 09:09:51: I know this is pre-existing, but IMHO it is important in general (not in scope for this patch, of course). Consider that with the move to DM we'd no longer have buildbot JSON/HTML anyway. Having some independent mechanism to avoid this parsing would be very useful. It'd also help make communication between the recipe and this app more precise - currently there seem to be lots of "fixups" that wouldn't necessarily be needed otherwise.
	Sergiy Byelozyorov 2016/02/03 09:15:56 (in reply to Paweł Hajdan Jr.): I was actually planning to move to Buildbot JSON + GTest JSON output very soon. I am not sure what DM will offer, but we will have to fix it one way or another.
	367 # another place instead of the tryserver's json?

	368 @staticmethod

	369 def get_flakes(step):

	370 combined = ' '.join(step['text'])

	371

	372 # If test results were invalid, report whole step as flaky.

	373 if 'TEST RESULTS WERE INVALID' in combined:

	374 return [combined]

	375

	376 #gtest

	377 gtest_search_str = 'failures:<br/>'

	378 gtest_search_index = combined.find(gtest_search_str)

	379 if gtest_search_index != -1:

	380 failures = combined[gtest_search_index + len(gtest_search_str):]

	381 failures = failures.split('<br/>')

	382 results = []

	383 for failure in failures:

	384 if not failure:

	385 continue

	386 if failure == 'ignored:':

	387 break # layout test output

	388 results.append(failure)

	389 return results

	390

	391 #gpu

	392 gpu_search_str = '&tests='

	393 gpu_search_index = combined.find(gpu_search_str)

	394 if gpu_search_index != -1:

	395 failures = combined[gpu_search_index + len(gpu_search_str):]

	396 end_index = failures.find('">')

	397 failures = failures[:end_index ]

	398 failures = failures.split(',')

	399 results = []

	400 for failure in failures:

	401 if not failure:

	402 continue

	403 results.append(failure)

	404 return results

	405

	406 return [combined]

	407

	@ndb.transactional(xg=True)  # pylint: disable=E1120
	def post(self):
	  """Create a FlakyRun for a failed build run that later succeeded.

	  Request parameters:
	    failure_run_key: urlsafe ndb key of the failed build run.
	    success_run_key: urlsafe ndb key of the successful build run.

	  The handler fetches the failed build's JSON from build.chromium.org,
	  collects the failed steps that are not known summary/duplicate steps,
	  extracts the individual failures from each of them with get_flakes(),
	  stores a FlakyRun holding one FlakeOccurrence per failure and finally
	  registers the run on every affected Flake via add_failure_to_flake().

	  Responds with status 400 when a parameter is missing or does not resolve
	  to a stored entity. Returns without storing anything when the build JSON
	  cannot be decoded.
	  """
	  failure_run_key = self.request.get('failure_run_key')
	  success_run_key = self.request.get('success_run_key')
	  if not failure_run_key or not success_run_key:
	    self.response.set_status(400, 'Invalid request parameters')
	    return

	  failure_run = ndb.Key(urlsafe=failure_run_key).get()
	  success_run = ndb.Key(urlsafe=success_run_key).get()
	  # Key.get() yields None for a key without a stored entity; reject that
	  # explicitly instead of failing with an AttributeError further down.
	  if failure_run is None or success_run is None:
	    logging.error(
	        'Build run not found: failure_run_key=%s, success_run_key=%s',
	        failure_run_key, success_run_key)
	    self.response.set_status(400, 'Invalid request parameters')
	    return

	  flaky_run = FlakyRun(
	      failure_run=failure_run.key,
	      failure_run_time_started=failure_run.time_started,
	      failure_run_time_finished=failure_run.time_finished,
	      success_run=success_run.key)

	  success_time = success_run.time_finished
	  failure_time = failure_run.time_finished
	  patchset_builder_runs = failure_run.key.parent().get()
	  url = ('http://build.chromium.org/p/' + patchset_builder_runs.master +
	         '/json/builders/' + patchset_builder_runs.builder + '/builds/' +
	         str(failure_run.buildnumber))
	  urlfetch.set_default_fetch_deadline(60)
	  logging.info('get_flaky_run_reason %s', url)
	  content = urlfetch.fetch(url).content
	  try:
	    build_json = json.loads(content)
	  except ValueError:
	    logging.exception('couldnt decode json for %s', url)
	    return

	  # Keep only the failed steps that may represent a real flake.
	  failed_steps = []
	  for step in build_json['steps']:
	    step_result = step['results'][0]
	    if (build_result.isResultSuccess(step_result) or
	        not build_result.isResultFailure(step_result)):
	      continue
	    if self._is_ignored_failure(
	        step['name'], ' '.join(step['text']), success_time, failure_time):
	      continue
	    failed_steps.append(step)

	  failed_names = set(step['name'] for step in failed_steps)
	  steps_to_ignore = set()
	  for step in failed_steps:
	    step_name = step['name']
	    if ' (with patch)' not in step_name:
	      continue

	    # Android instrumentation tests add a prefix before the step name, which
	    # doesn't appear on the summary step (without suffixes). To make sure we
	    # correctly ignore duplicate failures, we remove the prefix.
	    step_name = step_name.replace('Instrumentation test ', '')
	    step_name_with_no_modifier = step_name.replace(' (with patch)', '')

	    # A step which fails, and then is retried and also fails, will have
	    # its name without the ' (with patch)' again. Don't double count.
	    if step_name_with_no_modifier in failed_names:
	      steps_to_ignore.add(step_name_with_no_modifier)

	    # If a step fails without the patch, then the tree is busted. Don't
	    # count as flake.
	    step_name_without_patch = (
	        '%s (without patch)' % step_name_with_no_modifier)
	    if step_name_without_patch in failed_names:
	      steps_to_ignore.add(step['name'])
	      steps_to_ignore.add(step_name_without_patch)

	  flakes_to_update = []
	  for step in failed_steps:
	    step_name = step['name']
	    if step_name in steps_to_ignore:
	      continue
	    for flake in self.get_flakes(step):
	      flaky_run.flakes.append(
	          FlakeOccurrence(name=step_name, failure=flake))
	      flakes_to_update.append(flake)

	  flaky_run_key = flaky_run.put()
	  for flake in flakes_to_update:
	    self.add_failure_to_flake(flake, flaky_run_key, failure_time)

	@staticmethod
	def _is_ignored_failure(step_name, step_text, success_time, failure_time):
	  """Tell whether a failed step must not be counted as a flake.

	  The following step failures are ignored:
	   - steps: always red when any other step is red (not a failure)
	   - [swarming] ...: summary step would also be red (do not double count)
	   - presubmit: typically red due to missing OWNERs LGTM, not a flake
	   - recipe failure reason: always red when build fails (not a failure)
	   - Patch failure: if success run was before failure run, it is
	     likely a legitimate failure. For example it often happens that
	     developers use CQ dry run and then wait for a review. Once getting
	     LGTM they check CQ checkbox, but the patch does not cleanly apply
	     anymore.
	   - bot_update PATCH FAILED: Corresponds to 'Patch failure' step.
	   - test results: always red when another step is red (not a failure)
	   - Uncaught Exception: summary step referring to an exception in another
	     step (e.g. bot_update)

	  Args:
	    step_name: Name of the failed step.
	    step_text: The step's text fragments joined with spaces.
	    success_time: Finish time of the successful run.
	    failure_time: Finish time of the failed run.

	  Returns:
	    True if the step should be skipped, False otherwise.
	  """
	  if step_name in ('steps', 'presubmit', 'recipe failure reason',
	                   'test results', 'Uncaught Exception'):
	    return True
	  if step_name.startswith('[swarming]'):
	    return True
	  if step_name == 'Patch failure':
	    return success_time < failure_time
	  if step_name == 'bot_update':
	    return 'PATCH FAILED' in step_text
	  return False

OLD	NEW

« no previous file with comments | « no previous file | appengine/chromium_try_flakes/handlers/index.py » ('j') | appengine/chromium_try_flakes/status/util.py » ('J')