Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1316)

Unified Diff: appengine/findit/waterfall/extract_signal_pipeline.py

Issue 1149743002: [Findit] Use step level analysis to exclude flaky test failures. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/findit/waterfall/extract_signal_pipeline.py
diff --git a/appengine/findit/waterfall/extract_signal_pipeline.py b/appengine/findit/waterfall/extract_signal_pipeline.py
index a1c2305ece73d38a766c177b2312e25eccef6601..db6f9c687d62a904984d82540a2544170f7b5223 100644
--- a/appengine/findit/waterfall/extract_signal_pipeline.py
+++ b/appengine/findit/waterfall/extract_signal_pipeline.py
@@ -3,6 +3,7 @@
# found in the LICENSE file.
import logging
+import json
from google.appengine.api.urlfetch import ResponseTooLargeError
@@ -45,6 +46,46 @@ class ExtractSignalPipeline(BasePipeline):
else:
return log_data # pragma: no cover - this won't be reached.
+ @staticmethod
+ def _GetTestLevelFailures(step_log):
+ """Analyze the step log and extract reliable failures only.
+
+ Args:
+ step_log (file): A JSON file for failed step log.
+
+ Returns:
+ A dict like below:
+ {
+ 'test_name1': [
+ {
+ "elapsed_time_ms": ..,
+ "losless_snippet": ..,
+ "output_snippet": ..,
+ "status": "FAILURE",
+ "output_snippet_base64":..
+ },
+ ..
+ ],
+ ..
+ }
+ """
+ failed_test_log = {}
+ step_failure_data = json.loads(step_log)
+
+ for iteration in step_failure_data['gtest_results']['per_iteration_data']:
+ for key in iteration.keys(): # Keys are test names.
+ is_reliable_failure = True
+
+ for test in iteration[key]:
+ # We will ignore the test if one of the attempts passes.
+ if test['status'] != 'FAILURE':
stgao 2015/05/21 00:29:56 This code doesn't match the comment. I think statu
+ is_reliable_failure = False
+ break
+
+ if is_reliable_failure: # All attempts failed, it's a reliable failure.
+ failed_test_log[key] = iteration[key]
+
+ return failed_test_log
# Arguments number differs from overridden method - pylint: disable=W0221
def run(self, failure_info):
@@ -67,44 +108,50 @@ class ExtractSignalPipeline(BasePipeline):
for step_name in failure_info.get('failed_steps', []):
step = WfStep.Get(master_name, builder_name, build_number, step_name)
if step and step.log_data:
- stdio_log = step.log_data
+ test_failure_log = step.log_data
else:
if not lock_util.WaitUntilDownloadAllowed(
stgao 2015/05/21 00:29:56 We don't have to wait unless the request is to the
master_name): # pragma: no cover
- raise pipeline.Retry('Failed to pull stdio of step %s of master %s'
+ raise pipeline.Retry('Failed to pull log of step %s of master %s'
% (step_name, master_name))
- # TODO: do test-level analysis instead of step-level.
- try:
- stdio_log = buildbot.GetStepStdio(
- master_name, builder_name, build_number, step_name,
- self.HTTP_CLIENT)
- except ResponseTooLargeError: # pragma: no cover.
- logging.exception(
- 'Log of step "%s" is too large for urlfetch.', step_name)
- # If the stdio log of a step is too large, we don't want to pull it
- # again in next run, because that might lead to DDoS to the master.
- # TODO: Use archived stdio logs in Google Storage instead.
stgao 2015/05/21 00:29:56 Please still keep this TODO. Because we still pull
- stdio_log = 'Stdio log is too large for urlfetch.'
-
- if not stdio_log: # pragma: no cover
- raise pipeline.Retry('Failed to pull stdio of step %s of master %s'
- % (step_name, master_name))
-
- # Save stdio in datastore and avoid downloading again during retry.
+ # TODO: Add test level log info to signal
stgao 2015/05/21 00:29:56 style.
+ step_log = buildbot.GetGsStepLog(
+ master_name, builder_name, build_number, step_name)
+ if step_log:
+ test_failure_log = str(self._GetTestLevelFailures(step_log))
stgao 2015/05/21 00:29:56 This is a little hacky. How about using StringIO
+ else:
+ try:
+ test_failure_log = buildbot.GetStepStdio(
+ master_name, builder_name, build_number, step_name,
+ self.HTTP_CLIENT)
+ except ResponseTooLargeError: # pragma: no cover.
+ logging.exception(
+ 'Log of step "%s" is too large for urlfetch.', step_name)
+ # If the stdio log of a step is too large, we don't want to pull it
+ # again in next run, because that might lead to DDoS to the master.
+ test_failure_log = 'Stdio log is too large for urlfetch.'
+
+ if not test_failure_log: # pragma: no cover
+ raise pipeline.Retry('Failed to pull stdio of step %s of master %s'
+ % (step_name, master_name))
+
+ # Save step log in datastore and avoid downloading again during retry.
if not step: # pragma: no cover
step = WfStep.Create(
master_name, builder_name, build_number, step_name)
- step.log_data = self._ExtractStorablePortionOfLog(stdio_log)
+ step.log_data = self._ExtractStorablePortionOfLog(test_failure_log)
+
try:
step.put()
except Exception as e: # pragma: no cover
- # Sometimes, the stdio log is too large to save in datastore.
+ # Sometimes, the step log is too large to save in datastore.
logging.exception(e)
# TODO: save result in datastore?
signals[step_name] = extractors.ExtractSignal(
- master_name, builder_name, step_name, None, stdio_log).ToJson()
+ master_name, builder_name, step_name, None,
+ str(test_failure_log)).ToJson()
return signals

Powered by Google App Engine
This is Rietveld 408576698