Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(579)

Side by Side Diff: appengine/chromium_try_flakes/status/cq_status.py

Issue 1660043002: Move flaky run processing into a taskqueue (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2014 The Chromium Authors. All rights reserved. 1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import datetime 5 import datetime
6 import dateutil.parser 6 import dateutil.parser
7 import logging 7 import logging
8 import json 8 import json
9 import time 9 import time
10 import urllib2 10 import urllib2
11 11
12 from google.appengine.api import taskqueue 12 from google.appengine.api import taskqueue
13 from google.appengine.api import urlfetch 13 from google.appengine.api import urlfetch
14 from google.appengine.ext import deferred 14 from google.appengine.ext import deferred
15 from google.appengine.ext import ndb 15 from google.appengine.ext import ndb
16 from google.appengine.runtime import DeadlineExceededError 16 from google.appengine.runtime import DeadlineExceededError
17 from handlers.flake_issues import MIN_REQUIRED_FLAKY_RUNS 17 from handlers.flake_issues import MIN_REQUIRED_FLAKY_RUNS
18 from model.build_run import BuildRun 18 from model.build_run import BuildRun
19 from model.build_run import PatchsetBuilderRuns 19 from model.build_run import PatchsetBuilderRuns
20 from model.fetch_status import FetchStatus 20 from model.fetch_status import FetchStatus
21 from model.flake import Flake 21 from model.flake import Flake
22 from model.flake import FlakeOccurrence 22 from model.flake import FlakeOccurrence
23 from model.flake import FlakyRun 23 from model.flake import FlakyRun
24 from status import build_result 24 from status import build_result, util
25 import time_functions.timestamp 25 import time_functions.timestamp
26 26
27 27
@ndb.transactional
def get_patchset_builder_runs(issue, patchset, master, builder):
  """Returns the PatchsetBuilderRuns entity for the given run coordinates.

  Looks the entity up by its deterministic id and creates/stores a new one
  when it does not exist yet. Runs inside an ndb transaction so concurrent
  callers cannot create duplicate entities for the same id.

  Args:
    issue: CL issue number.
    patchset: patchset number within the issue.
    master: buildbot master name.
    builder: builder name on that master.
  """
  # Id is derived from all four coordinates, so get-or-create is keyed on
  # the (issue, patchset, master, builder) tuple.
  patchset_builder_runs_id = PatchsetBuilderRuns.getId(issue, patchset, master,
                                                       builder)

  patchset_builder_runs = PatchsetBuilderRuns.get_by_id(
      patchset_builder_runs_id)
  if not patchset_builder_runs:
    patchset_builder_runs = PatchsetBuilderRuns(issue=issue,
                                                patchset=patchset,
                                                master=master,
                                                builder=builder,
                                                id=patchset_builder_runs_id)
    patchset_builder_runs.put()

  return patchset_builder_runs
44 44
45 45
def _is_within(date, delta):
  # Shared helper: True when |date| (naive UTC datetime) is less than
  # |delta| in the past relative to the current UTC time.
  return (datetime.datetime.utcnow() - date) < delta


def is_last_hour(date):
  """Returns True if |date| is within the last hour."""
  return _is_within(date, datetime.timedelta(hours=1))


def is_last_day(date):
  """Returns True if |date| is within the last day."""
  return _is_within(date, datetime.timedelta(days=1))


def is_last_week(date):
  """Returns True if |date| is within the last week."""
  return _is_within(date, datetime.timedelta(weeks=1))


def is_last_month(date):
  """Returns True if |date| is within the last 31 days."""
  return _is_within(date, datetime.timedelta(days=31))
60
61
62 # Updates a Flake object, which spans all the instances of one flake, with the 46 # Updates a Flake object, which spans all the instances of one flake, with the
63 # time of an occurrence of that flake. 47 # time of an occurrence of that flake.
# Updates a Flake object, which spans all the instances of one flake, with the
# time of an occurrence of that flake.
def add_occurrence_time_to_flake(flake, occurrence_time):  # pragma: no cover
  """Folds one occurrence timestamp into |flake|'s aggregate counters.

  Advances last_time_seen monotonically, bumps the counter and sets the
  flag for every recency window the occurrence falls into, and increments
  the all-time counter unconditionally.
  """
  flake.last_time_seen = max(flake.last_time_seen, occurrence_time)
  # (window predicate, counter attribute, flag attribute) per recency window.
  windows = (
      (is_last_hour, 'count_hour', 'last_hour'),
      (is_last_day, 'count_day', 'last_day'),
      (is_last_week, 'count_week', 'last_week'),
      (is_last_month, 'count_month', 'last_month'),
  )
  for in_window, counter_attr, flag_attr in windows:
    if in_window(occurrence_time):
      setattr(flake, counter_attr, getattr(flake, counter_attr) + 1)
      setattr(flake, flag_attr, True)
  flake.count_all += 1
80
81
# Calculate the counters for a Flake object.
def update_flake_counters(flake):  # pragma: no cover
  """Recomputes every aggregate counter of |flake| from its occurrences.

  Resets all counters/flags, replays each stored occurrence's failure-run
  finish time through add_occurrence_time_to_flake, then persists the
  entity.
  """
  # Batch-fetch all FlakyRun entities referenced by this flake.
  occurrences = ndb.get_multi(flake.occurrences)
  flake.count_hour = 0
  flake.count_day = 0
  flake.count_week = 0
  flake.count_month = 0
  flake.count_all = 0
  flake.last_hour = False
  flake.last_day = False
  flake.last_week = False
  flake.last_month = False
  # datetime.min so any real occurrence time replaces it.
  flake.last_time_seen = datetime.datetime.min
  for o in occurrences:
    add_occurrence_time_to_flake(flake, o.failure_run_time_finished)
  flake.put()
98 64
99 65
# The following four functions are cron jobs which update the counters for
# flakes. To speed things up, we don't update last month/week/day as often as we
# update hourly counters.
def update_flake_hour_counter():  # pragma: no cover
  """Cron job: recomputes counters for flakes seen within the last hour."""
  # '== True' is required here: this builds an ndb datastore query filter,
  # not a Python boolean comparison.
  query = Flake.query().filter(Flake.last_hour == True)
  for flake in query:
    update_flake_counters(flake)
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
139 url='/issues/process/%s' % flake.key.urlsafe()) 105 url='/issues/process/%s' % flake.key.urlsafe())
140 106
141 107
def update_stale_issues():
  """Enqueues an update-if-stale task for each flake with an attached issue.

  Uses a distinct projection query on issue_id so every issue is enqueued
  only once even when several flakes share the same issue.
  """
  for flake in Flake.query(Flake.issue_id > 0, projection=[Flake.issue_id],
                           distinct=True):
    taskqueue.add(queue_name='issue-updates',
                  url='/issues/update-if-stale/%s' % flake.issue_id)
147 113
148 114
@ndb.transactional(xg=True)  # pylint: disable=no-value-for-parameter
def add_failure_to_flake(name, flaky_run):
  """Records |flaky_run| as an occurrence of the Flake named |name|.

  Creates the Flake entity on first sight. Runs as a cross-group (xg)
  transaction because the Flake and the FlakyRun live in different entity
  groups.
  """
  flake = Flake.get_by_id(name)
  if not flake:
    flake = Flake(name=name, id=name, last_time_seen=datetime.datetime.min)
    # NOTE(review): this early put() looks redundant — the entity is put()
    # again at the end of the transaction. Presumably harmless; confirm
    # before removing.
    flake.put()

  flake.occurrences.append(flaky_run.key)

  # Fold the failure run's finish time into the flake's window counters.
  flaky_run_time = flaky_run.failure_run.get().time_finished
  add_occurrence_time_to_flake(flake, flaky_run_time)

  flake.put()
162
163 # see examples:
164 # compile http://build.chromium.org/p/tryserver.chromium.mac/json/builders/
165 # mac_chromium_compile_dbg/builds/11167?as_text=1
166 # gtest http://build.chromium.org/p/tryserver.chromium.win/json/builders/
167 # win_chromium_x64_rel_swarming/builds/4357?as_text=1
168 # TODO(jam): get specific problem with compile so we can use that as name
169 # TODO(jam): It's unfortunate to have to parse this html. Can we get it from
170 # another place instead of the tryserver's json?
def get_flakes(step):
  """Extracts the names of flaky tests from a buildbot step dict.

  Args:
    step: buildbot step JSON dict; only the 'text' field (a list of HTML
        fragments) is used.

  Returns:
    A list of failed test names parsed from gtest or GPU-test output, or a
    single-element list containing the whole step text when results were
    invalid or the output format is not recognized.
  """
  combined = ' '.join(step['text'])

  # If test results were invalid, report whole step as flaky.
  if 'TEST RESULTS WERE INVALID' in combined:
    return [combined]

  # gtest: failing tests are listed after 'failures:<br/>', one per <br/>.
  gtest_search_str = 'failures:<br/>'
  gtest_search_index = combined.find(gtest_search_str)
  if gtest_search_index != -1:
    failures = combined[gtest_search_index + len(gtest_search_str):]
    results = []
    for failure in failures.split('<br/>'):
      if not failure:
        continue
      if failure == 'ignored:':
        break  # layout test output
      results.append(failure)
    return results

  # gpu: failing tests are embedded in a dashboard URL after '&tests=' and
  # terminated by '">'.
  gpu_search_str = '&tests='
  gpu_search_index = combined.find(gpu_search_str)
  if gpu_search_index != -1:
    failures = combined[gpu_search_index + len(gpu_search_str):]
    end_index = failures.find('">')
    # Only trim at the terminator when it is actually present. Previously a
    # missing '">' made find() return -1 and the slice [:-1] silently
    # dropped the last character of the last test name.
    if end_index != -1:
      failures = failures[:end_index]
    return [failure for failure in failures.split(',') if failure]

  # Unrecognized output: report the whole step text as a single flake.
  return [combined]
209
210
# A queued task which polls the tryserver to get more information about why a
# run failed.
def get_flaky_run_reason(flaky_run_key):
  """Fetches a failed build's JSON and records flaky steps on the FlakyRun.

  Downloads the failing build from the tryserver, filters out step failures
  that are known not to indicate flakiness, and records the remaining
  failures as FlakeOccurrences on the FlakyRun and on per-flake Flake
  entities.

  Args:
    flaky_run_key: ndb key of the FlakyRun to analyze and update.
  """
  flaky_run = flaky_run_key.get()
  failure_run = flaky_run.failure_run.get()
  success_time = flaky_run.success_run.get().time_finished
  failure_time = flaky_run.failure_run_time_finished
  # The parent entity carries the master/builder coordinates of the run.
  patchset_builder_runs = failure_run.key.parent().get()
  url = ('http://build.chromium.org/p/' + patchset_builder_runs.master +
         '/json/builders/' + patchset_builder_runs.builder +'/builds/' +
         str(failure_run.buildnumber))
  urlfetch.set_default_fetch_deadline(60)
  logging.info('get_flaky_run_reason ' + url)
  result = urlfetch.fetch(url).content
  try:
    json_result = json.loads(result)
  except ValueError:
    # The tryserver occasionally returns non-JSON (e.g. error pages);
    # nothing useful can be recorded in that case.
    logging.exception('couldnt decode json for %s', url)
    return
  steps = json_result['steps']

  failed_steps = []
  passed_steps = []
  for step in steps:
    result = step['results'][0]
    if build_result.isResultSuccess(result):
      passed_steps.append(step)
      continue
    if not build_result.isResultFailure(result):
      continue
    step_name = step['name']
    step_text = ' '.join(step['text'])
    # The following step failures are ignored:
    # - steps: always red when any other step is red (not actual failure)
    # - [swarming] ...: summary step would also be red (do not double count)
    # - presubmit: typically red due to missing OWNERs LGTM, not a flake
    # - recipe failure reason: always red when build fails (not actual failure)
    # - Patch failure: if success run was before failure run, it is
    #   likely a legitimate failure. For example it often happens that
    #   developers use CQ dry run and then wait for a review. Once getting LGTM
    #   they check CQ checkbox, but the patch does not cleanly apply anymore.
    # - bot_update PATCH FAILED: Corresponds to 'Patch failure' step.
    # - test results: always red when another step is red (not actual failure)
    # - Uncaught Exception: summary step referring to an exception in another
    #   step (e.g. bot_update)
    if (step_name == 'steps' or step_name.startswith('[swarming]') or
        step_name == 'presubmit' or step_name == 'recipe failure reason' or
        (step_name == 'Patch failure' and success_time < failure_time) or
        (step_name == 'bot_update' and 'PATCH FAILED' in step_text) or
        step_name == 'test results' or step_name == 'Uncaught Exception'):
      continue
    failed_steps.append(step)

  # Names of failed steps whose failures are duplicates of (or explained by)
  # another failed step; they must not be counted as flakes.
  steps_to_ignore = []
  for step in failed_steps:
    step_name = step['name']
    if ' (with patch)' in step_name:
      # Android instrumentation tests add a prefix before the step name, which
      # doesn't appear on the summary step (without suffixes). To make sure we
      # correctly ignore duplicate failures, we remove the prefix.
      step_name = step_name.replace('Instrumentation test ', '')

      step_name_with_no_modifier = step_name.replace(' (with patch)', '')
      for other_step in failed_steps:
        # A step which fails, and then is retried and also fails, will have its
        # name without the ' (with patch)' again. Don't double count.
        if other_step['name'] == step_name_with_no_modifier:
          steps_to_ignore.append(other_step['name'])

      # If a step fails without the patch, then the tree is busted. Don't count
      # as flake.
      step_name_without_patch = step_name_with_no_modifier + ' (without patch)'
      for other_step in failed_steps:
        if other_step['name'] == step_name_without_patch:
          steps_to_ignore.append(step['name'])
          steps_to_ignore.append(other_step['name'])

  # Record each remaining failure both on the FlakyRun and on the per-flake
  # aggregate Flake entity.
  for step in failed_steps:
    step_name = step['name']
    if step_name in steps_to_ignore:
      continue
    flakes = get_flakes(step)
    if not flakes:
      continue
    for flake in flakes:
      flake_occurrence = FlakeOccurrence(name=step_name, failure=flake)
      flaky_run.flakes.append(flake_occurrence)

      add_failure_to_flake(flake, flaky_run)
  flaky_run.put()
301
302
def get_int_value(properties, key):
  """Returns properties[key] coerced to an int.

  Build properties sometimes wrap a value in a one-element list; in that
  case the first element is converted.

  Args:
    properties: dict of build properties.
    key: property name to read.

  Raises:
    ValueError: if |key| is missing or the value cannot be converted.
  """
  if key not in properties:
    raise ValueError('key not found')
  value = properties[key]
  # isinstance instead of the non-idiomatic `type(value) == type(list())`.
  if isinstance(value, list):
    value = value[0]
  return int(value)
310 122
311 123
312 # Parses the json which we get from chromium-cq-status. 124 # Parses the json which we get from chromium-cq-status.
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
393 if duplicate: 205 if duplicate:
394 continue 206 continue
395 207
396 build_run.put() 208 build_run.put()
397 209
398 for previous_run in previous_runs: 210 for previous_run in previous_runs:
399 if previous_run.is_success == build_run.is_success: 211 if previous_run.is_success == build_run.is_success:
400 continue 212 continue
401 if success: 213 if success:
402 # We saw the flake and then the pass. 214 # We saw the flake and then the pass.
403 flaky_run = FlakyRun( 215 failure_run = previous_run
404 failure_run=previous_run.key, 216 success_run = build_run
405 failure_run_time_started=previous_run.time_started,
406 failure_run_time_finished=previous_run.time_finished,
407 success_run=build_run.key)
408 flaky_run.put()
409 logging_output.append(previous_run.key.parent().get().builder +
410 str(previous_run.buildnumber))
411 else: 217 else:
412 # We saw the pass and then the failure. Could happen when fetching 218 # We saw the pass and then the failure. Could happen when fetching
413 # historical data, or for the bot_update step (patch can't be 219 # historical data, or for the bot_update step (patch can't be
414 # applied cleanly anymore). 220 # applied cleanly anymore).
415 flaky_run = FlakyRun( 221 failure_run = build_run
416 failure_run=build_run.key, 222 success_run = previous_run
417 failure_run_time_started=build_run.time_started,
418 failure_run_time_finished=build_run.time_finished,
419 success_run=previous_run.key)
420 flaky_run.put()
421 logging_output.append(build_run.key.parent().get().builder +
422 str(build_run.buildnumber))
423 223
424 # Queue a task to fetch the error of this failure. 224 logging_output.append(failure_run.key.parent().get().builder +
425 deferred.defer(get_flaky_run_reason, flaky_run.key) 225 str(failure_run.buildnumber))
226
227 # Queue a task to fetch the error of this failure and create FlakyRun.
228 taskqueue.add(
229 queue_name='issue-updates',
230 url='/issues/create_flaky_run',
231 params={'failure_run_key': failure_run.key.urlsafe(),
232 'success_run_key': success_run.key.urlsafe()})
426 233
427 return logging_output 234 return logging_output
428 235
429 236
430 def fetch_cq_status(): 237 def fetch_cq_status():
431 """Fetches data from chromium-cq-status app and saves new data. 238 """Fetches data from chromium-cq-status app and saves new data.
432 239
433 Remembers old cursor and fetches new data. 240 Remembers old cursor and fetches new data.
434 """ 241 """
435 242
(...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after
540 else: 347 else:
541 fetch_status.begin = begin 348 fetch_status.begin = begin
542 fetch_status.end = end 349 fetch_status.end = end
543 fetch_status.cursor = cursor 350 fetch_status.cursor = cursor
544 fetch_status.put() 351 fetch_status.put()
545 352
546 if not more: 353 if not more:
547 return # finish the cron job and wait for next iteration 354 return # finish the cron job and wait for next iteration
548 except urllib2.URLError, e: 355 except urllib2.URLError, e:
549 logging.warning('Failed to fetch CQ status: %s', e.reason) 356 logging.warning('Failed to fetch CQ status: %s', e.reason)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698