OLD | NEW |
1 # Copyright 2014 The Chromium Authors. All rights reserved. | 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 import datetime | 5 import datetime |
6 import dateutil.parser | 6 import dateutil.parser |
7 import logging | 7 import logging |
8 import json | 8 import json |
9 import time | 9 import time |
10 import urllib2 | 10 import urllib2 |
11 | 11 |
12 from google.appengine.api import taskqueue | 12 from google.appengine.api import taskqueue |
13 from google.appengine.api import urlfetch | 13 from google.appengine.api import urlfetch |
14 from google.appengine.ext import deferred | 14 from google.appengine.ext import deferred |
15 from google.appengine.ext import ndb | 15 from google.appengine.ext import ndb |
16 from google.appengine.runtime import DeadlineExceededError | 16 from google.appengine.runtime import DeadlineExceededError |
17 from handlers.flake_issues import MIN_REQUIRED_FLAKY_RUNS | 17 from handlers.flake_issues import MIN_REQUIRED_FLAKY_RUNS |
18 from model.build_run import BuildRun | 18 from model.build_run import BuildRun |
19 from model.build_run import PatchsetBuilderRuns | 19 from model.build_run import PatchsetBuilderRuns |
20 from model.fetch_status import FetchStatus | 20 from model.fetch_status import FetchStatus |
21 from model.flake import Flake | 21 from model.flake import Flake |
22 from model.flake import FlakeOccurrence | 22 from model.flake import FlakeOccurrence |
23 from model.flake import FlakyRun | 23 from model.flake import FlakyRun |
24 from status import build_result | 24 from status import build_result, util |
25 import time_functions.timestamp | 25 import time_functions.timestamp |
26 | 26 |
27 | 27 |
28 @ndb.transactional | 28 @ndb.transactional |
29 def get_patchset_builder_runs(issue, patchset, master, builder): | 29 def get_patchset_builder_runs(issue, patchset, master, builder): |
30 patchset_builder_runs_id = PatchsetBuilderRuns.getId(issue, patchset, master, | 30 patchset_builder_runs_id = PatchsetBuilderRuns.getId(issue, patchset, master, |
31 builder) | 31 builder) |
32 | 32 |
33 patchset_builder_runs = PatchsetBuilderRuns.get_by_id( | 33 patchset_builder_runs = PatchsetBuilderRuns.get_by_id( |
34 patchset_builder_runs_id) | 34 patchset_builder_runs_id) |
35 if not patchset_builder_runs: | 35 if not patchset_builder_runs: |
36 patchset_builder_runs = PatchsetBuilderRuns(issue=issue, | 36 patchset_builder_runs = PatchsetBuilderRuns(issue=issue, |
37 patchset=patchset, | 37 patchset=patchset, |
38 master=master, | 38 master=master, |
39 builder=builder, | 39 builder=builder, |
40 id=patchset_builder_runs_id) | 40 id=patchset_builder_runs_id) |
41 patchset_builder_runs.put() | 41 patchset_builder_runs.put() |
42 | 42 |
43 return patchset_builder_runs | 43 return patchset_builder_runs |
44 | 44 |
45 | 45 |
46 def is_last_hour(date): | |
47 return (datetime.datetime.utcnow() - date) < datetime.timedelta(hours=1) | |
48 | |
49 | |
50 def is_last_day(date): | |
51 return (datetime.datetime.utcnow() - date) < datetime.timedelta(days=1) | |
52 | |
53 | |
54 def is_last_week(date): | |
55 return (datetime.datetime.utcnow() - date) < datetime.timedelta(weeks=1) | |
56 | |
57 | |
58 def is_last_month(date): | |
59 return (datetime.datetime.utcnow() - date) < datetime.timedelta(days=31) | |
60 | |
61 | |
62 # Updates a Flake object, which spans all the instances of one flake, with the | 46 # Updates a Flake object, which spans all the instances of one flake, with the |
63 # time of an occurrence of that flake. | 47 # time of an occurrence of that flake. |
64 def add_occurrence_time_to_flake(flake, occurrence_time): # pragma: no cover | |
65 if occurrence_time > flake.last_time_seen: | |
66 flake.last_time_seen = occurrence_time | |
67 if is_last_hour(occurrence_time): | |
68 flake.count_hour += 1 | |
69 flake.last_hour = True | |
70 if is_last_day(occurrence_time): | |
71 flake.count_day += 1 | |
72 flake.last_day = True | |
73 if is_last_week(occurrence_time): | |
74 flake.count_week += 1 | |
75 flake.last_week = True | |
76 if is_last_month(occurrence_time): | |
77 flake.count_month += 1 | |
78 flake.last_month = True | |
79 flake.count_all += 1 | |
80 | |
81 | |
82 # Calculate the counters for a Flake object. | 48 # Calculate the counters for a Flake object. |
83 def update_flake_counters(flake): # pragma: no cover | 49 def update_flake_counters(flake): # pragma: no cover |
84 occurrences = ndb.get_multi(flake.occurrences) | 50 occurrences = ndb.get_multi(flake.occurrences) |
85 flake.count_hour = 0 | 51 flake.count_hour = 0 |
86 flake.count_day = 0 | 52 flake.count_day = 0 |
87 flake.count_week = 0 | 53 flake.count_week = 0 |
88 flake.count_month = 0 | 54 flake.count_month = 0 |
89 flake.count_all = 0 | 55 flake.count_all = 0 |
90 flake.last_hour = False | 56 flake.last_hour = False |
91 flake.last_day = False | 57 flake.last_day = False |
92 flake.last_week = False | 58 flake.last_week = False |
93 flake.last_month = False | 59 flake.last_month = False |
94 flake.last_time_seen = datetime.datetime.min | 60 flake.last_time_seen = datetime.datetime.min |
95 for o in occurrences: | 61 for o in occurrences: |
96 add_occurrence_time_to_flake(flake, o.failure_run_time_finished) | 62 util.add_occurrence_time_to_flake(flake, o.failure_run_time_finished) |
97 flake.put() | 63 flake.put() |
98 | 64 |
99 | 65 |
100 # The following four functions are cron jobs which update the counters for | 66 # The following four functions are cron jobs which update the counters for |
101 # flakes. To speed things up, we don't update last month/week/day as often as we | 67 # flakes. To speed things up, we don't update last month/week/day as often as we |
102 # update hourly counters. | 68 # update hourly counters. |
103 def update_flake_hour_counter(): # pragma: no cover | 69 def update_flake_hour_counter(): # pragma: no cover |
104 query = Flake.query().filter(Flake.last_hour == True) | 70 query = Flake.query().filter(Flake.last_hour == True) |
105 for flake in query: | 71 for flake in query: |
106 update_flake_counters(flake) | 72 update_flake_counters(flake) |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
139 url='/issues/process/%s' % flake.key.urlsafe()) | 105 url='/issues/process/%s' % flake.key.urlsafe()) |
140 | 106 |
141 | 107 |
142 def update_stale_issues(): | 108 def update_stale_issues(): |
143 for flake in Flake.query(Flake.issue_id > 0, projection=[Flake.issue_id], | 109 for flake in Flake.query(Flake.issue_id > 0, projection=[Flake.issue_id], |
144 distinct=True): | 110 distinct=True): |
145 taskqueue.add(queue_name='issue-updates', | 111 taskqueue.add(queue_name='issue-updates', |
146 url='/issues/update-if-stale/%s' % flake.issue_id) | 112 url='/issues/update-if-stale/%s' % flake.issue_id) |
147 | 113 |
148 | 114 |
149 @ndb.transactional(xg=True) # pylint: disable=no-value-for-parameter | |
150 def add_failure_to_flake(name, flaky_run): | |
151 flake = Flake.get_by_id(name) | |
152 if not flake: | |
153 flake = Flake(name=name, id=name, last_time_seen=datetime.datetime.min) | |
154 flake.put() | |
155 | |
156 flake.occurrences.append(flaky_run.key) | |
157 | |
158 flaky_run_time = flaky_run.failure_run.get().time_finished | |
159 add_occurrence_time_to_flake(flake, flaky_run_time) | |
160 | |
161 flake.put() | |
162 | |
163 # see examples: | |
164 # compile http://build.chromium.org/p/tryserver.chromium.mac/json/builders/ | |
165 # mac_chromium_compile_dbg/builds/11167?as_text=1 | |
166 # gtest http://build.chromium.org/p/tryserver.chromium.win/json/builders/ | |
167 # win_chromium_x64_rel_swarming/builds/4357?as_text=1 | |
168 # TODO(jam): get specific problem with compile so we can use that as name | |
169 # TODO(jam): It's unfortunate to have to parse this html. Can we get it from | |
170 # another place instead of the tryserver's json? | |
171 def get_flakes(step): | |
172 combined = ' '.join(step['text']) | |
173 | |
174 # If test results were invalid, report whole step as flaky. | |
175 if 'TEST RESULTS WERE INVALID' in combined: | |
176 return [combined] | |
177 | |
178 #gtest | |
179 gtest_search_str = 'failures:<br/>' | |
180 gtest_search_index = combined.find(gtest_search_str) | |
181 if gtest_search_index != -1: | |
182 failures = combined[gtest_search_index + len(gtest_search_str):] | |
183 failures = failures.split('<br/>') | |
184 results = [] | |
185 for failure in failures: | |
186 if not failure: | |
187 continue | |
188 if failure == 'ignored:': | |
189 break # layout test output | |
190 results.append(failure) | |
191 return results | |
192 | |
193 #gpu | |
194 gpu_search_str = '&tests=' | |
195 gpu_search_index = combined.find(gpu_search_str) | |
196 if gpu_search_index != -1: | |
197 failures = combined[gpu_search_index + len(gpu_search_str):] | |
198 end_index = failures.find('">') | |
199 failures = failures[:end_index ] | |
200 failures = failures.split(',') | |
201 results = [] | |
202 for failure in failures: | |
203 if not failure: | |
204 continue | |
205 results.append(failure) | |
206 return results | |
207 | |
208 return [combined] | |
209 | |
210 | |
211 # A queued task which polls the tryserver to get more information about why a | |
212 # run failed. | |
213 def get_flaky_run_reason(flaky_run_key): | |
214 flaky_run = flaky_run_key.get() | |
215 failure_run = flaky_run.failure_run.get() | |
216 success_time = flaky_run.success_run.get().time_finished | |
217 failure_time = flaky_run.failure_run_time_finished | |
218 patchset_builder_runs = failure_run.key.parent().get() | |
219 url = ('http://build.chromium.org/p/' + patchset_builder_runs.master + | |
220 '/json/builders/' + patchset_builder_runs.builder +'/builds/' + | |
221 str(failure_run.buildnumber)) | |
222 urlfetch.set_default_fetch_deadline(60) | |
223 logging.info('get_flaky_run_reason ' + url) | |
224 result = urlfetch.fetch(url).content | |
225 try: | |
226 json_result = json.loads(result) | |
227 except ValueError: | |
228 logging.exception('couldnt decode json for %s', url) | |
229 return | |
230 steps = json_result['steps'] | |
231 | |
232 failed_steps = [] | |
233 passed_steps = [] | |
234 for step in steps: | |
235 result = step['results'][0] | |
236 if build_result.isResultSuccess(result): | |
237 passed_steps.append(step) | |
238 continue | |
239 if not build_result.isResultFailure(result): | |
240 continue | |
241 step_name = step['name'] | |
242 step_text = ' '.join(step['text']) | |
243 # The following step failures are ignored: | |
244 # - steps: always red when any other step is red (not actual failure) | |
245 # - [swarming] ...: summary step would also be red (do not double count) | |
246 # - presubmit: typically red due to missing OWNERs LGTM, not a flake | |
247 # - recipe failure reason: always red when build fails (not actual failure) | |
248 # - Patch failure: if success run was before failure run, it is | |
249 # likely a legitimate failure. For example it often happens that | |
250 # developers use CQ dry run and then wait for a review. Once getting LGTM | |
251 # they check CQ checkbox, but the patch does not cleanly apply anymore. | |
252 # - bot_update PATCH FAILED: Corresponds to 'Patch failure' step. | |
253 # - test results: always red when another step is red (not actual failure) | |
254 # - Uncaught Exception: summary step referring to an exception in another | |
255 # step (e.g. bot_update) | |
256 if (step_name == 'steps' or step_name.startswith('[swarming]') or | |
257 step_name == 'presubmit' or step_name == 'recipe failure reason' or | |
258 (step_name == 'Patch failure' and success_time < failure_time) or | |
259 (step_name == 'bot_update' and 'PATCH FAILED' in step_text) or | |
260 step_name == 'test results' or step_name == 'Uncaught Exception'): | |
261 continue | |
262 failed_steps.append(step) | |
263 | |
264 steps_to_ignore = [] | |
265 for step in failed_steps: | |
266 step_name = step['name'] | |
267 if ' (with patch)' in step_name: | |
268 # Android instrumentation tests add a prefix before the step name, which | |
269 # doesn't appear on the summary step (without suffixes). To make sure we | |
270 # correctly ignore duplicate failures, we remove the prefix. | |
271 step_name = step_name.replace('Instrumentation test ', '') | |
272 | |
273 step_name_with_no_modifier = step_name.replace(' (with patch)', '') | |
274 for other_step in failed_steps: | |
275 # A step which fails, and then is retried and also fails, will have its | |
276 # name without the ' (with patch)' again. Don't double count. | |
277 if other_step['name'] == step_name_with_no_modifier: | |
278 steps_to_ignore.append(other_step['name']) | |
279 | |
280 # If a step fails without the patch, then the tree is busted. Don't count | |
281 # as flake. | |
282 step_name_without_patch = step_name_with_no_modifier + ' (without patch)' | |
283 for other_step in failed_steps: | |
284 if other_step['name'] == step_name_without_patch: | |
285 steps_to_ignore.append(step['name']) | |
286 steps_to_ignore.append(other_step['name']) | |
287 | |
288 for step in failed_steps: | |
289 step_name = step['name'] | |
290 if step_name in steps_to_ignore: | |
291 continue | |
292 flakes = get_flakes(step) | |
293 if not flakes: | |
294 continue | |
295 for flake in flakes: | |
296 flake_occurrence = FlakeOccurrence(name=step_name, failure=flake) | |
297 flaky_run.flakes.append(flake_occurrence) | |
298 | |
299 add_failure_to_flake(flake, flaky_run) | |
300 flaky_run.put() | |
301 | |
302 | |
303 def get_int_value(properties, key): | 115 def get_int_value(properties, key): |
304 if not key in properties: | 116 if not key in properties: |
305 raise ValueError('key not found') | 117 raise ValueError('key not found') |
306 value = properties[key] | 118 value = properties[key] |
307 if type(value) == type(list()): | 119 if type(value) == type(list()): |
308 value = value[0] | 120 value = value[0] |
309 return int(value) | 121 return int(value) |
310 | 122 |
311 | 123 |
312 # Parses the json which we get from chromium-cq-status. | 124 # Parses the json which we get from chromium-cq-status. |
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
393 if duplicate: | 205 if duplicate: |
394 continue | 206 continue |
395 | 207 |
396 build_run.put() | 208 build_run.put() |
397 | 209 |
398 for previous_run in previous_runs: | 210 for previous_run in previous_runs: |
399 if previous_run.is_success == build_run.is_success: | 211 if previous_run.is_success == build_run.is_success: |
400 continue | 212 continue |
401 if success: | 213 if success: |
402 # We saw the flake and then the pass. | 214 # We saw the flake and then the pass. |
403 flaky_run = FlakyRun( | 215 failure_run = previous_run |
404 failure_run=previous_run.key, | 216 success_run = build_run |
405 failure_run_time_started=previous_run.time_started, | |
406 failure_run_time_finished=previous_run.time_finished, | |
407 success_run=build_run.key) | |
408 flaky_run.put() | |
409 logging_output.append(previous_run.key.parent().get().builder + | |
410 str(previous_run.buildnumber)) | |
411 else: | 217 else: |
412 # We saw the pass and then the failure. Could happen when fetching | 218 # We saw the pass and then the failure. Could happen when fetching |
413 # historical data, or for the bot_update step (patch can't be | 219 # historical data, or for the bot_update step (patch can't be |
414 # applied cleanly anymore). | 220 # applied cleanly anymore). |
415 flaky_run = FlakyRun( | 221 failure_run = build_run |
416 failure_run=build_run.key, | 222 success_run = previous_run |
417 failure_run_time_started=build_run.time_started, | |
418 failure_run_time_finished=build_run.time_finished, | |
419 success_run=previous_run.key) | |
420 flaky_run.put() | |
421 logging_output.append(build_run.key.parent().get().builder + | |
422 str(build_run.buildnumber)) | |
423 | 223 |
424 # Queue a task to fetch the error of this failure. | 224 logging_output.append(failure_run.key.parent().get().builder + |
425 deferred.defer(get_flaky_run_reason, flaky_run.key) | 225 str(failure_run.buildnumber)) |
| 226 |
| 227 # Queue a task to fetch the error of this failure and create FlakyRun. |
| 228 taskqueue.add( |
| 229 queue_name='issue-updates', |
| 230 url='/issues/create_flaky_run', |
| 231 params={'failure_run_key': failure_run.key.urlsafe(), |
| 232 'success_run_key': success_run.key.urlsafe()}) |
426 | 233 |
427 return logging_output | 234 return logging_output |
428 | 235 |
429 | 236 |
430 def fetch_cq_status(): | 237 def fetch_cq_status(): |
431 """Fetches data from chromium-cq-status app and saves new data. | 238 """Fetches data from chromium-cq-status app and saves new data. |
432 | 239 |
433 Remembers old cursor and fetches new data. | 240 Remembers old cursor and fetches new data. |
434 """ | 241 """ |
435 | 242 |
(...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
540 else: | 347 else: |
541 fetch_status.begin = begin | 348 fetch_status.begin = begin |
542 fetch_status.end = end | 349 fetch_status.end = end |
543 fetch_status.cursor = cursor | 350 fetch_status.cursor = cursor |
544 fetch_status.put() | 351 fetch_status.put() |
545 | 352 |
546 if not more: | 353 if not more: |
547 return # finish the cron job and wait for next iteration | 354 return # finish the cron job and wait for next iteration |
548 except urllib2.URLError, e: | 355 except urllib2.URLError, e: |
549 logging.warning('Failed to fetch CQ status: %s', e.reason) | 356 logging.warning('Failed to fetch CQ status: %s', e.reason) |
OLD | NEW |