appengine/chromium_try_flakes/status/cq_status.py - Issue 1660043002: Move flaky run processing into a taskqueue

Side by Side Diff: appengine/chromium_try_flakes/status/cq_status.py

Issue 1660043002: Move flaky run processing into a taskqueue (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Added new files Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« appengine/chromium_try_flakes/handlers/flake_issues.py ('K') | « appengine/chromium_try_flakes/main.py ('k') | appengine/chromium_try_flakes/status/test/cq_status_test.py » ('j') | appengine/chromium_try_flakes/status/util.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2014 The Chromium Authors. All rights reserved.	1 # Copyright 2014 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 import datetime	5 import datetime

6 import dateutil.parser	6 import dateutil.parser

7 import logging	7 import logging

8 import json	8 import json

9 import time	9 import time

10 import urllib2	10 import urllib2

11	11

12 from google.appengine.api import taskqueue	12 from google.appengine.api import taskqueue

13 from google.appengine.api import urlfetch	13 from google.appengine.api import urlfetch

14 from google.appengine.ext import deferred	14 from google.appengine.ext import deferred

15 from google.appengine.ext import ndb	15 from google.appengine.ext import ndb

16 from google.appengine.runtime import DeadlineExceededError	16 from google.appengine.runtime import DeadlineExceededError

17 from handlers.flake_issues import MIN_REQUIRED_FLAKY_RUNS	17 from handlers.flake_issues import MIN_REQUIRED_FLAKY_RUNS

18 from model.build_run import BuildRun	18 from model.build_run import BuildRun

19 from model.build_run import PatchsetBuilderRuns	19 from model.build_run import PatchsetBuilderRuns

20 from model.fetch_status import FetchStatus	20 from model.fetch_status import FetchStatus

21 from model.flake import Flake	21 from model.flake import Flake

22 from model.flake import FlakeOccurrence	22 from model.flake import FlakeOccurrence

23 from model.flake import FlakyRun	23 from model.flake import FlakyRun

24 from status import build_result	24 from status import build_result, util

25 import time_functions.timestamp	25 import time_functions.timestamp

26	26

27	27

28 @ndb.transactional	28 @ndb.transactional

29 def get_patchset_builder_runs(issue, patchset, master, builder):	29 def get_patchset_builder_runs(issue, patchset, master, builder):

30 patchset_builder_runs_id = PatchsetBuilderRuns.getId(issue, patchset, master,	30 patchset_builder_runs_id = PatchsetBuilderRuns.getId(issue, patchset, master,

31 builder)	31 builder)

32	32

33 patchset_builder_runs = PatchsetBuilderRuns.get_by_id(	33 patchset_builder_runs = PatchsetBuilderRuns.get_by_id(

34 patchset_builder_runs_id)	34 patchset_builder_runs_id)

35 if not patchset_builder_runs:	35 if not patchset_builder_runs:

36 patchset_builder_runs = PatchsetBuilderRuns(issue=issue,	36 patchset_builder_runs = PatchsetBuilderRuns(issue=issue,

37 patchset=patchset,	37 patchset=patchset,

38 master=master,	38 master=master,

39 builder=builder,	39 builder=builder,

40 id=patchset_builder_runs_id)	40 id=patchset_builder_runs_id)

41 patchset_builder_runs.put()	41 patchset_builder_runs.put()

42	42

43 return patchset_builder_runs	43 return patchset_builder_runs

44	44

45	45

46 def is_last_hour(date):

47 return (datetime.datetime.utcnow() - date) < datetime.timedelta(hours=1)

48

49

50 def is_last_day(date):

51 return (datetime.datetime.utcnow() - date) < datetime.timedelta(days=1)

52

53

54 def is_last_week(date):

55 return (datetime.datetime.utcnow() - date) < datetime.timedelta(weeks=1)

56

57

58 def is_last_month(date):

59 return (datetime.datetime.utcnow() - date) < datetime.timedelta(days=31)

60

61

62 # Updates a Flake object, which spans all the instances of one flake, with the	46 # Updates a Flake object, which spans all the instances of one flake, with the

63 # time of an occurrence of that flake.	47 # time of an occurrence of that flake.

64 def add_occurrence_time_to_flake(flake, occurrence_time): # pragma: no cover

65 if occurrence_time > flake.last_time_seen:

66 flake.last_time_seen = occurrence_time

67 if is_last_hour(occurrence_time):

68 flake.count_hour += 1

69 flake.last_hour = True

70 if is_last_day(occurrence_time):

71 flake.count_day += 1

72 flake.last_day = True

73 if is_last_week(occurrence_time):

74 flake.count_week += 1

75 flake.last_week = True

76 if is_last_month(occurrence_time):

77 flake.count_month += 1

78 flake.last_month = True

79 flake.count_all += 1

80

81

82 # Calculate the counters for a Flake object.	48 # Calculate the counters for a Flake object.

83 def update_flake_counters(flake): # pragma: no cover	49 def update_flake_counters(flake): # pragma: no cover

84 occurrences = ndb.get_multi(flake.occurrences)	50 occurrences = ndb.get_multi(flake.occurrences)

85 flake.count_hour = 0	51 flake.count_hour = 0

86 flake.count_day = 0	52 flake.count_day = 0

87 flake.count_week = 0	53 flake.count_week = 0

88 flake.count_month = 0	54 flake.count_month = 0

89 flake.count_all = 0	55 flake.count_all = 0

90 flake.last_hour = False	56 flake.last_hour = False

91 flake.last_day = False	57 flake.last_day = False

92 flake.last_week = False	58 flake.last_week = False

93 flake.last_month = False	59 flake.last_month = False

94 flake.last_time_seen = datetime.datetime.min	60 flake.last_time_seen = datetime.datetime.min

95 for o in occurrences:	61 for o in occurrences:

96 add_occurrence_time_to_flake(flake, o.failure_run_time_finished)	62 util.add_occurrence_time_to_flake(flake, o.failure_run_time_finished)

97 flake.put()	63 flake.put()

98	64

99	65

100 # The following four functions are cron jobs which update the counters for	66 # The following four functions are cron jobs which update the counters for

101 # flakes. To speed things up, we don't update last month/week/day as often as we	67 # flakes. To speed things up, we don't update last month/week/day as often as we

102 # update hourly counters.	68 # update hourly counters.

103 def update_flake_hour_counter(): # pragma: no cover	69 def update_flake_hour_counter(): # pragma: no cover

104 query = Flake.query().filter(Flake.last_hour == True)	70 query = Flake.query().filter(Flake.last_hour == True)

105 for flake in query:	71 for flake in query:

106 update_flake_counters(flake)	72 update_flake_counters(flake)

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
139 url='/issues/process/%s' % flake.key.urlsafe())	105 url='/issues/process/%s' % flake.key.urlsafe())

140	106

141	107

142 def update_stale_issues():	108 def update_stale_issues():

143 for flake in Flake.query(Flake.issue_id > 0, projection=[Flake.issue_id],	109 for flake in Flake.query(Flake.issue_id > 0, projection=[Flake.issue_id],

144 distinct=True):	110 distinct=True):

145 taskqueue.add(queue_name='issue-updates',	111 taskqueue.add(queue_name='issue-updates',

146 url='/issues/update-if-stale/%s' % flake.issue_id)	112 url='/issues/update-if-stale/%s' % flake.issue_id)

147	113

148	114

149 @ndb.transactional(xg=True) # pylint: disable=no-value-for-parameter

150 def add_failure_to_flake(name, flaky_run):

151 flake = Flake.get_by_id(name)

152 if not flake:

153 flake = Flake(name=name, id=name, last_time_seen=datetime.datetime.min)

154 flake.put()

155

156 flake.occurrences.append(flaky_run.key)

157

158 flaky_run_time = flaky_run.failure_run.get().time_finished

159 add_occurrence_time_to_flake(flake, flaky_run_time)

160

161 flake.put()

162

163 # see examples:

164 # compile http://build.chromium.org/p/tryserver.chromium.mac/json/builders/

165 # mac_chromium_compile_dbg/builds/11167?as_text=1

166 # gtest http://build.chromium.org/p/tryserver.chromium.win/json/builders/

167 # win_chromium_x64_rel_swarming/builds/4357?as_text=1

168 # TODO(jam): get specific problem with compile so we can use that as name

169 # TODO(jam): It's unfortunate to have to parse this html. Can we get it from

170 # another place instead of the tryserver's json?

171 def get_flakes(step):

172 combined = ' '.join(step['text'])

173

174 # If test results were invalid, report whole step as flaky.

175 if 'TEST RESULTS WERE INVALID' in combined:

176 return [combined]

177

178 #gtest

179 gtest_search_str = 'failures:<br/>'

180 gtest_search_index = combined.find(gtest_search_str)

181 if gtest_search_index != -1:

182 failures = combined[gtest_search_index + len(gtest_search_str):]

183 failures = failures.split('<br/>')

184 results = []

185 for failure in failures:

186 if not failure:

187 continue

188 if failure == 'ignored:':

189 break # layout test output

190 results.append(failure)

191 return results

192

193 #gpu

194 gpu_search_str = '&tests='

195 gpu_search_index = combined.find(gpu_search_str)

196 if gpu_search_index != -1:

197 failures = combined[gpu_search_index + len(gpu_search_str):]

198 end_index = failures.find('">')

199 failures = failures[:end_index ]

200 failures = failures.split(',')

201 results = []

202 for failure in failures:

203 if not failure:

204 continue

205 results.append(failure)

206 return results

207

208 return [combined]

209

210

211 # A queued task which polls the tryserver to get more information about why a

212 # run failed.

213 def get_flaky_run_reason(flaky_run_key):

214 flaky_run = flaky_run_key.get()

215 failure_run = flaky_run.failure_run.get()

216 success_time = flaky_run.success_run.get().time_finished

217 failure_time = flaky_run.failure_run_time_finished

218 patchset_builder_runs = failure_run.key.parent().get()

219 url = ('http://build.chromium.org/p/' + patchset_builder_runs.master +

220 '/json/builders/' + patchset_builder_runs.builder +'/builds/' +

221 str(failure_run.buildnumber))

222 urlfetch.set_default_fetch_deadline(60)

223 logging.info('get_flaky_run_reason ' + url)

224 result = urlfetch.fetch(url).content

225 try:

226 json_result = json.loads(result)

227 except ValueError:

228 logging.exception('couldnt decode json for %s', url)

229 return

230 steps = json_result['steps']

231

232 failed_steps = []

233 passed_steps = []

234 for step in steps:

235 result = step['results'][0]

236 if build_result.isResultSuccess(result):

237 passed_steps.append(step)

238 continue

239 if not build_result.isResultFailure(result):

240 continue

241 step_name = step['name']

242 step_text = ' '.join(step['text'])

243 # The following step failures are ignored:

244 # - steps: always red when any other step is red (not actual failure)

245 # - [swarming] ...: summary step would also be red (do not double count)

246 # - presubmit: typically red due to missing OWNERs LGTM, not a flake

247 # - recipe failure reason: always red when build fails (not actual failure)

248 # - Patch failure: if success run was before failure run, it is

249 # likely a legitimate failure. For example it often happens that

250 # developers use CQ dry run and then wait for a review. Once getting LGTM

251 # they check CQ checkbox, but the patch does not cleanly apply anymore.

252 # - bot_update PATCH FAILED: Corresponds to 'Patch failure' step.

253 # - test results: always red when another step is red (not actual failure)

254 # - Uncaught Exception: summary step referring to an exception in another

255 # step (e.g. bot_update)

256 if (step_name == 'steps' or step_name.startswith('[swarming]') or

257 step_name == 'presubmit' or step_name == 'recipe failure reason' or

258 (step_name == 'Patch failure' and success_time < failure_time) or

259 (step_name == 'bot_update' and 'PATCH FAILED' in step_text) or

260 step_name == 'test results' or step_name == 'Uncaught Exception'):

261 continue

262 failed_steps.append(step)

263

264 steps_to_ignore = []

265 for step in failed_steps:

266 step_name = step['name']

267 if ' (with patch)' in step_name:

268 # Android instrumentation tests add a prefix before the step name, which

269 # doesn't appear on the summary step (without suffixes). To make sure we

270 # correctly ignore duplicate failures, we remove the prefix.

271 step_name = step_name.replace('Instrumentation test ', '')

272

273 step_name_with_no_modifier = step_name.replace(' (with patch)', '')

274 for other_step in failed_steps:

275 # A step which fails, and then is retried and also fails, will have its

276 # name without the ' (with patch)' again. Don't double count.

277 if other_step['name'] == step_name_with_no_modifier:

278 steps_to_ignore.append(other_step['name'])

279

280 # If a step fails without the patch, then the tree is busted. Don't count

281 # as flake.

282 step_name_without_patch = step_name_with_no_modifier + ' (without patch)'

283 for other_step in failed_steps:

284 if other_step['name'] == step_name_without_patch:

285 steps_to_ignore.append(step['name'])

286 steps_to_ignore.append(other_step['name'])

287

288 for step in failed_steps:

289 step_name = step['name']

290 if step_name in steps_to_ignore:

291 continue

292 flakes = get_flakes(step)

293 if not flakes:

294 continue

295 for flake in flakes:

296 flake_occurrence = FlakeOccurrence(name=step_name, failure=flake)

297 flaky_run.flakes.append(flake_occurrence)

298

299 add_failure_to_flake(flake, flaky_run)

300 flaky_run.put()

301

302

303 def get_int_value(properties, key):	115 def get_int_value(properties, key):

304 if not key in properties:	116 if not key in properties:

305 raise ValueError('key not found')	117 raise ValueError('key not found')

306 value = properties[key]	118 value = properties[key]

307 if type(value) == type(list()):	119 if type(value) == type(list()):

308 value = value[0]	120 value = value[0]

309 return int(value)	121 return int(value)

310	122

311	123

312 # Parses the json which we get from chromium-cq-status.	124 # Parses the json which we get from chromium-cq-status.

(...skipping 80 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
393 if duplicate:	205 if duplicate:

394 continue	206 continue

395	207

396 build_run.put()	208 build_run.put()

397	209

398 for previous_run in previous_runs:	210 for previous_run in previous_runs:

399 if previous_run.is_success == build_run.is_success:	211 if previous_run.is_success == build_run.is_success:

400 continue	212 continue

401 if success:	213 if success:

402 # We saw the flake and then the pass.	214 # We saw the flake and then the pass.

403 flaky_run = FlakyRun(	215 failure_run = previous_run

404 failure_run=previous_run.key,	216 success_run = build_run

405 failure_run_time_started=previous_run.time_started,

406 failure_run_time_finished=previous_run.time_finished,

407 success_run=build_run.key)

408 flaky_run.put()

409 logging_output.append(previous_run.key.parent().get().builder +

410 str(previous_run.buildnumber))

411 else:	217 else:

412 # We saw the pass and then the failure. Could happen when fetching	218 # We saw the pass and then the failure. Could happen when fetching

413 # historical data, or for the bot_update step (patch can't be	219 # historical data, or for the bot_update step (patch can't be

414 # applied cleanly anymore).	220 # applied cleanly anymore).

415 flaky_run = FlakyRun(	221 failure_run = build_run

416 failure_run=build_run.key,	222 success_run = previous_run

417 failure_run_time_started=build_run.time_started,

418 failure_run_time_finished=build_run.time_finished,

419 success_run=previous_run.key)

420 flaky_run.put()

421 logging_output.append(build_run.key.parent().get().builder +

422 str(build_run.buildnumber))

423	223

424 # Queue a task to fetch the error of this failure.	224 logging_output.append(failure_run.key.parent().get().builder +

425 deferred.defer(get_flaky_run_reason, flaky_run.key)	225 str(failure_run.buildnumber))

	226

	227 # Queue a task to fetch the error of this failure and create FlakyRun.

	228 taskqueue.add(

	229 queue_name='issue-updates',

	230 url='/issues/create_flaky_run',

	231 params={'failure_run_key': failure_run.key.urlsafe(),

	232 'success_run_key': success_run.key.urlsafe()})

426	233

427 return logging_output	234 return logging_output

428	235

429	236

430 def fetch_cq_status():	237 def fetch_cq_status():

431 """Fetches data from chromium-cq-status app and saves new data.	238 """Fetches data from chromium-cq-status app and saves new data.

432	239

433 Remembers old cursor and fetches new data.	240 Remembers old cursor and fetches new data.

434 """	241 """

435	242

(...skipping 104 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
540 else:	347 else:

541 fetch_status.begin = begin	348 fetch_status.begin = begin

542 fetch_status.end = end	349 fetch_status.end = end

543 fetch_status.cursor = cursor	350 fetch_status.cursor = cursor

544 fetch_status.put()	351 fetch_status.put()

545	352

546 if not more:	353 if not more:

547 return # finish the cron job and wait for next iteration	354 return # finish the cron job and wait for next iteration

548 except urllib2.URLError, e:	355 except urllib2.URLError, e:

549 logging.warning('Failed to fetch CQ status: %s', e.reason)	356 logging.warning('Failed to fetch CQ status: %s', e.reason)

OLD	NEW