Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(553)

Side by Side Diff: appengine/findit/util_scripts/remote_queries/try_job_data_metrics.py

Issue 2160763002: [Findit] Adding spike detection for try job requests to data query script (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # Copyright 2016 The Chromium Authors. All rights reserved. 1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 """Pulls historical try job metadata from Findit and prints a report.""" 5 """Pulls historical try job metadata from Findit and prints a report."""
6 6
7 import argparse 7 import argparse
8 from collections import defaultdict 8 from collections import defaultdict
9 import datetime 9 import datetime
10 import json 10 import json
11 import numpy 11 import numpy
12 import os 12 import os
13 import sys 13 import sys
14 14
15 try:
16 from matplotlib import pyplot
17 except ImportError:
18 pyplot = None
19
15 _REMOTE_API_DIR = os.path.join(os.path.dirname(__file__), os.path.pardir) 20 _REMOTE_API_DIR = os.path.join(os.path.dirname(__file__), os.path.pardir)
16 sys.path.insert(1, _REMOTE_API_DIR) 21 sys.path.insert(1, _REMOTE_API_DIR)
17 22
18 import remote_api 23 import remote_api
19 24
20 from model.wf_try_job_data import WfTryJobData 25 from model.wf_try_job_data import WfTryJobData
21 26
22 27
23 NOT_AVAILABLE = 'N/A' 28 NOT_AVAILABLE = 'N/A'
24 29
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
def _FormatSecondsAsHMS(seconds):
  """Renders a duration in seconds as an 'H:MM:SS' string.

  Args:
    seconds: A number of seconds, or the NOT_AVAILABLE sentinel.

  Returns:
    A string of the form 'H:MM:SS', or NOT_AVAILABLE if the input was the
    NOT_AVAILABLE sentinel (propagated unchanged for display).
  """
  if seconds == NOT_AVAILABLE:
    return NOT_AVAILABLE

  # Split off whole hours first, then minutes and leftover seconds.
  hours, remainder = divmod(seconds, 3600)
  minutes, remaining_seconds = divmod(remainder, 60)
  return '%d:%02d:%02d' % (hours, minutes, remaining_seconds)
97 102
def _GetRequestSpikes(request_times, time_window_seconds=30*60,
                      minimum_spike_size=3, show_plot=False):
  """Calculates and optionally plots spikes in try jobs by request time.

  A 'spike' is a window of at least |minimum_spike_size| requests all made
  within |time_window_seconds| of the window's first request. Windows are
  anchored at their first request, so a burst lasting longer than one window
  may be reported as multiple consecutive spikes.

  Args:
    request_times: List of datetime objects representing try job request
        times. May be unsorted; a sorted copy is used internally.
    time_window_seconds: Maximum number of seconds between requests to count
        as a spike.
    minimum_spike_size: Minimum number of requests within the specified time
        window needed to count as a spike.
    show_plot: Boolean whether to display visual graphs of the request times.
        Requires matplotlib; a hint is printed if it is not installed.

  Returns:
    A 3-tuple of:
    spike_count: The number of spikes found.
    average_spike_size: The average number of requests in each spike.
    maximum_spike_size: The number of requests in the biggest spike, or 0 if
        no spikes were found.
  """
  request_times = sorted(request_times)

  if show_plot:
    if pyplot:
      pyplot.plot(request_times, [i for i in range(len(request_times))], 'x')
      pyplot.show()
    else:
      print ('In order to show plots, matplotlib needs to be installed. To '
             'install, please run \'sudo pip install matplotlib\'')

  if not request_times:
    # Guard against IndexError below; equivalent to finding no spikes.
    return 0, _GetAverageOfNumbersInList([]), 0

  candidate_spike_start = request_times[0]
  points_in_spike = 1
  spike_count = 0
  spike_sizes = []

  for point_being_examined in request_times[1:]:
    if ((point_being_examined - candidate_spike_start).total_seconds() <
        time_window_seconds):
      points_in_spike += 1
    else:
      # The time window has passed. Record the closed window if it qualifies
      # as a spike, then need a new starting point.
      if points_in_spike >= minimum_spike_size:
        spike_count += 1
        spike_sizes.append(points_in_spike)

      candidate_spike_start = point_being_examined
      points_in_spike = 1  # Start over.

  # Account for the final window: previously a trailing spike was dropped
  # because spikes were only recorded when a later request fell outside the
  # current window.
  if points_in_spike >= minimum_spike_size:
    spike_count += 1
    spike_sizes.append(points_in_spike)

  return (spike_count, _GetAverageOfNumbersInList(spike_sizes),
          max(spike_sizes) if spike_sizes else 0)
151
98 def _GetReportInformation(try_job_data_list, start_date, end_date): 152 def _GetReportInformation(try_job_data_list, start_date, end_date):
99 """Computes and returns try job metadata. 153 """Computes and returns try job metadata.
100 154
101 Args: 155 Args:
102 try_job_data_list: A list of WfTryJobData entities. 156 try_job_data_list: A list of WfTryJobData entities.
103 start_date: The earliest request date to compute data. 157 start_date: The earliest request date to compute data.
104 end_date: The latest request date to compute data. 158 end_date: The latest request date to compute data.
105 159
106 Returns: 160 Returns:
107 A dict in the following format: 161 A dict in the following format:
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
155 under_five_minutes_rate = NOT_AVAILABLE 209 under_five_minutes_rate = NOT_AVAILABLE
156 under_fifteen_minutes_rate = NOT_AVAILABLE 210 under_fifteen_minutes_rate = NOT_AVAILABLE
157 under_thirty_minutes_rate = NOT_AVAILABLE 211 under_thirty_minutes_rate = NOT_AVAILABLE
158 over_thirty_minutes_rate = NOT_AVAILABLE 212 over_thirty_minutes_rate = NOT_AVAILABLE
159 213
160 if try_job_data_list: 214 if try_job_data_list:
161 try_jobs_per_day = ( 215 try_jobs_per_day = (
162 len(try_job_data_list) / float((end_date - start_date).days)) 216 len(try_job_data_list) / float((end_date - start_date).days))
163 regression_range_sizes = [] 217 regression_range_sizes = []
164 execution_times_seconds = [] 218 execution_times_seconds = []
219 request_times = []
165 in_queue_times = [] 220 in_queue_times = []
166 end_to_end_times = [] 221 end_to_end_times = []
167 commits_analyzed = [] 222 commits_analyzed = []
168 culprits_detected = 0 223 culprits_detected = 0
169 errors_detected = 0 224 errors_detected = 0
170 number_under_five_minutes = 0 225 number_under_five_minutes = 0
171 number_under_fifteen_minutes = 0 226 number_under_fifteen_minutes = 0
172 number_under_thirty_minutes = 0 227 number_under_thirty_minutes = 0
173 number_over_thirty_minutes = 0 228 number_over_thirty_minutes = 0
174 total_number_of_try_jobs = len(try_job_data_list) 229 total_number_of_try_jobs = len(try_job_data_list)
(...skipping 11 matching lines...) Expand all
186 execution_times_seconds.append(execution_time) 241 execution_times_seconds.append(execution_time)
187 242
188 # In-queue time. 243 # In-queue time.
189 if try_job_data.start_time and try_job_data.request_time: 244 if try_job_data.start_time and try_job_data.request_time:
190 in_queue_time_delta = ( 245 in_queue_time_delta = (
191 try_job_data.start_time - try_job_data.request_time) 246 try_job_data.start_time - try_job_data.request_time)
192 in_queue_time = in_queue_time_delta.total_seconds() 247 in_queue_time = in_queue_time_delta.total_seconds()
193 in_queue_times.append(in_queue_time) 248 in_queue_times.append(in_queue_time)
194 249
195 # Total time end-to-end. 250 # Total time end-to-end.
196 if try_job_data.request_time and try_job_data.end_time: 251 if try_job_data.request_time:
197 total_time_delta = try_job_data.end_time - try_job_data.start_time 252 request_times.append(try_job_data.request_time)
198 total_time_seconds = total_time_delta.total_seconds()
199 end_to_end_times.append(total_time_seconds)
200 253
201 if total_time_seconds < 300: # Under 5 minutes. 254 if try_job_data.end_time:
202 number_under_five_minutes += 1 255 total_time_delta = try_job_data.end_time - try_job_data.start_time
203 elif total_time_seconds < 900: # Under 15 minutes. 256 total_time_seconds = total_time_delta.total_seconds()
204 number_under_fifteen_minutes += 1 257 end_to_end_times.append(total_time_seconds)
205 elif total_time_seconds < 1800: # Under 30 minutes. 258
206 number_under_thirty_minutes += 1 259 if total_time_seconds < 300: # Under 5 minutes.
207 else: # Over 30 minutes. 260 number_under_five_minutes += 1
208 number_over_thirty_minutes += 1 261 elif total_time_seconds < 900: # Under 15 minutes.
262 number_under_fifteen_minutes += 1
263 elif total_time_seconds < 1800: # Under 30 minutes.
264 number_under_thirty_minutes += 1
265 else: # Over 30 minutes.
266 number_over_thirty_minutes += 1
209 267
210 # Number of commits analyzed. 268 # Number of commits analyzed.
211 if try_job_data.number_of_commits_analyzed: 269 if try_job_data.number_of_commits_analyzed:
212 commits_analyzed.append(try_job_data.number_of_commits_analyzed) 270 commits_analyzed.append(try_job_data.number_of_commits_analyzed)
213 271
214 # Culprit detection rate. 272 # Culprit detection rate.
215 if try_job_data.culprits: 273 if try_job_data.culprits:
216 culprits_detected += 1 274 culprits_detected += 1
217 275
218 if try_job_data.error: 276 if try_job_data.error:
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
258 316
259 under_five_minutes_rate = ( 317 under_five_minutes_rate = (
260 float(number_under_five_minutes) / total_number_of_try_jobs) 318 float(number_under_five_minutes) / total_number_of_try_jobs)
261 under_fifteen_minutes_rate = ( 319 under_fifteen_minutes_rate = (
262 float(number_under_fifteen_minutes) / total_number_of_try_jobs) 320 float(number_under_fifteen_minutes) / total_number_of_try_jobs)
263 under_thirty_minutes_rate = ( 321 under_thirty_minutes_rate = (
264 float(number_under_thirty_minutes) / total_number_of_try_jobs) 322 float(number_under_thirty_minutes) / total_number_of_try_jobs)
265 over_thirty_minutes_rate = ( 323 over_thirty_minutes_rate = (
266 float(number_over_thirty_minutes) / total_number_of_try_jobs) 324 float(number_over_thirty_minutes) / total_number_of_try_jobs)
267 325
326 # Calculate try job spikes.
327 spike_count, average_spike_size, maximum_spike_size = _GetRequestSpikes(
328 request_times, time_window_seconds=30*60, minimum_spike_size=3,
329 show_plot=False)
330
268 return { 331 return {
269 'try_jobs_per_day': _FormatDigits(try_jobs_per_day), 332 'try_jobs_per_day': _FormatDigits(try_jobs_per_day),
270 'average_regression_range_size': _FormatDigits( 333 'average_regression_range_size': _FormatDigits(
271 average_regression_range_size), 334 average_regression_range_size),
272 'median_regression_range_size': median_regression_range_size, 335 'median_regression_range_size': median_regression_range_size,
273 'average_execution_time': _FormatSecondsAsHMS(_FormatDigits( 336 'average_execution_time': _FormatSecondsAsHMS(_FormatDigits(
274 average_execution_time)), 337 average_execution_time)),
275 'median_execution_time': _FormatSecondsAsHMS(_FormatDigits( 338 'median_execution_time': _FormatSecondsAsHMS(_FormatDigits(
276 median_execution_time)), 339 median_execution_time)),
277 'average_end_to_end_time': _FormatSecondsAsHMS(_FormatDigits( 340 'average_end_to_end_time': _FormatSecondsAsHMS(_FormatDigits(
278 average_end_to_end_time)), 341 average_end_to_end_time)),
279 'median_end_to_end_time': _FormatSecondsAsHMS(_FormatDigits( 342 'median_end_to_end_time': _FormatSecondsAsHMS(_FormatDigits(
280 median_end_to_end_time)), 343 median_end_to_end_time)),
281 'average_time_in_queue': _FormatSecondsAsHMS( 344 'average_time_in_queue': _FormatSecondsAsHMS(
282 _FormatDigits(average_time_in_queue)), 345 _FormatDigits(average_time_in_queue)),
283 'median_time_in_queue': _FormatSecondsAsHMS(_FormatDigits( 346 'median_time_in_queue': _FormatSecondsAsHMS(_FormatDigits(
284 median_time_in_queue)), 347 median_time_in_queue)),
285 'average_commits_analyzed': _FormatDigits(average_commits_analyzed), 348 'average_commits_analyzed': _FormatDigits(average_commits_analyzed),
286 'median_commits_analyzed': median_commits_analyzed, 349 'median_commits_analyzed': median_commits_analyzed,
287 'longest_execution_time': longest_execution_time, 350 'longest_execution_time': longest_execution_time,
288 'shortest_execution_time': shortest_execution_time, 351 'shortest_execution_time': shortest_execution_time,
289 'number_of_try_jobs': number_of_try_jobs, 352 'number_of_try_jobs': number_of_try_jobs,
290 'detection_rate': _FormatDigits(detection_rate), 353 'detection_rate': _FormatDigits(detection_rate),
291 'error_rate': _FormatDigits(error_rate), 354 'error_rate': _FormatDigits(error_rate),
292 'time_per_revision': _FormatSecondsAsHMS( 355 'time_per_revision': _FormatSecondsAsHMS(
293 _FormatDigits(time_per_revision)), 356 _FormatDigits(time_per_revision)),
294 'under_five_minutes_rate': _FormatDigits(under_five_minutes_rate), 357 'under_five_minutes_rate': _FormatDigits(under_five_minutes_rate),
295 'under_fifteen_minutes_rate': _FormatDigits(under_fifteen_minutes_rate), 358 'under_fifteen_minutes_rate': _FormatDigits(under_fifteen_minutes_rate),
296 'under_thirty_minutes_rate': _FormatDigits(under_thirty_minutes_rate), 359 'under_thirty_minutes_rate': _FormatDigits(under_thirty_minutes_rate),
297 'over_thirty_minutes_rate': _FormatDigits(over_thirty_minutes_rate) 360 'over_thirty_minutes_rate': _FormatDigits(over_thirty_minutes_rate),
361 'request_spike_count': spike_count,
362 'request_spike_average_size': average_spike_size,
363 'request_spike_maximum_size': maximum_spike_size,
298 } 364 }
299 365
300 366
301 def PrintCommonStats(try_job_data_list, start_date, end_date, indent): 367 def PrintCommonStats(try_job_data_list, start_date, end_date, indent):
302 """Takes a list of WfTryJobData entities and prints their stats.""" 368 """Takes a list of WfTryJobData entities and prints their stats."""
303 spaces = '' 369 spaces = ''
304 for _ in range(indent): 370 for _ in range(indent):
305 spaces += ' ' 371 spaces += ' '
306 372
307 report_info = _GetReportInformation(try_job_data_list, start_date, end_date) 373 report_info = _GetReportInformation(try_job_data_list, start_date, end_date)
(...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after
516 if args_dict[parsed_arg]: 582 if args_dict[parsed_arg]:
517 ordered_args.append(parsed_arg) 583 ordered_args.append(parsed_arg)
518 584
519 return ordered_args 585 return ordered_args
520 586
521 587
if __name__ == '__main__':
  # Set up the Remote API to use services on the live App Engine.
  remote_api.EnableRemoteApi(app_id='findit-for-me')

  START_DATE = datetime.datetime(2016, 4, 17)
  END_DATE = datetime.datetime(2016, 7, 15)

  # Pull every try job data entity requested inside the date window.
  categorized_data = WfTryJobData.query(
      WfTryJobData.request_time >= START_DATE,
      WfTryJobData.request_time < END_DATE).fetch()

  # Successively split the data by each requested categorization option.
  for arg in GetArgsInOrder():
    categorized_data = SplitStructByOption(categorized_data, arg)

  # TODO(lijeffrey): Display data in an html page instead of printing.
  PrettyPrint(categorized_data, START_DATE, END_DATE)
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698