Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(160)

Side by Side Diff: infra/tools/antibody/compute_stats.py

Issue 1235373004: Added script to generate stats on a git checkout (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@new_antibody_db_schema
Patch Set: Fixed nits and changed output for antibody stats Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « infra/tools/antibody/cloudsql_connect.py ('k') | infra/tools/antibody/git_commit_parser.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import json
6
7
8 # functions that return stats calculated as lists of lists by month
9 def ratio_calculator(numerator, denominator):
10 """Computes the ratio of the counts in two lists
11
12 Args:
13 numerator(list): list of lists with a year-month string as the first index
14 and a count as the second index
15 denominator(list): list of lists with a year-month string as the first index
16 and a count as the second index
17
18 Return:
19 ratios(list): a list of lists with ratios rounded to three decimal places
20 e.g. [['2015-01', .251], ['2014-10', .742]]
21 """
22 ratios = []
23 for i in numerator:
24 for j in denominator:
25 if i[0] == j[0] and j[1] != 0:
26 ratios.append([i[0], round(float(i[1]) / j[1], 3)])
27 break
28 elif i[0] == j[0]:
29 ratios.append([i[0], 0])
30 return ratios
31
32
33 def total_commits(cc):
34 """Counts all the git commits sorted by month and year
35
36 Args:
37 cc(cursor)
38
39 Return:
40 results(list): a list of lists e.g. [['2014-01', 20], ['2014-02', 45]]
41 """
42 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
43 FROM git_commit""")
44 months = cc.fetchall()
45 results = []
46 for month in months:
47 month = month[0]
48 cc.execute("""SELECT COUNT(*)
49 FROM git_commit
50 WHERE YEAR(git_commit.timestamp) = %s
51 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
52 result = cc.fetchone()
53 results.append([month, int(result[0])])
54 return results
pgervais 2015/07/21 16:48:34 (not for this CL) This can be done in SQL only, an
55
56
57 def total_suspicious(cc):
58 """Counts the number of commits with no review url or TBRed with no lgtm
59 sorted by month and year
60
61 Args:
62 cc(cursor)
63
64 Return:
65 results(list): a list of lists
66 """
67 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
68 FROM git_commit""")
69 months = cc.fetchall()
70 results = []
71 for month in months:
72 month = month[0]
73 cc.execute("""SELECT COUNT(*)
74 FROM review
75 INNER JOIN git_commit
76 ON review.review_url = git_commit.review_url
77 INNER JOIN commit_people
78 ON commit_people.git_commit_hash = git_commit.hash
79 LEFT JOIN (
80 SELECT review_url, COUNT(*) AS c
81 FROM review_people
82 WHERE type = 'lgtm'
83 GROUP BY review_url) lgtm_count
84 ON review.review_url = lgtm_count.review_url
85 WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL
86 AND commit_people.type = 'tbr'
87 AND YEAR(git_commit.timestamp) = %s
88 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
89 no_lgtm = cc.fetchone()
90 cc.execute("""SELECT COUNT(*)
91 FROM git_commit
92 WHERE review_url IS NULL
93 AND YEAR(git_commit.timestamp) = %s
94 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
95 no_review = cc.fetchone()
96 result = int(no_lgtm[0]) + int(no_review[0])
97 results.append([month, result])
98 return results
99
100
101 def total_tbr(cc):
102 """Counts the number of commits with a TBR
103
104 Args:
105 cc(cursor)
106
107 Return:
108 results(list): a list of lists
109 """
110 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
111 FROM git_commit""")
112 months = cc.fetchall()
113 results = []
114 for month in months:
115 month = month[0]
116 cc.execute("""SELECT COUNT(DISTINCT git_commit_hash)
117 FROM commit_people
118 INNER JOIN git_commit
119 ON commit_people.git_commit_hash = git_commit.hash
120 WHERE commit_people.type = 'tbr'
121 AND YEAR(git_commit.timestamp) = %s
122 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
123 result = cc.fetchone()
124 results.append([month, int(result[0])])
125 return results
126
127
128 def tbr_no_lgtm(cc):
129 """Counts the number of commits with a TBR that have not been lgtm'ed
130
131 Args:
132 cc(cursor)
133
134 Return:
135 results(list): a list of lists
136 """
137 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
138 FROM git_commit""")
139 months = cc.fetchall()
140 results = []
141 for month in months:
142 month = month[0]
143 cc.execute("""SELECT COUNT(*)
144 FROM review
145 INNER JOIN git_commit ON review.review_url = git_commit.review_url
146 INNER JOIN commit_people
147 ON commit_people.git_commit_hash = git_commit.hash
148 LEFT JOIN (
149 SELECT review_url, COUNT(*) AS c FROM review_people
150 WHERE type = 'lgtm' GROUP BY review_url) lgtm_count
151 ON review.review_url = lgtm_count.review_url
152 WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL
153 AND commit_people.type = 'tbr'
154 AND YEAR(git_commit.timestamp) = %s
155 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
156 result = cc.fetchone()
157 results.append([month, int(result[0])])
158 return results
pgervais 2015/07/21 16:48:34 See my comment on total_commits above. You may be
159
160
161 def blank_tbr(cc):
162 """Counts the number of occurences of TBR= with no reviewer listed
163
164 Args:
165 cc(cursor)
166
167 Return:
168 results(list): a list of lists
169 """
170 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
171 FROM git_commit""")
172 months = cc.fetchall()
173 results = []
174 for month in months:
175 month = month[0]
176 cc.execute("""SELECT COUNT(*)
177 FROM commit_people
178 INNER JOIN git_commit
179 ON commit_people.git_commit_hash = git_commit.hash
180 WHERE commit_people.people_email_address = 'NOBODY'
181 AND YEAR(git_commit.timestamp) = %s
182 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
183 result = cc.fetchone()
184 results.append([month, int(result[0])])
185 return results
186
187
188 def no_review_url(cc):
189 """Counts the number of commits with no review url
190
191 Args:
192 cc(cursor)
193
194 Return:
195 results(list): a list of lists
196 """
197 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
198 FROM git_commit""")
199 months = cc.fetchall()
200 results = []
201 for month in months:
202 month = month[0]
203 cc.execute("""SELECT COUNT(*)
204 FROM git_commit
205 WHERE review_url IS NULL
206 AND YEAR(git_commit.timestamp) = %s
207 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
208 result = cc.fetchone()
209 results.append([month, int(result[0])])
210 return results
211
212
213 # functions that return totaled stats for a set period back in time
214 def totaled_ratio_calculator(numerator, denominator):
215 """Computes the ratio of the counts in two lists
216
217 Args:
218 numerator(int): a totaled count
219 denominator(int): a totaled count
220
221 Return:
222 ratio(float): a ratio rounded to three decimal places
223 """
224 ratio = round(float(numerator) / denominator, 3)
pgervais 2015/07/21 16:48:34 Take into account denominator == 0, as you've done
225 return ratio
226
227
228 def totaled_total_commits(cc, sql_time_specification):
229 """Counts all the git commits in a given timeframe
230
231 Args:
232 cc(cursor)
233
pgervais 2015/07/21 16:48:34 Describe sql_time_specification (same below)
234 Return:
235 result(int): a count of all the commits
236 """
237 cc.execute("""SELECT COUNT(*)
238 FROM git_commit
239 WHERE %s""" % sql_time_specification)
240 result = cc.fetchone()
241 return int(result[0])
242
243
244 def totaled_total_suspicious(cc, sql_time_specification):
245 """Counts the number of commits with no review url or TBRed with no lgtm
246 in a given timeframe
247
248 Args:
249 cc(cursor)
250
251 Return:
252 result(int): a count of all the suspicious commits
253 """
254 cc.execute("""SELECT COUNT(*)
255 FROM review
256 INNER JOIN git_commit
257 ON review.review_url = git_commit.review_url
258 INNER JOIN commit_people
259 ON commit_people.git_commit_hash = git_commit.hash
260 LEFT JOIN (
261 SELECT review_url, COUNT(*) AS c
262 FROM review_people
263 WHERE type = 'lgtm'
264 GROUP BY review_url) lgtm_count
265 ON review.review_url = lgtm_count.review_url
266 WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL
267 AND commit_people.type = 'tbr' AND %s""" % sql_time_specification)
268 no_lgtm = cc.fetchone()
269 cc.execute("""SELECT COUNT(*)
270 FROM git_commit
271 WHERE review_url IS NULL AND %s""" % sql_time_specification)
272 no_review = cc.fetchone()
273 result = int(no_lgtm[0]) + int(no_review[0])
274 return result
275
276
277 def totaled_total_tbr(cc, sql_time_specification):
278 """Counts the total number of commits with a TBR in a given timeframe
279
280 Args:
281 cc(cursor)
282
283 Return:
284 result(int): a count of all commits with a TBR
285 """
286 cc.execute("""SELECT COUNT(DISTINCT git_commit_hash)
287 FROM commit_people
288 INNER JOIN git_commit
289 ON commit_people.git_commit_hash = git_commit.hash
290 WHERE commit_people.type = 'tbr' AND %s""" % sql_time_specification)
291 result = cc.fetchone()
292 return int(result[0])
293
294
295 def totaled_tbr_no_lgtm(cc, sql_time_specification):
296 """Counts the number of commits with a TBR that have not been lgtm'ed
297 in a given timeframe
298
299 Args:
300 cc(cursor)
301
302 Return:
303 result(int): a count of all commits with a TBR and no lgtm
304 """
305 cc.execute("""SELECT COUNT(*)
306 FROM review
307 INNER JOIN git_commit
308 ON review.review_url = git_commit.review_url
309 INNER JOIN commit_people
310 ON commit_people.git_commit_hash = git_commit.hash
311 LEFT JOIN (
312 SELECT review_url, COUNT(*) AS c
313 FROM review_people
314 WHERE type = 'lgtm'
315 GROUP BY review_url) lgtm_count
316 ON review.review_url = lgtm_count.review_url
317 WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL
318 AND commit_people.type = 'tbr' AND %s""" % sql_time_specification)
319 result = cc.fetchone()
320 return int(result[0])
321
322
323 def totaled_blank_tbr(cc, sql_time_specification):
324 """Counts the number of occurences of TBR= with no reviewer listed in a
325 given timeframe
326
327 Args:
328 cc(cursor)
329
330 Return:
331 result(int): a count of all blank TBRs (TBR=)
332 """
333 cc.execute("""SELECT COUNT(*)
334 FROM commit_people
335 INNER JOIN git_commit
336 ON commit_people.git_commit_hash = git_commit.hash
337 WHERE commit_people.people_email_address = 'NOBODY'
338 AND %s""" % sql_time_specification)
339 result = cc.fetchone()
340 return int(result[0])
341
342
343 def totaled_no_review_url(cc, sql_time_specification):
344 """Counts the number of commits with no review url in a given timeframe
345
346 Args:
347 cc(cursor)
348
349 Return:
350 result(int): a count of all commits with no review_url
351 """
352 cc.execute("""SELECT COUNT(*)
353 FROM git_commit
354 WHERE review_url IS NULL AND %s""" % sql_time_specification)
355 result = cc.fetchone()
356 return int(result[0])
357
358
359 # TODO(keelerh): change the SQL query to avoid using temporary tables
360 # add time specification back in
361 def tbr_to_total_people_ratio(cc, sql_time_specification=''):
362 """Calculate TBR stats by author for the Antibody leaderboard and writes
363 them to a json file
364
365 Args:
366 cc(cursor)
367
368 Return:
369 people_tbr_data(list): a list of dictionaries containing author email,
370 ratio of TBRed commits to total commits, number of
371 TBRed commits, total number of commits, and overall
372 rank, reverse sorted by ratio
373 """
374 cc.execute("""CREATE TEMPORARY TABLE addr_tbr_hash (email VARCHAR(200), tbr
pgervais 2015/07/21 16:48:34 If the insert statement fails, the table will rema
375 INT, hash VARCHAR(40), `timestamp` TIMESTAMP)""")
376 cc.execute("""INSERT INTO addr_tbr_hash (email, tbr, hash)
377 SELECT commit_people.people_email_address, tbr_count.c,
378 commit_people.git_commit_hash FROM commit_people
379 LEFT JOIN (
380 SELECT git_commit_hash, COUNT(*)
381 AS c FROM commit_people
382 INNER JOIN git_commit
383 ON git_commit.hash = commit_people.git_commit_hash
384 WHERE commit_people.type='tbr' %s
385 GROUP BY git_commit_hash) tbr_count
386 ON commit_people.git_commit_hash = tbr_count.git_commit_hash
387 WHERE commit_people.type='author'""" % sql_time_specification)
388 cc.execute("""SELECT email, SUM(CASE WHEN tbr<>0 THEN 1 ELSE 0 END) num_tbrd,
389 COUNT(*)
390 AS num_total
391 FROM addr_tbr_hash
392 GROUP BY email""")
393 people_tbr_data = cc.fetchall()
394 people_tbr_data = [[data_item[0],
395 round(float(data_item[1]) / data_item[2], 3),
396 int(data_item[1]), int(data_item[2])] for data_item in
397 people_tbr_data]
398 sorted_people_tbr_data = sorted(people_tbr_data, key=lambda x: x[1],
399 reverse=True)
400 top_100 = sorted_people_tbr_data[:100]
401 ordered_people_tbr_data = []
402 for i in range(len(top_100)):
403 temp = {}
404 temp['email'] = sorted_people_tbr_data[i][0]
405 temp['ratio'] = sorted_people_tbr_data[i][1]
406 temp['suspicious'] = sorted_people_tbr_data[i][2]
407 temp['total'] = sorted_people_tbr_data[i][3]
408 temp['rank'] = i
409 ordered_people_tbr_data.append(temp)
410 cc.execute("""DROP TABLE addr_tbr_hash""")
411 return ordered_people_tbr_data
412
413
414 def compute_monthly_breakdown_stats(cc):
415 """Computes stats broken down by month
416
417 Args:
418 cc(cursor)
419
420 Returns:
421 monthly_breakdown_stats(dict): contains stats calculated per month
422 including: timeframe, total_commits,
423 total_suspicious, suspicious_to_total_ratio,
424 total_tbr, tbr_no_lgtm,
425 no_tbr_lgtm_to_total_ratio, blank_tbr,
426 blank_tbr_to_total_ratio, no_review_url,
427 no_review_url_to_total_ratio
428 """
429 monthly_breakdown_stats = {}
430 monthly_breakdown_stats['timeframe'] = 'monthly_breakdown'
431 tot_commits = total_commits(cc)
432 monthly_breakdown_stats['total_commits'] = tot_commits
433 tot_suspicious = total_suspicious(cc)
434 monthly_breakdown_stats['total_suspicious'] = tot_suspicious
435 monthly_breakdown_stats['suspicious_to_total_ratio'] = ratio_calculator(
436 tot_suspicious, tot_commits)
437 tot_tbr = total_tbr(cc)
438 monthly_breakdown_stats['total_tbr'] = tot_tbr
439 tot_tbr_no_lgtm = tbr_no_lgtm(cc)
440 monthly_breakdown_stats['tbr_no_lgtm'] = tot_tbr_no_lgtm
441 monthly_breakdown_stats['no_tbr_lgtm_to_total_ratio'] = ratio_calculator(
442 tot_tbr_no_lgtm, tot_tbr)
443 tot_blank_tbr = blank_tbr(cc)
444 monthly_breakdown_stats['blank_tbrs'] = tot_blank_tbr
445 monthly_breakdown_stats['blank_tbr_to_total_ratio'] = ratio_calculator(
446 tot_blank_tbr, tot_tbr)
447 tot_no_review_url = no_review_url(cc)
448 monthly_breakdown_stats['no_review_url'] = tot_no_review_url
449 monthly_breakdown_stats['no_review_url_to_total_ratio'] = ratio_calculator(
450 tot_no_review_url, tot_commits)
451 return monthly_breakdown_stats
452
453
454 def compute_stats_by_time(cc):
455 """Computes the stats for the past 7 days, past 30 days, and all time
456
457 Args:
458 cc(cursor)
459
460 Returns:
461 output(list): three dictionaries containing stats for the past 7 days, 30
462 days, and all time respectively, each including timeframe,
463 suspicious_to_total_ratio, total_commits, tbr_no_lgtm,
464 no_review_url, blank_tbr
465 """
466 stats_7_days, stats_30_days, stats_all_time = {}, {}, {}
467 for d in [stats_7_days, stats_30_days, stats_all_time]:
468 if d == stats_7_days:
469 d['timeframe'] = '7_days'
pgervais 2015/07/21 16:48:34 Do that on line 466: stats_7_days = {'timeframe':
470 sql_insert = 'DATEDIFF(NOW(), git_commit.timestamp) <= 7'
pgervais 2015/07/21 16:48:34 Just set the value instead of writing a where clau
keelerh 2015/07/22 21:17:55 I was initially going to do that, but then didn't
471 elif d == stats_30_days:
472 d['timeframe'] = '30_days'
473 sql_insert = 'DATEDIFF(NOW(), git_commit.timestamp) <= 30'
474 elif d == stats_all_time:
475 d['timeframe'] = 'all_time'
476 sql_insert = 'DATEDIFF(git_commit.timestamp, NOW()) < 0'
477 tot_commits = totaled_total_commits(cc, sql_insert)
478 tot_suspicious = totaled_total_suspicious(cc, sql_insert)
479 d['suspicious_to_total_ratio'] = totaled_ratio_calculator(tot_suspicious,
480 tot_commits)
481 d['total_commits'] = tot_commits
482 d['tbr_no_lgtm'] = totaled_tbr_no_lgtm(cc, sql_insert)
483 d['no_review_url'] = totaled_no_review_url(cc, sql_insert)
484 d['blank_tbr'] = totaled_blank_tbr(cc, sql_insert)
485 output = [stats_7_days, stats_30_days, stats_all_time]
486 return output
487
488
489 def all_monthly_stats(cc, filename):
490 """Write stats calculated over different time segments as json to file
491
492 Args:
493 cc(cursor)
494 filename(str): the json file to write to
495 """
496 output = {}
497 stats_monthly_breakdown = compute_monthly_breakdown_stats(cc)
498 stats_by_time = compute_stats_by_time(cc)
499 stats_by_time.append(stats_monthly_breakdown)
500 for d in stats_by_time:
501 label = d['timeframe']
502 output[label] = d
503 with open(filename, 'w') as f:
504 json.dump(output, f)
505
506
507 def all_time_leaderboard(cc, filename):
508 """Writes stats for all-time Antibody leaderboard to a json file
509
510 Args:
511 cc(cursor)
512 filename(str): the json file to write to
513 """
514 output = tbr_to_total_people_ratio(cc)
515 with open(filename, 'w') as f:
516 json.dump(output, f)
517
518
519 def past_month_leaderboard(cc, filename):
520 """Writes stats for the past month Antibody leaderboard to a json file
521
522 Args:
523 cc(cursor)
524 filename(str): the json file to write to
525 """
526 sql_time_specification = 'AND DATEDIFF(NOW(), git_commit.timestamp) <= 30'
527 output = tbr_to_total_people_ratio(cc, sql_time_specification)
528 with open(filename, 'w') as f:
529 json.dump(output, f)
OLDNEW
« no previous file with comments | « infra/tools/antibody/cloudsql_connect.py ('k') | infra/tools/antibody/git_commit_parser.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698