Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(89)

Side by Side Diff: infra/tools/antibody/compute_stats.py

Issue 1235373004: Added script to generate stats on a git checkout (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@new_antibody_db_schema
Patch Set: Rebase Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « infra/tools/antibody/code_review_parse.py ('k') | infra/tools/antibody/git_commit_parser.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import json
6
7
8 def ratio_calculator(numerator, denominator):
9 """Computes the ratio of the counts in two lists
10
11 Args:
12 numerator(list): list of lists with a year-month string as the first index
13 and a count as the second index
14 denominator(list): list of lists with a year-month string as the first index
15 and a count as the second index
16
17 Return:
18 ratios(list): a list of lists with ratios rounded to three decimal places
19 e.g. [['2015-01', .251], ['2014-10', .742]]
20 """
21 ratios = []
22 for i in numerator:
23 for j in denominator:
24 if i[0] == j[0] and j[1] != 0:
25 ratios.append([i[0], round(float(i[1]) / j[1], 3)])
26 break
27 elif i[0] == j[0]:
28 ratios.append([i[0], 0])
29 return ratios
30
31
32 def totaled_ratio_calculator(numerator, denominator):
33 """Computes the ratio of the counts in two lists
34
35 Args:
36 numerator(int): a totaled count
37 denominator(int): a totaled count
38
39 Return:
40 ratio(float): a ratio rounded to three decimal places
41 """
42 if denominator != 0:
43 ratio = round(float(numerator) / denominator, 3)
44 else:
45 raise ZeroDivisionError
46 return ratio
47
48
49 # functions that return stats calculated as lists of lists by month
50 def total_commits(cc): # pragma: no cover
51 """Counts all the git commits sorted by month and year
52
53 Args:
54 cc(cursor)
55
56 Return:
57 results(list): a list of lists e.g. [['2014-01', 20], ['2014-02', 45]]
58 """
59 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
60 FROM git_commit""")
61 months = cc.fetchall()
62 results = []
63 for month in months:
64 month = month[0]
65 cc.execute("""SELECT COUNT(*)
66 FROM git_commit
67 WHERE YEAR(git_commit.timestamp) = %s
68 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
69 result = cc.fetchone()
70 results.append([month, int(result[0])])
71 return results
72
73
74 # TODO(keelerh): group by month and year directly in the sql query instead of
75 # with a for loop
76 def total_suspicious(cc): # pragma: no cover
77 """Counts the number of commits with no review url or TBRed with no lgtm
78 sorted by month and year
79
80 Args:
81 cc(cursor)
82
83 Return:
84 results(list): a list of lists
85 """
86 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
87 FROM git_commit""")
88 months = cc.fetchall()
89 results = []
90 for month in months:
91 month = month[0]
92 cc.execute("""SELECT COUNT(*)
93 FROM review
94 INNER JOIN git_commit
95 ON review.review_url = git_commit.review_url
96 INNER JOIN commit_people
97 ON commit_people.git_commit_hash = git_commit.hash
98 LEFT JOIN (
99 SELECT review_url, COUNT(*) AS c
100 FROM review_people
101 WHERE type = 'lgtm'
102 GROUP BY review_url) lgtm_count
103 ON review.review_url = lgtm_count.review_url
104 WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL
105 AND commit_people.type = 'tbr'
106 AND YEAR(git_commit.timestamp) = %s
107 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
108 no_lgtm = cc.fetchone()
109 cc.execute("""SELECT COUNT(*)
110 FROM git_commit
111 WHERE review_url IS NULL
112 AND YEAR(git_commit.timestamp) = %s
113 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
114 no_review = cc.fetchone()
115 result = int(no_lgtm[0]) + int(no_review[0])
116 results.append([month, result])
117 return results
118
119
120 def total_tbr(cc): # pragma: no cover
121 """Counts the number of commits with a TBR
122
123 Args:
124 cc(cursor)
125
126 Return:
127 results(list): a list of lists
128 """
129 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
130 FROM git_commit""")
131 months = cc.fetchall()
132 results = []
133 for month in months:
134 month = month[0]
135 cc.execute("""SELECT COUNT(DISTINCT git_commit_hash)
136 FROM commit_people
137 INNER JOIN git_commit
138 ON commit_people.git_commit_hash = git_commit.hash
139 WHERE commit_people.type = 'tbr'
140 AND YEAR(git_commit.timestamp) = %s
141 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
142 result = cc.fetchone()
143 results.append([month, int(result[0])])
144 return results
145
146
147 def tbr_no_lgtm(cc): # pragma: no cover
148 """Counts the number of commits with a TBR that have not been lgtm'ed
149
150 Args:
151 cc(cursor)
152
153 Return:
154 results(list): a list of lists
155 """
156 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
157 FROM git_commit""")
158 months = cc.fetchall()
159 results = []
160 for month in months:
161 month = month[0]
162 cc.execute("""SELECT COUNT(*)
163 FROM review
164 INNER JOIN git_commit ON review.review_url = git_commit.review_url
165 INNER JOIN commit_people
166 ON commit_people.git_commit_hash = git_commit.hash
167 LEFT JOIN (
168 SELECT review_url, COUNT(*) AS c FROM review_people
169 WHERE type = 'lgtm' GROUP BY review_url) lgtm_count
170 ON review.review_url = lgtm_count.review_url
171 WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL
172 AND commit_people.type = 'tbr'
173 AND YEAR(git_commit.timestamp) = %s
174 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
175 result = cc.fetchone()
176 results.append([month, int(result[0])])
177 return results
178
179
180 def blank_tbr(cc): # pragma: no cover
181 """Counts the number of occurences of TBR= with no reviewer listed
182
183 Args:
184 cc(cursor)
185
186 Return:
187 results(list): a list of lists
188 """
189 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
190 FROM git_commit""")
191 months = cc.fetchall()
192 results = []
193 for month in months:
194 month = month[0]
195 cc.execute("""SELECT COUNT(*)
196 FROM commit_people
197 INNER JOIN git_commit
198 ON commit_people.git_commit_hash = git_commit.hash
199 WHERE commit_people.people_email_address = 'NOBODY'
200 AND YEAR(git_commit.timestamp) = %s
201 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
202 result = cc.fetchone()
203 results.append([month, int(result[0])])
204 return results
205
206
207 def no_review_url(cc): # pragma: no cover
208 """Counts the number of commits with no review url
209
210 Args:
211 cc(cursor)
212
213 Return:
214 results(list): a list of lists
215 """
216 cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m')
217 FROM git_commit""")
218 months = cc.fetchall()
219 results = []
220 for month in months:
221 month = month[0]
222 cc.execute("""SELECT COUNT(*)
223 FROM git_commit
224 WHERE review_url IS NULL
225 AND YEAR(git_commit.timestamp) = %s
226 AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:]))
227 result = cc.fetchone()
228 results.append([month, int(result[0])])
229 return results
230
231
232 # functions that return totaled stats for a set period back in time
233 def totaled_total_commits(cc, sql_time_specification): # pragma: no cover
234 """Counts all the git commits in a given timeframe
235
236 Args:
237 cc(cursor)
238 sql_time_specification(str): a sql command to limit the dates of the
239 returned results
240
241 Return:
242 result(int): a count of all the commits
243 """
244 cc.execute("""SELECT COUNT(*)
245 FROM git_commit
246 WHERE %s""" % sql_time_specification)
247 result = cc.fetchone()
248 return int(result[0])
249
250
251 def totaled_total_suspicious(cc, sql_time_specification): # pragma: no cover
252 """Counts the number of commits with no review url or TBRed with no lgtm
253 in a given timeframe
254
255 Args:
256 cc(cursor)
257 sql_time_specification(str): a sql command to limit the dates of the
258 returned results
259
260 Return:
261 result(int): a count of all the suspicious commits
262 """
263 cc.execute("""SELECT COUNT(*)
264 FROM review
265 INNER JOIN git_commit
266 ON review.review_url = git_commit.review_url
267 INNER JOIN commit_people
268 ON commit_people.git_commit_hash = git_commit.hash
269 LEFT JOIN (
270 SELECT review_url, COUNT(*) AS c
271 FROM review_people
272 WHERE type = 'lgtm'
273 GROUP BY review_url) lgtm_count
274 ON review.review_url = lgtm_count.review_url
275 WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL
276 AND commit_people.type = 'tbr' AND %s""" % sql_time_specification)
277 no_lgtm = cc.fetchone()
278 cc.execute("""SELECT COUNT(*)
279 FROM git_commit
280 WHERE review_url IS NULL AND %s""" % sql_time_specification)
281 no_review = cc.fetchone()
282 result = int(no_lgtm[0]) + int(no_review[0])
283 return result
284
285
286 def totaled_total_tbr(cc, sql_time_specification): # pragma: no cover
287 """Counts the total number of commits with a TBR in a given timeframe
288
289 Args:
290 cc(cursor)
291 sql_time_specification(str): a sql command to limit the dates of the
292 returned results
293
294 Return:
295 result(int): a count of all commits with a TBR
296 """
297 cc.execute("""SELECT COUNT(DISTINCT git_commit_hash)
298 FROM commit_people
299 INNER JOIN git_commit
300 ON commit_people.git_commit_hash = git_commit.hash
301 WHERE commit_people.type = 'tbr' AND %s""" % sql_time_specification)
302 result = cc.fetchone()
303 return int(result[0])
304
305
306 def totaled_tbr_no_lgtm(cc, sql_time_specification): # pragma: no cover
307 """Counts the number of commits with a TBR that have not been lgtm'ed
308 in a given timeframe
309
310 Args:
311 cc(cursor)
312 sql_time_specification(str): a sql command to limit the dates of the
313 returned results
314
315 Return:
316 result(int): a count of all commits with a TBR and no lgtm
317 """
318 cc.execute("""SELECT COUNT(*)
319 FROM review
320 INNER JOIN git_commit
321 ON review.review_url = git_commit.review_url
322 INNER JOIN commit_people
323 ON commit_people.git_commit_hash = git_commit.hash
324 LEFT JOIN (
325 SELECT review_url, COUNT(*) AS c
326 FROM review_people
327 WHERE type = 'lgtm'
328 GROUP BY review_url) lgtm_count
329 ON review.review_url = lgtm_count.review_url
330 WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL
331 AND commit_people.type = 'tbr' AND %s""" % sql_time_specification)
332 result = cc.fetchone()
333 return int(result[0])
334
335
336 def totaled_blank_tbr(cc, sql_time_specification): # pragma: no cover
337 """Counts the number of occurences of TBR= with no reviewer listed in a
338 given timeframe
339
340 Args:
341 cc(cursor)
342 sql_time_specification(str): a sql command to limit the dates of the
343 returned results
344
345 Return:
346 result(int): a count of all blank TBRs (TBR=)
347 """
348 cc.execute("""SELECT COUNT(*)
349 FROM commit_people
350 INNER JOIN git_commit
351 ON commit_people.git_commit_hash = git_commit.hash
352 WHERE commit_people.people_email_address = 'NOBODY'
353 AND %s""" % sql_time_specification)
354 result = cc.fetchone()
355 return int(result[0])
356
357
358 def totaled_no_review_url(cc, sql_time_specification): # pragma: no cover
359 """Counts the number of commits with no review url in a given timeframe
360
361 Args:
362 cc(cursor)
363 sql_time_specification(str): a sql command to limit the dates of the
364 returned results
365
366 Return:
367 result(int): a count of all commits with no review_url
368 """
369 cc.execute("""SELECT COUNT(*)
370 FROM git_commit
371 WHERE review_url IS NULL AND %s""" % sql_time_specification)
372 result = cc.fetchone()
373 return int(result[0])
374
375
376 # TODO(keelerh): change the SQL query to avoid using temporary tables
377 def tbr_to_total_people_ratio(cc,
378 sql_time_specification=''): # pragma: no cover
379 """Calculate TBR stats by author for the Antibody leaderboard and writes
380 them to a json file
381
382 Args:
383 cc(cursor)
384 sql_time_specification(str): a sql command to limit the dates of the
385 returned results
386
387 Return:
388 people_tbr_data(list): a list of dictionaries containing author email,
389 ratio of TBRed commits to total commits, number of
390 TBRed commits, total number of commits, and overall
391 rank, reverse sorted by ratio
392 """
393 cc.execute("""CREATE TEMPORARY TABLE addr_tbr_hash (email VARCHAR(200), tbr
394 INT, hash VARCHAR(40), `timestamp` TIMESTAMP)""")
395 try:
396 cc.execute("""INSERT INTO addr_tbr_hash (email, tbr, hash)
397 SELECT commit_people.people_email_address, tbr_count.c,
398 commit_people.git_commit_hash FROM commit_people
399 LEFT JOIN (
400 SELECT git_commit_hash, COUNT(*)
401 AS c FROM commit_people
402 INNER JOIN git_commit
403 ON git_commit.hash = commit_people.git_commit_hash
404 WHERE commit_people.type='tbr' %s
405 GROUP BY git_commit_hash) tbr_count
406 ON commit_people.git_commit_hash = tbr_count.git_commit_hash
407 WHERE commit_people.type='author'""" % sql_time_specification)
408 cc.execute("""SELECT email,
409 SUM(CASE WHEN tbr<>0 THEN 1 ELSE 0 END) num_tbrd, COUNT(*)
410 AS num_total
411 FROM addr_tbr_hash
412 GROUP BY email""")
413 people_tbr_data = cc.fetchall()
414 people_tbr_data = [[data_item[0],
415 round(float(data_item[1]) / data_item[2], 3),
416 int(data_item[1]), int(data_item[2])] for data_item in
417 people_tbr_data]
418 sorted_people_tbr_data = sorted(people_tbr_data, key=lambda x: x[1],
419 reverse=True)
420 top_100 = sorted_people_tbr_data[:100]
421 ordered_people_tbr_data = []
422 last_ratio = None
423 cur_rank = 0
424 for i in range(len(top_100)):
425 ratio = sorted_people_tbr_data[i][1]
426 if ratio > 0:
427 temp = {}
428 temp['email'] = sorted_people_tbr_data[i][0]
429 temp['ratio'] = ratio
430 temp['suspicious'] = sorted_people_tbr_data[i][2]
431 temp['total'] = sorted_people_tbr_data[i][3]
432 if last_ratio != ratio:
433 cur_rank += 1
434 temp['rank'] = cur_rank
435 last_ratio = ratio
436 else:
437 break
438 ordered_people_tbr_data.append(temp)
439 finally:
440 cc.execute("""DROP TABLE addr_tbr_hash""")
441 return ordered_people_tbr_data
442
443
444 def compute_monthly_breakdown_stats(cc): # pragma: no cover
445 """Computes stats broken down by month
446
447 Args:
448 cc(cursor)
449
450 Returns:
451 monthly_breakdown_stats(dict): contains stats calculated per month
452 including: timeframe, total_commits,
453 total_suspicious, suspicious_to_total_ratio,
454 total_tbr, tbr_no_lgtm,
455 no_tbr_lgtm_to_total_ratio, blank_tbr,
456 blank_tbr_to_total_ratio, no_review_url,
457 no_review_url_to_total_ratio
458 """
459 monthly_breakdown_stats = {}
460 monthly_breakdown_stats['timeframe'] = 'monthly_breakdown'
461 tot_commits = total_commits(cc)
462 monthly_breakdown_stats['total_commits'] = tot_commits
463 tot_suspicious = total_suspicious(cc)
464 monthly_breakdown_stats['total_suspicious'] = tot_suspicious
465 monthly_breakdown_stats['suspicious_to_total_ratio'] = ratio_calculator(
466 tot_suspicious, tot_commits)
467 tot_tbr = total_tbr(cc)
468 monthly_breakdown_stats['total_tbr'] = tot_tbr
469 tot_tbr_no_lgtm = tbr_no_lgtm(cc)
470 monthly_breakdown_stats['tbr_no_lgtm'] = tot_tbr_no_lgtm
471 monthly_breakdown_stats['no_tbr_lgtm_to_total_ratio'] = ratio_calculator(
472 tot_tbr_no_lgtm, tot_tbr)
473 tot_blank_tbr = blank_tbr(cc)
474 monthly_breakdown_stats['blank_tbrs'] = tot_blank_tbr
475 monthly_breakdown_stats['blank_tbr_to_total_ratio'] = ratio_calculator(
476 tot_blank_tbr, tot_tbr)
477 tot_no_review_url = no_review_url(cc)
478 monthly_breakdown_stats['no_review_url'] = tot_no_review_url
479 monthly_breakdown_stats['no_review_url_to_total_ratio'] = ratio_calculator(
480 tot_no_review_url, tot_commits)
481 return monthly_breakdown_stats
482
483
484 def compute_stats_by_time(cc): # pragma: no cover
485 """Computes the stats for the past 7 days, past 30 days, and all time
486
487 Args:
488 cc(cursor)
489
490 Returns:
491 output(list): three dictionaries containing stats for the past 7 days, 30
492 days, and all time respectively, each including timeframe,
493 suspicious_to_total_ratio, total_commits, tbr_no_lgtm,
494 no_review_url, blank_tbr
495 """
496 stats_7_days = {'timeframe': '7_days'}
497 stats_30_days = {'timeframe': '30_days'}
498 stats_all_time = {'timeframe': 'all_time'}
499 for d in [stats_7_days, stats_30_days, stats_all_time]:
500 if d == stats_7_days:
501 sql_insert = 'DATEDIFF(NOW(), git_commit.timestamp) <= 7'
502 elif d == stats_30_days:
503 sql_insert = 'DATEDIFF(NOW(), git_commit.timestamp) <= 30'
504 elif d == stats_all_time:
505 sql_insert = 'DATEDIFF(git_commit.timestamp, NOW()) < 0'
506 tot_commits = totaled_total_commits(cc, sql_insert)
507 tot_suspicious = totaled_total_suspicious(cc, sql_insert)
508 d['suspicious_to_total_ratio'] = totaled_ratio_calculator(tot_suspicious,
509 tot_commits)
510 d['total_commits'] = tot_commits
511 d['tbr_no_lgtm'] = totaled_tbr_no_lgtm(cc, sql_insert)
512 d['no_review_url'] = totaled_no_review_url(cc, sql_insert)
513 d['blank_tbr'] = totaled_blank_tbr(cc, sql_insert)
514 output = [stats_7_days, stats_30_days, stats_all_time]
515 return output
516
517
518 def all_monthly_stats(cc, filename): # pragma: no cover
519 """Write stats calculated over different time segments as json to file
520
521 Args:
522 cc(cursor)
523 filename(str): the json file to write to
524 """
525 output = {}
526 stats_monthly_breakdown = compute_monthly_breakdown_stats(cc)
527 stats_by_time = compute_stats_by_time(cc)
528 stats_by_time.append(stats_monthly_breakdown)
529 for d in stats_by_time:
530 label = d['timeframe']
531 output[label] = d
532 with open(filename, 'w') as f:
533 json.dump(output, f)
534
535
536 def all_time_leaderboard(cc, filename): # pragma: no cover
537 """Writes stats for all-time Antibody leaderboard top 100 to a json file
538
539 Args:
540 cc(cursor)
541 filename(str): the json file to write to
542 """
543 output = tbr_to_total_people_ratio(cc)
544 with open(filename, 'w') as f:
545 json.dump(output, f)
546
547
548 def past_month_leaderboard(cc, filename): # pragma: no cover
549 """Writes stats for the past month Antibody leaderboard to a json file
550
551 Args:
552 cc(cursor)
553 filename(str): the json file to write to
554 """
555 sql_time_specification = 'AND DATEDIFF(NOW(), git_commit.timestamp) <= 30'
556 output = tbr_to_total_people_ratio(cc, sql_time_specification)
557 with open(filename, 'w') as f:
558 json.dump(output, f)
OLDNEW
« no previous file with comments | « infra/tools/antibody/code_review_parse.py ('k') | infra/tools/antibody/git_commit_parser.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698