Index: infra/tools/antibody/compute_stats.py |
diff --git a/infra/tools/antibody/compute_stats.py b/infra/tools/antibody/compute_stats.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..7490b6dcfa9518395967bbb8382df92fd02f993f |
--- /dev/null |
+++ b/infra/tools/antibody/compute_stats.py |
@@ -0,0 +1,558 @@ |
+# Copyright 2015 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+import json |
+ |
+ |
+def ratio_calculator(numerator, denominator): |
+ """Computes the ratio of the counts in two lists |
+ |
+ Args: |
+ numerator(list): list of lists with a year-month string as the first index |
+ and a count as the second index |
+ denominator(list): list of lists with a year-month string as the first index |
+ and a count as the second index |
+ |
+ Return: |
+ ratios(list): a list of lists with ratios rounded to three decimal places |
+ e.g. [['2015-01', .251], ['2014-10', .742]] |
+ """ |
+ ratios = [] |
+ for i in numerator: |
+ for j in denominator: |
+ if i[0] == j[0] and j[1] != 0: |
+ ratios.append([i[0], round(float(i[1]) / j[1], 3)]) |
+ break |
+ elif i[0] == j[0]: |
+ ratios.append([i[0], 0]) |
+ return ratios |
+ |
+ |
+def totaled_ratio_calculator(numerator, denominator): |
+ """Computes the ratio of the counts in two lists |
+ |
+ Args: |
+ numerator(int): a totaled count |
+ denominator(int): a totaled count |
+ |
+ Return: |
+ ratio(float): a ratio rounded to three decimal places |
+ """ |
+ if denominator != 0: |
+ ratio = round(float(numerator) / denominator, 3) |
+ else: |
+ raise ZeroDivisionError |
+ return ratio |
+ |
+ |
+# functions that return stats calculated as lists of lists by month |
+def total_commits(cc): # pragma: no cover |
+ """Counts all the git commits sorted by month and year |
+ |
+ Args: |
+ cc(cursor) |
+ |
+ Return: |
+ results(list): a list of lists e.g. [['2014-01', 20], ['2014-02', 45]] |
+ """ |
+ cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m') |
+ FROM git_commit""") |
+ months = cc.fetchall() |
+ results = [] |
+ for month in months: |
+ month = month[0] |
+ cc.execute("""SELECT COUNT(*) |
+ FROM git_commit |
+ WHERE YEAR(git_commit.timestamp) = %s |
+ AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:])) |
+ result = cc.fetchone() |
+ results.append([month, int(result[0])]) |
+ return results |
+ |
+ |
+# TODO(keelerh): group by month and year directly in the sql query instead of |
+# with a for loop |
+def total_suspicious(cc): # pragma: no cover |
+ """Counts the number of commits with no review url or TBRed with no lgtm |
+ sorted by month and year |
+ |
+ Args: |
+ cc(cursor) |
+ |
+ Return: |
+ results(list): a list of lists |
+ """ |
+ cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m') |
+ FROM git_commit""") |
+ months = cc.fetchall() |
+ results = [] |
+ for month in months: |
+ month = month[0] |
+ cc.execute("""SELECT COUNT(*) |
+ FROM review |
+ INNER JOIN git_commit |
+ ON review.review_url = git_commit.review_url |
+ INNER JOIN commit_people |
+ ON commit_people.git_commit_hash = git_commit.hash |
+ LEFT JOIN ( |
+ SELECT review_url, COUNT(*) AS c |
+ FROM review_people |
+ WHERE type = 'lgtm' |
+ GROUP BY review_url) lgtm_count |
+ ON review.review_url = lgtm_count.review_url |
+ WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL |
+ AND commit_people.type = 'tbr' |
+ AND YEAR(git_commit.timestamp) = %s |
+ AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:])) |
+ no_lgtm = cc.fetchone() |
+ cc.execute("""SELECT COUNT(*) |
+ FROM git_commit |
+ WHERE review_url IS NULL |
+ AND YEAR(git_commit.timestamp) = %s |
+ AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:])) |
+ no_review = cc.fetchone() |
+ result = int(no_lgtm[0]) + int(no_review[0]) |
+ results.append([month, result]) |
+ return results |
+ |
+ |
+def total_tbr(cc): # pragma: no cover |
+ """Counts the number of commits with a TBR |
+ |
+ Args: |
+ cc(cursor) |
+ |
+ Return: |
+ results(list): a list of lists |
+ """ |
+ cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m') |
+ FROM git_commit""") |
+ months = cc.fetchall() |
+ results = [] |
+ for month in months: |
+ month = month[0] |
+ cc.execute("""SELECT COUNT(DISTINCT git_commit_hash) |
+ FROM commit_people |
+ INNER JOIN git_commit |
+ ON commit_people.git_commit_hash = git_commit.hash |
+ WHERE commit_people.type = 'tbr' |
+ AND YEAR(git_commit.timestamp) = %s |
+ AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:])) |
+ result = cc.fetchone() |
+ results.append([month, int(result[0])]) |
+ return results |
+ |
+ |
+def tbr_no_lgtm(cc): # pragma: no cover |
+ """Counts the number of commits with a TBR that have not been lgtm'ed |
+ |
+ Args: |
+ cc(cursor) |
+ |
+ Return: |
+ results(list): a list of lists |
+ """ |
+ cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m') |
+ FROM git_commit""") |
+ months = cc.fetchall() |
+ results = [] |
+ for month in months: |
+ month = month[0] |
+ cc.execute("""SELECT COUNT(*) |
+ FROM review |
+ INNER JOIN git_commit ON review.review_url = git_commit.review_url |
+ INNER JOIN commit_people |
+ ON commit_people.git_commit_hash = git_commit.hash |
+ LEFT JOIN ( |
+ SELECT review_url, COUNT(*) AS c FROM review_people |
+ WHERE type = 'lgtm' GROUP BY review_url) lgtm_count |
+ ON review.review_url = lgtm_count.review_url |
+ WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL |
+ AND commit_people.type = 'tbr' |
+ AND YEAR(git_commit.timestamp) = %s |
+ AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:])) |
+ result = cc.fetchone() |
+ results.append([month, int(result[0])]) |
+ return results |
+ |
+ |
+def blank_tbr(cc): # pragma: no cover |
+ """Counts the number of occurences of TBR= with no reviewer listed |
+ |
+ Args: |
+ cc(cursor) |
+ |
+ Return: |
+ results(list): a list of lists |
+ """ |
+ cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m') |
+ FROM git_commit""") |
+ months = cc.fetchall() |
+ results = [] |
+ for month in months: |
+ month = month[0] |
+ cc.execute("""SELECT COUNT(*) |
+ FROM commit_people |
+ INNER JOIN git_commit |
+ ON commit_people.git_commit_hash = git_commit.hash |
+ WHERE commit_people.people_email_address = 'NOBODY' |
+ AND YEAR(git_commit.timestamp) = %s |
+ AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:])) |
+ result = cc.fetchone() |
+ results.append([month, int(result[0])]) |
+ return results |
+ |
+ |
+def no_review_url(cc): # pragma: no cover |
+ """Counts the number of commits with no review url |
+ |
+ Args: |
+ cc(cursor) |
+ |
+ Return: |
+ results(list): a list of lists |
+ """ |
+ cc.execute("""SELECT DISTINCT DATE_FORMAT(git_commit.timestamp, '%Y-%m') |
+ FROM git_commit""") |
+ months = cc.fetchall() |
+ results = [] |
+ for month in months: |
+ month = month[0] |
+ cc.execute("""SELECT COUNT(*) |
+ FROM git_commit |
+ WHERE review_url IS NULL |
+ AND YEAR(git_commit.timestamp) = %s |
+ AND MONTH(git_commit.timestamp) = %s""" % (month[:4], month[5:])) |
+ result = cc.fetchone() |
+ results.append([month, int(result[0])]) |
+ return results |
+ |
+ |
+# functions that return totaled stats for a set period back in time |
+def totaled_total_commits(cc, sql_time_specification): # pragma: no cover |
+ """Counts all the git commits in a given timeframe |
+ |
+ Args: |
+ cc(cursor) |
+ sql_time_specification(str): a sql command to limit the dates of the |
+ returned results |
+ |
+ Return: |
+ result(int): a count of all the commits |
+ """ |
+ cc.execute("""SELECT COUNT(*) |
+ FROM git_commit |
+ WHERE %s""" % sql_time_specification) |
+ result = cc.fetchone() |
+ return int(result[0]) |
+ |
+ |
+def totaled_total_suspicious(cc, sql_time_specification): # pragma: no cover |
+ """Counts the number of commits with no review url or TBRed with no lgtm |
+ in a given timeframe |
+ |
+ Args: |
+ cc(cursor) |
+ sql_time_specification(str): a sql command to limit the dates of the |
+ returned results |
+ |
+ Return: |
+ result(int): a count of all the suspicious commits |
+ """ |
+ cc.execute("""SELECT COUNT(*) |
+ FROM review |
+ INNER JOIN git_commit |
+ ON review.review_url = git_commit.review_url |
+ INNER JOIN commit_people |
+ ON commit_people.git_commit_hash = git_commit.hash |
+ LEFT JOIN ( |
+ SELECT review_url, COUNT(*) AS c |
+ FROM review_people |
+ WHERE type = 'lgtm' |
+ GROUP BY review_url) lgtm_count |
+ ON review.review_url = lgtm_count.review_url |
+ WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL |
+ AND commit_people.type = 'tbr' AND %s""" % sql_time_specification) |
+ no_lgtm = cc.fetchone() |
+ cc.execute("""SELECT COUNT(*) |
+ FROM git_commit |
+ WHERE review_url IS NULL AND %s""" % sql_time_specification) |
+ no_review = cc.fetchone() |
+ result = int(no_lgtm[0]) + int(no_review[0]) |
+ return result |
+ |
+ |
+def totaled_total_tbr(cc, sql_time_specification): # pragma: no cover |
+ """Counts the total number of commits with a TBR in a given timeframe |
+ |
+ Args: |
+ cc(cursor) |
+ sql_time_specification(str): a sql command to limit the dates of the |
+ returned results |
+ |
+ Return: |
+ result(int): a count of all commits with a TBR |
+ """ |
+ cc.execute("""SELECT COUNT(DISTINCT git_commit_hash) |
+ FROM commit_people |
+ INNER JOIN git_commit |
+ ON commit_people.git_commit_hash = git_commit.hash |
+ WHERE commit_people.type = 'tbr' AND %s""" % sql_time_specification) |
+ result = cc.fetchone() |
+ return int(result[0]) |
+ |
+ |
+def totaled_tbr_no_lgtm(cc, sql_time_specification): # pragma: no cover |
+ """Counts the number of commits with a TBR that have not been lgtm'ed |
+ in a given timeframe |
+ |
+ Args: |
+ cc(cursor) |
+ sql_time_specification(str): a sql command to limit the dates of the |
+ returned results |
+ |
+ Return: |
+ result(int): a count of all commits with a TBR and no lgtm |
+ """ |
+ cc.execute("""SELECT COUNT(*) |
+ FROM review |
+ INNER JOIN git_commit |
+ ON review.review_url = git_commit.review_url |
+ INNER JOIN commit_people |
+ ON commit_people.git_commit_hash = git_commit.hash |
+ LEFT JOIN ( |
+ SELECT review_url, COUNT(*) AS c |
+ FROM review_people |
+ WHERE type = 'lgtm' |
+ GROUP BY review_url) lgtm_count |
+ ON review.review_url = lgtm_count.review_url |
+ WHERE lgtm_count.c = 0 OR lgtm_count.c IS NULL |
+ AND commit_people.type = 'tbr' AND %s""" % sql_time_specification) |
+ result = cc.fetchone() |
+ return int(result[0]) |
+ |
+ |
+def totaled_blank_tbr(cc, sql_time_specification): # pragma: no cover |
+ """Counts the number of occurences of TBR= with no reviewer listed in a |
+ given timeframe |
+ |
+ Args: |
+ cc(cursor) |
+ sql_time_specification(str): a sql command to limit the dates of the |
+ returned results |
+ |
+ Return: |
+ result(int): a count of all blank TBRs (TBR=) |
+ """ |
+ cc.execute("""SELECT COUNT(*) |
+ FROM commit_people |
+ INNER JOIN git_commit |
+ ON commit_people.git_commit_hash = git_commit.hash |
+ WHERE commit_people.people_email_address = 'NOBODY' |
+ AND %s""" % sql_time_specification) |
+ result = cc.fetchone() |
+ return int(result[0]) |
+ |
+ |
+def totaled_no_review_url(cc, sql_time_specification): # pragma: no cover |
+ """Counts the number of commits with no review url in a given timeframe |
+ |
+ Args: |
+ cc(cursor) |
+ sql_time_specification(str): a sql command to limit the dates of the |
+ returned results |
+ |
+ Return: |
+ result(int): a count of all commits with no review_url |
+ """ |
+ cc.execute("""SELECT COUNT(*) |
+ FROM git_commit |
+ WHERE review_url IS NULL AND %s""" % sql_time_specification) |
+ result = cc.fetchone() |
+ return int(result[0]) |
+ |
+ |
+# TODO(keelerh): change the SQL query to avoid using temporary tables |
+def tbr_to_total_people_ratio(cc, |
+ sql_time_specification=''): # pragma: no cover |
+ """Calculate TBR stats by author for the Antibody leaderboard and writes |
+ them to a json file |
+ |
+ Args: |
+ cc(cursor) |
+ sql_time_specification(str): a sql command to limit the dates of the |
+ returned results |
+ |
+ Return: |
+ people_tbr_data(list): a list of dictionaries containing author email, |
+ ratio of TBRed commits to total commits, number of |
+ TBRed commits, total number of commits, and overall |
+ rank, reverse sorted by ratio |
+ """ |
+ cc.execute("""CREATE TEMPORARY TABLE addr_tbr_hash (email VARCHAR(200), tbr |
+ INT, hash VARCHAR(40), `timestamp` TIMESTAMP)""") |
+ try: |
+ cc.execute("""INSERT INTO addr_tbr_hash (email, tbr, hash) |
+ SELECT commit_people.people_email_address, tbr_count.c, |
+ commit_people.git_commit_hash FROM commit_people |
+ LEFT JOIN ( |
+ SELECT git_commit_hash, COUNT(*) |
+ AS c FROM commit_people |
+ INNER JOIN git_commit |
+ ON git_commit.hash = commit_people.git_commit_hash |
+ WHERE commit_people.type='tbr' %s |
+ GROUP BY git_commit_hash) tbr_count |
+ ON commit_people.git_commit_hash = tbr_count.git_commit_hash |
+ WHERE commit_people.type='author'""" % sql_time_specification) |
+ cc.execute("""SELECT email, |
+ SUM(CASE WHEN tbr<>0 THEN 1 ELSE 0 END) num_tbrd, COUNT(*) |
+ AS num_total |
+ FROM addr_tbr_hash |
+ GROUP BY email""") |
+ people_tbr_data = cc.fetchall() |
+ people_tbr_data = [[data_item[0], |
+ round(float(data_item[1]) / data_item[2], 3), |
+ int(data_item[1]), int(data_item[2])] for data_item in |
+ people_tbr_data] |
+ sorted_people_tbr_data = sorted(people_tbr_data, key=lambda x: x[1], |
+ reverse=True) |
+ top_100 = sorted_people_tbr_data[:100] |
+ ordered_people_tbr_data = [] |
+ last_ratio = None |
+ cur_rank = 0 |
+ for i in range(len(top_100)): |
+ ratio = sorted_people_tbr_data[i][1] |
+ if ratio > 0: |
+ temp = {} |
+ temp['email'] = sorted_people_tbr_data[i][0] |
+ temp['ratio'] = ratio |
+ temp['suspicious'] = sorted_people_tbr_data[i][2] |
+ temp['total'] = sorted_people_tbr_data[i][3] |
+ if last_ratio != ratio: |
+ cur_rank += 1 |
+ temp['rank'] = cur_rank |
+ last_ratio = ratio |
+ else: |
+ break |
+ ordered_people_tbr_data.append(temp) |
+ finally: |
+ cc.execute("""DROP TABLE addr_tbr_hash""") |
+ return ordered_people_tbr_data |
+ |
+ |
+def compute_monthly_breakdown_stats(cc): # pragma: no cover |
+ """Computes stats broken down by month |
+ |
+ Args: |
+ cc(cursor) |
+ |
+ Returns: |
+ monthly_breakdown_stats(dict): contains stats calculated per month |
+ including: timeframe, total_commits, |
+ total_suspicious, suspicious_to_total_ratio, |
+ total_tbr, tbr_no_lgtm, |
+ no_tbr_lgtm_to_total_ratio, blank_tbr, |
+ blank_tbr_to_total_ratio, no_review_url, |
+ no_review_url_to_total_ratio |
+ """ |
+ monthly_breakdown_stats = {} |
+ monthly_breakdown_stats['timeframe'] = 'monthly_breakdown' |
+ tot_commits = total_commits(cc) |
+ monthly_breakdown_stats['total_commits'] = tot_commits |
+ tot_suspicious = total_suspicious(cc) |
+ monthly_breakdown_stats['total_suspicious'] = tot_suspicious |
+ monthly_breakdown_stats['suspicious_to_total_ratio'] = ratio_calculator( |
+ tot_suspicious, tot_commits) |
+ tot_tbr = total_tbr(cc) |
+ monthly_breakdown_stats['total_tbr'] = tot_tbr |
+ tot_tbr_no_lgtm = tbr_no_lgtm(cc) |
+ monthly_breakdown_stats['tbr_no_lgtm'] = tot_tbr_no_lgtm |
+ monthly_breakdown_stats['no_tbr_lgtm_to_total_ratio'] = ratio_calculator( |
+ tot_tbr_no_lgtm, tot_tbr) |
+ tot_blank_tbr = blank_tbr(cc) |
+ monthly_breakdown_stats['blank_tbrs'] = tot_blank_tbr |
+ monthly_breakdown_stats['blank_tbr_to_total_ratio'] = ratio_calculator( |
+ tot_blank_tbr, tot_tbr) |
+ tot_no_review_url = no_review_url(cc) |
+ monthly_breakdown_stats['no_review_url'] = tot_no_review_url |
+ monthly_breakdown_stats['no_review_url_to_total_ratio'] = ratio_calculator( |
+ tot_no_review_url, tot_commits) |
+ return monthly_breakdown_stats |
+ |
+ |
+def compute_stats_by_time(cc): # pragma: no cover |
+ """Computes the stats for the past 7 days, past 30 days, and all time |
+ |
+ Args: |
+ cc(cursor) |
+ |
+ Returns: |
+ output(list): three dictionaries containing stats for the past 7 days, 30 |
+ days, and all time respectively, each including timeframe, |
+ suspicious_to_total_ratio, total_commits, tbr_no_lgtm, |
+ no_review_url, blank_tbr |
+ """ |
+ stats_7_days = {'timeframe': '7_days'} |
+ stats_30_days = {'timeframe': '30_days'} |
+ stats_all_time = {'timeframe': 'all_time'} |
+ for d in [stats_7_days, stats_30_days, stats_all_time]: |
+ if d == stats_7_days: |
+ sql_insert = 'DATEDIFF(NOW(), git_commit.timestamp) <= 7' |
+ elif d == stats_30_days: |
+ sql_insert = 'DATEDIFF(NOW(), git_commit.timestamp) <= 30' |
+ elif d == stats_all_time: |
+ sql_insert = 'DATEDIFF(git_commit.timestamp, NOW()) < 0' |
+ tot_commits = totaled_total_commits(cc, sql_insert) |
+ tot_suspicious = totaled_total_suspicious(cc, sql_insert) |
+ d['suspicious_to_total_ratio'] = totaled_ratio_calculator(tot_suspicious, |
+ tot_commits) |
+ d['total_commits'] = tot_commits |
+ d['tbr_no_lgtm'] = totaled_tbr_no_lgtm(cc, sql_insert) |
+ d['no_review_url'] = totaled_no_review_url(cc, sql_insert) |
+ d['blank_tbr'] = totaled_blank_tbr(cc, sql_insert) |
+ output = [stats_7_days, stats_30_days, stats_all_time] |
+ return output |
+ |
+ |
+def all_monthly_stats(cc, filename): # pragma: no cover |
+ """Write stats calculated over different time segments as json to file |
+ |
+ Args: |
+ cc(cursor) |
+ filename(str): the json file to write to |
+ """ |
+ output = {} |
+ stats_monthly_breakdown = compute_monthly_breakdown_stats(cc) |
+ stats_by_time = compute_stats_by_time(cc) |
+ stats_by_time.append(stats_monthly_breakdown) |
+ for d in stats_by_time: |
+ label = d['timeframe'] |
+ output[label] = d |
+ with open(filename, 'w') as f: |
+ json.dump(output, f) |
+ |
+ |
+def all_time_leaderboard(cc, filename): # pragma: no cover |
+ """Writes stats for all-time Antibody leaderboard top 100 to a json file |
+ |
+ Args: |
+ cc(cursor) |
+ filename(str): the json file to write to |
+ """ |
+ output = tbr_to_total_people_ratio(cc) |
+ with open(filename, 'w') as f: |
+ json.dump(output, f) |
+ |
+ |
+def past_month_leaderboard(cc, filename): # pragma: no cover |
+ """Writes stats for the past month Antibody leaderboard to a json file |
+ |
+ Args: |
+ cc(cursor) |
+ filename(str): the json file to write to |
+ """ |
+ sql_time_specification = 'AND DATEDIFF(NOW(), git_commit.timestamp) <= 30' |
+ output = tbr_to_total_people_ratio(cc, sql_time_specification) |
+ with open(filename, 'w') as f: |
+ json.dump(output, f) |