1 # Copyright (c) 2014 The Chromium Authors. All rights reserved. | |
2 # Use of this source code is governed by a BSD-style license that can be | |
3 # found in the LICENSE file. | |
4 | |
5 """Collect stats regularly via app engine cron. | |
6 """ | |
7 | |
8 import calendar | |
9 import datetime | |
10 import json | |
11 import logging | |
12 | |
13 import numpy | |
14 import webapp2 | |
15 | |
16 from google.appengine.api import urlfetch | |
17 from google.appengine.ext import ndb | |
18 | |
19 from appengine_module.trooper_o_matic import models | |
20 from appengine_module.trooper_o_matic import trees | |
21 | |
22 | |
def datetime_now():  # pragma: no cover
  """Return the current UTC time.

  Exists as a module-level wrapper so unit tests can mock "now" without
  patching datetime.datetime itself.
  """
  utc_now = datetime.datetime.utcnow()
  return utc_now
26 | |
27 | |
def date_from_str(string, base_format):  # pragma: no cover
  """Parses a datetime string that may or may not carry microseconds.

  First attempts base_format extended with a trailing '.%f' microsecond
  field, then falls back to the bare base_format if that fails.
  """
  fractional_format = base_format + '.%f'
  try:
    return datetime.datetime.strptime(string, fractional_format)
  except ValueError:
    return datetime.datetime.strptime(string, base_format)
35 | |
36 | |
class CheckCQHandler(webapp2.RequestHandler):  # pragma: no cover
  """Collect commit queue length and run times.

  Polls the chromium-cq-status and commit-queue apps and records, per
  project, the current CQ queue length plus percentile stats for
  single-attempt run times, cumulative time in the queue, and wall-clock
  time from first start to last stop of each patchset.
  """

  # Lists patch_stop events since a unix timestamp; may page via a cursor.
  patch_stop_list = ('http://chromium-cq-status.appspot.com/query/action='
                     'patch_stop/?begin=%d')

  # Lists the issues currently pending in the CQ for a project.
  pending_api_url = 'https://chromium-commit-queue.appspot.com/api/%s/pending'

  # Lists all recorded CQ events for a single (issue, patchset) pair.
  patchset_details = ('https://chromium-cq-status.appspot.com/query/'
                      'issue=%d/patchset=%d/')

  @staticmethod
  def update_stat_for_times(stat, times):
    """Fills min/max/mean and percentile fields of stat.

    Args:
      stat: a models stat entity with min/max/mean/pNN properties.
      times: a non-empty, sorted list of durations (minutes).
    """
    stat.min = times[0]
    stat.max = times[-1]
    stat.mean = numpy.mean(times)
    stat.p10 = numpy.percentile(times, 10)
    stat.p25 = numpy.percentile(times, 25)
    stat.p50 = numpy.percentile(times, 50)
    stat.p75 = numpy.percentile(times, 75)
    stat.p90 = numpy.percentile(times, 90)
    stat.p95 = numpy.percentile(times, 95)
    stat.p99 = numpy.percentile(times, 99)

  def get(self):
    # We only care about the last hour.
    cutoff = datetime_now() - datetime.timedelta(hours=1)
    base_url = self.patch_stop_list % calendar.timegm(cutoff.timetuple())

    # CQ API has a limit of results it will return, and if there are more
    # results it will return a cursor. So loop through results until
    # there is no cursor. Build each request URL from the base URL so
    # stale cursors from earlier pages don't accumulate on the query
    # string (the old code appended every cursor to the previous URL).
    cursor = None
    more_results = True
    patchsets = {}
    while more_results:
      url = base_url
      if cursor:
        url += '&cursor=' + cursor
      result = urlfetch.fetch(url=url, deadline=60)
      content = json.loads(result.content)
      for entry in content['results']:
        fields = entry['fields']
        patchsets.setdefault(fields['project'], set()).add(
            (fields['issue'], fields['patchset']))
      cursor = content.get('cursor')
      more_results = content.get('more')

    # Only track the chromium and blink projects.
    projects = set(['chromium', 'blink'])
    for project in projects:
      # Ensure there is an ancestor for all the stats for this project.
      project_model = models.Project.get_or_insert(project)
      project_model.put()

      # CQ exposes an API for its length.
      result = urlfetch.fetch(url=self.pending_api_url % project, deadline=60)
      pending = set(json.loads(result.content)['results'])
      num_pending = len(pending)
      stat = models.CqStat(parent=project_model.key, length=num_pending)
      patch_in_queue_stat = models.CqTimeInQueueForPatchStat(
          parent=project_model.key, length=num_pending)
      patch_total_time_stat = models.CqTotalTimeForPatchStat(
          parent=project_model.key, length=num_pending)

      single_run_times = []
      in_queue_times = []
      total_times = []

      # A project with no patch_stop events in the window has no entry in
      # patchsets; use an empty set rather than raising KeyError.
      for issue, patchset in patchsets.get(project, set()):
        url = self.patchset_details % (issue, patchset)
        result = urlfetch.fetch(url=url, deadline=60)
        content = json.loads(result.content)
        # Get a list of all starts/stops for this patch, oldest first.
        actions = [entry['fields'] for entry in content['results']
                   if entry['fields'].get('action') in ('patch_start',
                                                        'patch_stop')]
        actions.sort(key=lambda k: k['timestamp'])

        start_time = None   # first patch_start seen for this patchset
        last_start = None   # most recent unmatched patch_start
        end_time = None     # most recent patch_stop
        run_times = []
        for action in actions:
          if action['action'] == 'patch_start':
            if not start_time:
              start_time = action['timestamp']
            last_start = action['timestamp']
          else:
            if last_start:
              # Timestamps are in seconds; record run times in minutes.
              run_times.append((action['timestamp'] - last_start) / 60)
              last_start = None
            end_time = action['timestamp']

        if run_times:
          single_run_times += run_times
          in_queue_times.append(sum(run_times))
          total_times.append((end_time - start_time) / 60)

      if single_run_times:
        self.update_stat_for_times(stat, sorted(single_run_times))
        self.update_stat_for_times(patch_in_queue_stat, sorted(in_queue_times))
        self.update_stat_for_times(patch_total_time_stat, sorted(total_times))

      stat.put()
      patch_in_queue_stat.put()
      patch_total_time_stat.put()
144 | |
145 | |
class CheckTreeHandler(webapp2.RequestHandler):  # pragma: no cover
  """Checks the given tree for build times higher than the SLO specifies."""

  # %s placeholders: master name, hour timestamp string.
  stats_api_url = ('https://chrome-infra-stats.appspot.com/_ah/api/stats/v1/'
                   'steps/%s/overall__build__result__/%s')

  last_hour_format = '%Y-%m-%dT%H:%MZ'
  generated_format = '%Y-%m-%dT%H:%M:%S'

  def get(self, tree):
    """For each master in the tree, find builds that don't meet our SLO."""
    masters = trees.GetMastersForTree(tree)
    if not masters:
      logging.error('Invalid tree %s', tree)
      return
    now = datetime_now()
    tree_model = models.Tree.get_or_insert(tree)
    tree_model.put()
    stat = models.BuildTimeStat(parent=tree_model.key,
                                timestamp=now,
                                num_builds=0,
                                num_over_median_slo=0,
                                num_over_max_slo=0)
    # The chrome-infra-stats API lists builds that have STARTED in the last
    # hour. We want to list builds that have ENDED in the last hour, so we
    # need to go back through the last 24 hours to make sure we don't miss
    # any; builds generated earlier than an hour ago are filtered below.
    # TODO(sullivan): When an "ended in last hour" API is available, switch
    # to that.
    hour_strs = [(now - datetime.timedelta(hours=h)).strftime(
        self.last_hour_format) for h in range(24)]
    one_hour = datetime.timedelta(hours=1)
    for master in masters:
      records = []
      for hour_str in hour_strs:
        url = self.stats_api_url % (master, hour_str)
        logging.info(url)
        result = urlfetch.fetch(url=url, deadline=60)
        records += json.loads(result.content).get('step_records', [])
      for record in records:
        generated_time = date_from_str(record['generated'],
                                       self.generated_format)
        # Only count builds that actually finished within the last hour.
        if now - generated_time > one_hour:
          continue
        stat.num_builds += 1
        builder = record['builder']
        median_slo = models.SLO_BUILDTIME_PER_BOT_MEDIAN.get(
            master, {}).get(builder, models.SLO_BUILDTIME_MEDIAN)
        max_slo = models.SLO_BUILDTIME_PER_BOT_MAX.get(
            master, {}).get(builder, models.SLO_BUILDTIME_MAX)
        # Keep max >= median so the nested check below stays consistent.
        max_slo = max(max_slo, median_slo)

        if record['step_time'] > median_slo:
          stat.num_over_median_slo += 1
          offender = models.BuildSLOOffender(
              tree=tree, master=master,
              builder=builder,
              buildnumber=int(record['buildnumber']),
              buildtime=float(record['step_time']),
              result=int(record['result']),
              revision=record['revision'],
              slo_median_buildtime=median_slo,
              slo_max_buildtime=max_slo)
          stat.slo_offenders.append(offender)
          # Anything over max is necessarily also over median (clamped
          # above), so this nested check catches every max violation.
          if record['step_time'] > max_slo:
            stat.num_over_max_slo += 1
    ndb.put_multi(stat.slo_offenders)
    stat.put()
212 | |
213 | |
class CheckTreeStatusHandler(webapp2.RequestHandler):  # pragma: no cover
  """Records what percentage of a window the project's tree was open."""

  # %s placeholders: project name, unix timestamp upper bound for history.
  status_url = ('https://%s-status.appspot.com/allstatus?format=json&'
                'endTime=%s&limit=1000')

  @staticmethod
  def tree_is_open_for(entry):
    """Returns whether a status entry represents an open tree."""
    # Count scheduled maintenance as tree open, we only want to alert on
    # unexpected closures.
    return (entry['can_commit_freely'] or
            entry['message'].startswith('Tree is closed for maintenance'))

  @staticmethod
  def date_for(entry):
    """Parses the entry's timestamp, with or without microseconds."""
    # Use the shared helper so timestamps that happen to omit the
    # fractional-second part don't raise ValueError (the old hard-coded
    # '.%f' format required microseconds to always be present).
    return date_from_str(entry['date'], '%Y-%m-%d %H:%M:%S')

  def fetch_entries(self, project, days):
    """Fetches status entries covering the last `days` days, oldest first."""
    # Pad the query with two previous days of data, in case the tree has
    # been in the same state for the entire time period — we need at least
    # one entry from before the cutoff to determine the initial state.
    data_start = datetime_now() - datetime.timedelta(days=days + 2)
    url = self.status_url % (project, calendar.timegm(data_start.timetuple()))
    result = urlfetch.fetch(url)
    entries = json.loads(result.content)
    entries.sort(key=self.date_for)
    return entries

  def get_state_of_tree(self, entries, cutoff):
    """Returns whether the tree was open at the cutoff time."""
    # Walk the (oldest-first) entries up to the cutoff; the last one before
    # the cutoff wins. Default to open if no earlier entry exists.
    was_open = True
    for entry in entries:
      if self.date_for(entry) > cutoff:
        break
      was_open = self.tree_is_open_for(entry)
    return was_open

  def get(self, project, days):
    """Stores a TreeOpenStat for the tree's open time in the last N days."""
    days = int(days)
    now = datetime_now()
    cutoff = now - datetime.timedelta(days=days)

    entries = self.fetch_entries(project, days)
    was_open = self.get_state_of_tree(entries, cutoff)

    # Now look through the entries in the relevant days to find the tree
    # open times. Entries that don't flip the open/closed state are skipped.
    last_change = cutoff
    open_time = datetime.timedelta(seconds=0)
    closed_time = datetime.timedelta(seconds=0)
    for entry in entries:
      is_open = self.tree_is_open_for(entry)
      if self.date_for(entry) <= cutoff or is_open == was_open:
        continue
      current_time = self.date_for(entry)
      delta = current_time - last_change
      if was_open:
        open_time += delta
      else:
        closed_time += delta
      last_change = current_time
      was_open = is_open

    # Account for the span between the final state change and now.
    delta = now - last_change
    if was_open:
      open_time += delta
    else:
      closed_time += delta

    open_seconds = open_time.total_seconds()
    closed_seconds = closed_time.total_seconds()
    total_seconds = open_seconds + closed_seconds
    # Guard against days=0, where the window (and the total) is empty;
    # report an empty window as 100% open rather than dividing by zero.
    percent_open = (
        (open_seconds / total_seconds) * 100 if total_seconds else 100.0)
    project_model = models.Project.get_or_insert(project)
    project_model.put()
    stat = models.TreeOpenStat(
        parent=project_model.key,
        num_days=days,
        percent_open=percent_open)
    stat.put()