Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(380)

Side by Side Diff: appengine/cr-buildbucket/metrics.py

Issue 1532713002: buildbucket: add monitoring using ts_mon (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import logging 5 import logging
6
7 from google.appengine.api import app_identity
6 from google.appengine.ext import ndb 8 from google.appengine.ext import ndb
7 9
8 from components import metrics 10 from components import metrics
9 from components import utils 11 from components import utils
12 import gae_ts_mon
10 13
11 import config 14 import config
12 import model 15 import model
13 16
17 # TODO(nodir): remove Cloud Monitoring and refactor
18 # when gae_ts_mon is stabilized
19
14 LABEL_BUCKET = 'buildbucket/bucket' 20 LABEL_BUCKET = 'buildbucket/bucket'
15 COMMON_LABELS = { 21 COMMON_LABELS = {
16 LABEL_BUCKET: 'Bucket' 22 LABEL_BUCKET: 'Bucket'
17 } 23 }
18 METRIC_PENDING_BUILDS = metrics.Descriptor( 24 METRIC_PENDING_BUILDS = metrics.Descriptor(
19 name='buildbucket/builds/pending', 25 name='buildbucket/builds/pending',
20 description='Number of pending builds', 26 description='Number of pending builds',
21 labels=COMMON_LABELS, 27 labels=COMMON_LABELS,
22 ) 28 )
23 METRIC_RUNNING_BUILDS = metrics.Descriptor( 29 METRIC_RUNNING_BUILDS = metrics.Descriptor(
(...skipping 12 matching lines...) Expand all
36 METRIC_SCHEDULING_LATENCY = metrics.Descriptor( 42 METRIC_SCHEDULING_LATENCY = metrics.Descriptor(
37 name='buildbucket/builds/scheduling_latency', 43 name='buildbucket/builds/scheduling_latency',
38 description=( 44 description=(
39 'Average number of seconds for a scheduled build ' 45 'Average number of seconds for a scheduled build '
40 'to remain in SCHEDULED leased state' 46 'to remain in SCHEDULED leased state'
41 ), 47 ),
42 value_type='double', 48 value_type='double',
43 labels=COMMON_LABELS, 49 labels=COMMON_LABELS,
44 ) 50 )
45 51
52 # gae_ts_mon
53 FIELD_BUCKET = 'bucket'
54 COMMON_FIELDS = {
55 'buildbucket_hostname': app_identity.get_default_version_hostname(),
56 }
57
58
59 def _def_metric(metric_type, name, description):
60 return metric_type(
61 '/buildbucket/%s' % name,
Sergey Berezin 2015/12/17 00:08:01 Remove the leading slash: just 'buildbucket/%s'.
nodir 2015/12/17 01:28:54 Done.
62 fields=COMMON_FIELDS,
63 description=description)
64
65
66 CREATE_COUNT = _def_metric(
67 gae_ts_mon.CounterMetric,
68 'created',
Sergey Berezin 2015/12/17 00:08:01 nit: I'd make the metric name more descriptive, e.g. …
nodir 2015/12/17 01:28:54 Done
69 'Build creation',
70 )
71 START_COUNT = _def_metric(
72 gae_ts_mon.CounterMetric,
73 'started',
74 'Build start',
75 )
76 COMPLETE_COUNT = _def_metric(
77 gae_ts_mon.CounterMetric,
78 'completed',
79 'Build completion, including success, failure and cancellation'
80 )
81 HEARTBEAT_FAILURE_COUNT = _def_metric(
82 gae_ts_mon.CounterMetric,
83 'heartbeat_failures',
84 'Failures to extend a build lease'
85 )
86 LEASE_COUNT = _def_metric(
87 gae_ts_mon.CounterMetric,
88 'leases',
89 'Successful build lease extension',
90 )
91 LEASE_EXPIRATION_COUNT = _def_metric(
92 gae_ts_mon.CounterMetric,
93 'lease_expired',
94 'Build lease expirations'
95 )
96 CURRENTLY_PENDING = _def_metric(
97 gae_ts_mon.GaugeMetric,
98 'pending',
99 'Number of pending builds',
100 )
101 CURRENTLY_RUNNING = _def_metric(
102 gae_ts_mon.GaugeMetric,
103 'running',
104 'Number of running builds'
105 )
106 LEASE_LATENCY = _def_metric(
107 gae_ts_mon.NonCumulativeDistributionMetric,
Sergey Berezin 2015/12/17 00:08:01 I think this should be CumulativeDistributionMetric…
nodir 2015/12/17 01:28:54 FWIU, you are suggesting to do LEASE_LATENCY.add(…
Sergey Berezin 2015/12/17 02:30:18 It's a valid point; e.g. CQ measures both complete…
nodir 2015/12/17 03:19:56 Renamed to never_leased_duration and scheduling_du…
108 'lease_latency',
109 'Duration between a build is created and it is leased for the first time',
110 )
111 SCHEDULING_LATENCY = _def_metric(
112 gae_ts_mon.NonCumulativeDistributionMetric,
113 'scheduling_latency',
114 'Duration of a build being in SCHEDULED state',
115 )
116
117
118 GAUGE_OF_CLOUD_METRIC = {
119 METRIC_PENDING_BUILDS: CURRENTLY_PENDING,
120 METRIC_RUNNING_BUILDS: CURRENTLY_RUNNING,
121 }
122 DISTRIBUTION_OF_CLOUD_METRIC = {
123 METRIC_LEASE_BUILD_LATENCY: LEASE_LATENCY,
124 METRIC_SCHEDULING_LATENCY: SCHEDULING_LATENCY,
125 }
126
127
128 def increment(metric, build, **fields):
129 fields = {
130 k: str(v)
131 for k, v in fields.iteritems()
132 if v is not None
Sergey Berezin 2015/12/17 00:08:01 Don't skip fields dynamically. ts_mon requires all…
nodir 2015/12/17 01:28:54 Done.
133 }
134 fields.setdefault(FIELD_BUCKET, build.bucket if build else '<no bucket>')
135 if build:
136 for t in build.tags:
137 k, v = t.split(':', 1)
138 fields.setdefault('tag_%s' % k, v)
Sergey Berezin 2015/12/17 00:08:01 How many different tags are there? Just checking t…
nodir 2015/12/17 01:28:54 made them static
139 metric.increment(fields)
140
46 141
47 def set_gauge(buf, bucket, metric, value): 142 def set_gauge(buf, bucket, metric, value):
48 logging.info('Bucket %s: %s = %d', bucket, metric.name, value) 143 logging.info('Bucket %s: %s = %d', bucket, metric.name, value)
49 buf.set_gauge(metric, value, {LABEL_BUCKET: bucket}) 144 buf.set_gauge(metric, value, {LABEL_BUCKET: bucket})
145 gae_ts_mon_metric = GAUGE_OF_CLOUD_METRIC.get(metric)
146 if gae_ts_mon_metric:
147 gae_ts_mon_metric.set(value, {FIELD_BUCKET: bucket})
50 148
51 149
52 @ndb.tasklet 150 @ndb.tasklet
53 def send_build_status_metric(buf, bucket, metric, status): 151 def send_build_status_metric(buf, bucket, metric, status):
54 q = model.Build.query( 152 q = model.Build.query(
55 model.Build.bucket == bucket, 153 model.Build.bucket == bucket,
56 model.Build.status == status) 154 model.Build.status == status)
57 value = yield q.count_async() 155 value = yield q.count_async()
58 set_gauge(buf, bucket, metric, value) 156 set_gauge(buf, bucket, metric, value)
59 157
60 158
61 @ndb.tasklet 159 @ndb.tasklet
62 def send_build_latency(buf, metric, bucket, must_be_never_leased): 160 def send_build_latency(buf, metric, bucket, must_be_never_leased):
63 q = model.Build.query( 161 q = model.Build.query(
64 model.Build.bucket == bucket, 162 model.Build.bucket == bucket,
65 model.Build.status == model.BuildStatus.SCHEDULED, 163 model.Build.status == model.BuildStatus.SCHEDULED,
66 ) 164 )
67 if must_be_never_leased: 165 if must_be_never_leased:
68 q = q.filter(model.Build.never_leased == True) 166 q = q.filter(model.Build.never_leased == True)
69 else: 167 else:
70 # Reuse the index that has never_leased 168 # Reuse the index that has never_leased
71 q = q.filter(model.Build.never_leased.IN((True, False))) 169 q = q.filter(model.Build.never_leased.IN((True, False)))
72 170
73 now = utils.utcnow() 171 now = utils.utcnow()
74 avg_latency = 0.0 172 avg_latency = 0.0
75 count = 0 173 count = 0
174 dist = gae_ts_mon.Distribution(gae_ts_mon.GeometricBucketer())
76 for e in q.iter(projection=[model.Build.create_time]): 175 for e in q.iter(projection=[model.Build.create_time]):
77 avg_latency += (now - e.create_time).total_seconds() 176 latency = (now - e.create_time).total_seconds()
177 dist.add(latency)
178 avg_latency += latency
78 count += 1 179 count += 1
79 if count > 0: 180 if count > 0:
80 avg_latency /= count 181 avg_latency /= count
81 set_gauge(buf, bucket, metric, avg_latency) 182 set_gauge(buf, bucket, metric, avg_latency)
183 DISTRIBUTION_OF_CLOUD_METRIC[metric].set(dist, {FIELD_BUCKET: bucket})
82 184
83 185
84 def send_all_metrics(): 186 def send_all_metrics():
85 buf = metrics.Buffer() 187 buf = metrics.Buffer()
86 futures = [] 188 futures = []
87 for b in config.get_buckets_async().get_result(): 189 for b in config.get_buckets_async().get_result():
88 futures.extend([ 190 futures.extend([
89 send_build_status_metric( 191 send_build_status_metric(
90 buf, b.name, METRIC_PENDING_BUILDS, model.BuildStatus.SCHEDULED), 192 buf, b.name, METRIC_PENDING_BUILDS, model.BuildStatus.SCHEDULED),
91 send_build_status_metric( 193 send_build_status_metric(
92 buf, b.name, METRIC_RUNNING_BUILDS, model.BuildStatus.STARTED), 194 buf, b.name, METRIC_RUNNING_BUILDS, model.BuildStatus.STARTED),
93 send_build_latency(buf, METRIC_LEASE_BUILD_LATENCY, b.name, True), 195 send_build_latency(buf, METRIC_LEASE_BUILD_LATENCY, b.name, True),
94 send_build_latency(buf, METRIC_SCHEDULING_LATENCY, b.name, False), 196 send_build_latency(buf, METRIC_SCHEDULING_LATENCY, b.name, False),
95 ]) 197 ])
96 ndb.Future.wait_all(futures) 198 ndb.Future.wait_all(futures)
97 buf.flush() 199 buf.flush()
98 for f in futures: 200 for f in futures:
99 f.check_success() 201 f.check_success()
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698