Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2467)

Unified Diff: appengine/cr-buildbucket/metrics.py

Issue 1532713002: buildbucket: add monitoring using ts_mon (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/cr-buildbucket/metrics.py
diff --git a/appengine/cr-buildbucket/metrics.py b/appengine/cr-buildbucket/metrics.py
index 3473d984961b11b00f2ecb7f29d821a15570f834..5f63c359896b3c2e9f8d7020501d8ee8c3804dcf 100644
--- a/appengine/cr-buildbucket/metrics.py
+++ b/appengine/cr-buildbucket/metrics.py
@@ -3,14 +3,20 @@
# found in the LICENSE file.
import logging
+
+from google.appengine.api import app_identity
from google.appengine.ext import ndb
from components import metrics
from components import utils
+import gae_ts_mon
import config
import model
+# TODO(nodir): remove Cloud Monitoring and refactor
+# when gae_ts_mon is stabilized
+
LABEL_BUCKET = 'buildbucket/bucket'
COMMON_LABELS = {
LABEL_BUCKET: 'Bucket'
@@ -43,10 +49,102 @@ METRIC_SCHEDULING_LATENCY = metrics.Descriptor(
labels=COMMON_LABELS,
)
+# gae_ts_mon
+FIELD_BUCKET = 'bucket'
+COMMON_FIELDS = {
+ 'buildbucket_hostname': app_identity.get_default_version_hostname(),
+}
+
+
+def _def_metric(metric_type, name, description):
+ return metric_type(
+ '/buildbucket/%s' % name,
Sergey Berezin 2015/12/17 00:08:01 Remove the leading slash: just 'buildbucket/%s'.
nodir 2015/12/17 01:28:54 Done.
+ fields=COMMON_FIELDS,
+ description=description)
+
+
+CREATE_COUNT = _def_metric(
+ gae_ts_mon.CounterMetric,
+ 'created',
Sergey Berezin 2015/12/17 00:08:01 nit: I'd make the metric name more descriptive, e.
nodir 2015/12/17 01:28:54 Done
+ 'Build creation',
+)
+START_COUNT = _def_metric(
+ gae_ts_mon.CounterMetric,
+ 'started',
+ 'Build start',
+)
+COMPLETE_COUNT = _def_metric(
+ gae_ts_mon.CounterMetric,
+ 'completed',
+ 'Build completion, including success, failure and cancellation'
+)
+HEARTBEAT_FAILURE_COUNT = _def_metric(
+ gae_ts_mon.CounterMetric,
+ 'heartbeat_failures',
+ 'Failures to extend a build lease'
+)
+LEASE_COUNT = _def_metric(
+ gae_ts_mon.CounterMetric,
+ 'leases',
+ 'Successful build lease extension',
+)
+LEASE_EXPIRATION_COUNT = _def_metric(
+ gae_ts_mon.CounterMetric,
+ 'lease_expired',
+ 'Build lease expirations'
+)
+CURRENTLY_PENDING = _def_metric(
+ gae_ts_mon.GaugeMetric,
+ 'pending',
+ 'Number of pending builds',
+)
+CURRENTLY_RUNNING = _def_metric(
+ gae_ts_mon.GaugeMetric,
+ 'running',
+ 'Number of running builds'
+)
+LEASE_LATENCY = _def_metric(
+ gae_ts_mon.NonCumulativeDistributionMetric,
Sergey Berezin 2015/12/17 00:08:01 I think this should be CumulativeDistributionMetri
nodir 2015/12/17 01:28:54 FWIU, you are suggesting to do LEASE_LATENCY.add(
Sergey Berezin 2015/12/17 02:30:18 It's a valid point; e.g. CQ measures both complete
nodir 2015/12/17 03:19:56 Renamed to never_leased_duration and scheduling_du
+ 'lease_latency',
+ 'Duration between a build is created and it is leased for the first time',
+)
+SCHEDULING_LATENCY = _def_metric(
+ gae_ts_mon.NonCumulativeDistributionMetric,
+ 'scheduling_latency',
+ 'Duration of a build being in SCHEDULED state',
+)
+
+
+GAUGE_OF_CLOUD_METRIC = {
+ METRIC_PENDING_BUILDS: CURRENTLY_PENDING,
+ METRIC_RUNNING_BUILDS: CURRENTLY_RUNNING,
+}
+DISTRIBUTION_OF_CLOUD_METRIC = {
+ METRIC_LEASE_BUILD_LATENCY: LEASE_LATENCY,
+ METRIC_SCHEDULING_LATENCY: SCHEDULING_LATENCY,
+}
+
+
+def increment(metric, build, **fields):
+ fields = {
+ k: str(v)
+ for k, v in fields.iteritems()
+ if v is not None
Sergey Berezin 2015/12/17 00:08:01 Don't skip fields dynamically. ts_mon requires all
nodir 2015/12/17 01:28:54 Done.
+ }
+ fields.setdefault(FIELD_BUCKET, build.bucket if build else '<no bucket>')
+ if build:
+ for t in build.tags:
+ k, v = t.split(':', 1)
+ fields.setdefault('tag_%s' % k, v)
Sergey Berezin 2015/12/17 00:08:01 How many different tags are there? Just checking t
nodir 2015/12/17 01:28:54 made them static
+ metric.increment(fields)
+
def set_gauge(buf, bucket, metric, value):
logging.info('Bucket %s: %s = %d', bucket, metric.name, value)
buf.set_gauge(metric, value, {LABEL_BUCKET: bucket})
+ gae_ts_mon_metric = GAUGE_OF_CLOUD_METRIC.get(metric)
+ if gae_ts_mon_metric:
+ gae_ts_mon_metric.set(value, {FIELD_BUCKET: bucket})
@ndb.tasklet
@@ -73,12 +171,16 @@ def send_build_latency(buf, metric, bucket, must_be_never_leased):
now = utils.utcnow()
avg_latency = 0.0
count = 0
+ dist = gae_ts_mon.Distribution(gae_ts_mon.GeometricBucketer())
for e in q.iter(projection=[model.Build.create_time]):
- avg_latency += (now - e.create_time).total_seconds()
+ latency = (now - e.create_time).total_seconds()
+ dist.add(latency)
+ avg_latency += latency
count += 1
if count > 0:
avg_latency /= count
set_gauge(buf, bucket, metric, avg_latency)
+ DISTRIBUTION_OF_CLOUD_METRIC[metric].set(dist, {FIELD_BUCKET: bucket})
def send_all_metrics():

Powered by Google App Engine
This is Rietveld 408576698