appengine/cr-buildbucket/metrics.py - Issue 1532713002: buildbucket: add monitoring using ts_mon

Side by Side Diff: appengine/cr-buildbucket/metrics.py

Issue 1532713002: buildbucket: add monitoring using ts_mon (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 import logging	5 import logging

	6

	7 from google.appengine.api import app_identity

6 from google.appengine.ext import ndb	8 from google.appengine.ext import ndb

7	9

8 from components import metrics	10 from components import metrics

9 from components import utils	11 from components import utils

	12 import gae_ts_mon

10	13

11 import config	14 import config

12 import model	15 import model

13	16

	17 # TODO(nodir): remove Cloud Monitoring and refactor

	18 # when gae_ts_mon is stabilized

	19

14 LABEL_BUCKET = 'buildbucket/bucket'	20 LABEL_BUCKET = 'buildbucket/bucket'

15 COMMON_LABELS = {	21 COMMON_LABELS = {

16 LABEL_BUCKET: 'Bucket'	22 LABEL_BUCKET: 'Bucket'

17 }	23 }

18 METRIC_PENDING_BUILDS = metrics.Descriptor(	24 METRIC_PENDING_BUILDS = metrics.Descriptor(

19 name='buildbucket/builds/pending',	25 name='buildbucket/builds/pending',

20 description='Number of pending builds',	26 description='Number of pending builds',

21 labels=COMMON_LABELS,	27 labels=COMMON_LABELS,

22 )	28 )

23 METRIC_RUNNING_BUILDS = metrics.Descriptor(	29 METRIC_RUNNING_BUILDS = metrics.Descriptor(

(...skipping 12 matching lines...) Expand all Loading...
36 METRIC_SCHEDULING_LATENCY = metrics.Descriptor(	42 METRIC_SCHEDULING_LATENCY = metrics.Descriptor(

37 name='buildbucket/builds/scheduling_latency',	43 name='buildbucket/builds/scheduling_latency',

38 description=(	44 description=(

39 'Average number of seconds for a scheduled build '	45 'Average number of seconds for a scheduled build '

40 'to remain in SCHEDULED leased state'	46 'to remain in SCHEDULED leased state'

41 ),	47 ),

42 value_type='double',	48 value_type='double',

43 labels=COMMON_LABELS,	49 labels=COMMON_LABELS,

44 )	50 )

45	51

	52 # gae_ts_mon

	53 FIELD_BUCKET = 'bucket'

	54 COMMON_FIELDS = {

	55 'buildbucket_hostname': app_identity.get_default_version_hostname(),

	56 }

	57

	58

	59 def _def_metric(metric_type, name, description):

	60 return metric_type(

	61 '/buildbucket/%s' % name,
	Sergey Berezin (2015/12/17 00:08:01): Remove the leading slash: just 'buildbucket/%s'.
	nodir (2015/12/17 01:28:54): Done.
	62 fields=COMMON_FIELDS,

	63 description=description)

	64

	65

	66 CREATE_COUNT = _def_metric(

	67 gae_ts_mon.CounterMetric,

	68 'created',
	Sergey Berezin (2015/12/17 00:08:01): nit: I'd make the metric name more descriptive, e.g. 'builds/created'. Similarly below.
	nodir (2015/12/17 01:28:54): Done
	69 'Build creation',

	70 )

	71 START_COUNT = _def_metric(

	72 gae_ts_mon.CounterMetric,

	73 'started',

	74 'Build start',

	75 )

	76 COMPLETE_COUNT = _def_metric(

	77 gae_ts_mon.CounterMetric,

	78 'completed',

	79 'Build completion, including success, failure and cancellation'

	80 )

	81 HEARTBEAT_FAILURE_COUNT = _def_metric(

	82 gae_ts_mon.CounterMetric,

	83 'heartbeat_failures',

	84 'Failures to extend a build lease'

	85 )

	86 LEASE_COUNT = _def_metric(

	87 gae_ts_mon.CounterMetric,

	88 'leases',

	89 'Successful build lease extension',

	90 )

	91 LEASE_EXPIRATION_COUNT = _def_metric(

	92 gae_ts_mon.CounterMetric,

	93 'lease_expired',

	94 'Build lease expirations'

	95 )

	96 CURRENTLY_PENDING = _def_metric(

	97 gae_ts_mon.GaugeMetric,

	98 'pending',

	99 'Number of pending builds',

	100 )

	101 CURRENTLY_RUNNING = _def_metric(

	102 gae_ts_mon.GaugeMetric,

	103 'running',

	104 'Number of running builds'

	105 )

	106 LEASE_LATENCY = _def_metric(

	107 gae_ts_mon.NonCumulativeDistributionMetric,
	Sergey Berezin (2015/12/17 00:08:01): I think this should be CumulativeDistributionMetric, since you want to measure it at the moment the build is leased, so it becomes an event.
	nodir (2015/12/17 01:28:54): FWIU, you are suggesting to do LEASE_LATENCY.add(now - created, {'bucket': 'master.x'}) the moment a build is leased. However, imagine that something happened with buildbot and it stopped leasing entirely. Events don't fire and thus graphs do not indicate a problem, until the moment buildbot is back again. The moment leasing resumes we notice that it is huge, but we want to notice it the moment leasing stopped.
	Sergey Berezin (2015/12/17 02:30:18): It's a valid point; e.g. CQ measures both completed attempt times (cumulative) and time currently in CQ (non-cumulative), just for that reason. In this case, the name and description of the metric suggest it's an actual latency (time delta between two events), which is a cumulative measurement taken at lease time. I'd add both builds/lease_latency (cumulative) and builds/pending_time (or something like that - non-cumulative). Similarly for builds/scheduling_latency - the word "latency" implies something completed, not still waiting. The description also doesn't make it clear. In CQ, these metrics are named pending_attempts/durations and completed_attempts/durations, making it less ambiguous.
	nodir (2015/12/17 03:19:56): Renamed to never_leased_duration and scheduling_duration
	108 'lease_latency',

	109 'Duration between a build is created and it is leased for the first time',

	110 )

	111 SCHEDULING_LATENCY = _def_metric(

	112 gae_ts_mon.NonCumulativeDistributionMetric,

	113 'scheduling_latency',

	114 'Duration of a build being in SCHEDULED state',

	115 )

	116

	117

	118 GAUGE_OF_CLOUD_METRIC = {

	119 METRIC_PENDING_BUILDS: CURRENTLY_PENDING,

	120 METRIC_RUNNING_BUILDS: CURRENTLY_RUNNING,

	121 }

	122 DISTRIBUTION_OF_CLOUD_METRIC = {

	123 METRIC_LEASE_BUILD_LATENCY: LEASE_LATENCY,

	124 METRIC_SCHEDULING_LATENCY: SCHEDULING_LATENCY,

	125 }

	126

	127

	128 def increment(metric, build, **fields):

	129 fields = {

	130 k: str(v)

	131 for k, v in fields.iteritems()

	132 if v is not None
	Sergey Berezin (2015/12/17 00:08:01): Don't skip fields dynamically. ts_mon requires all fields to be present in the metric for all points, globally. Otherwise inconsistent metrics will be rejected. If needed, assign None, empty string or 0 to the field as appropriate.
	nodir (2015/12/17 01:28:54): Done.
	133 }

	134 fields.setdefault(FIELD_BUCKET, build.bucket if build else '<no bucket>')

	135 if build:

	136 for t in build.tags:

	137 k, v = t.split(':', 1)

	138 fields.setdefault('tag_%s' % k, v)
	Sergey Berezin (2015/12/17 00:08:01): How many different tags are there? Just checking that they are not dynamically generated. If they are, we can't have them as a field, since they'll create too many streams per metric.
	nodir (2015/12/17 01:28:54): made them static
	139 metric.increment(fields)

	140

46	141

47 def set_gauge(buf, bucket, metric, value):	142 def set_gauge(buf, bucket, metric, value):

48 logging.info('Bucket %s: %s = %d', bucket, metric.name, value)	143 logging.info('Bucket %s: %s = %d', bucket, metric.name, value)

49 buf.set_gauge(metric, value, {LABEL_BUCKET: bucket})	144 buf.set_gauge(metric, value, {LABEL_BUCKET: bucket})

	145 gae_ts_mon_metric = GAUGE_OF_CLOUD_METRIC.get(metric)

	146 if gae_ts_mon_metric:

	147 gae_ts_mon_metric.set(value, {FIELD_BUCKET: bucket})

50	148

51	149

52 @ndb.tasklet	150 @ndb.tasklet

53 def send_build_status_metric(buf, bucket, metric, status):	151 def send_build_status_metric(buf, bucket, metric, status):

54 q = model.Build.query(	152 q = model.Build.query(

55 model.Build.bucket == bucket,	153 model.Build.bucket == bucket,

56 model.Build.status == status)	154 model.Build.status == status)

57 value = yield q.count_async()	155 value = yield q.count_async()

58 set_gauge(buf, bucket, metric, value)	156 set_gauge(buf, bucket, metric, value)

59	157

60	158

61 @ndb.tasklet	159 @ndb.tasklet

62 def send_build_latency(buf, metric, bucket, must_be_never_leased):	160 def send_build_latency(buf, metric, bucket, must_be_never_leased):

63 q = model.Build.query(	161 q = model.Build.query(

64 model.Build.bucket == bucket,	162 model.Build.bucket == bucket,

65 model.Build.status == model.BuildStatus.SCHEDULED,	163 model.Build.status == model.BuildStatus.SCHEDULED,

66 )	164 )

67 if must_be_never_leased:	165 if must_be_never_leased:

68 q = q.filter(model.Build.never_leased == True)	166 q = q.filter(model.Build.never_leased == True)

69 else:	167 else:

70 # Reuse the index that has never_leased	168 # Reuse the index that has never_leased

71 q = q.filter(model.Build.never_leased.IN((True, False)))	169 q = q.filter(model.Build.never_leased.IN((True, False)))

72	170

73 now = utils.utcnow()	171 now = utils.utcnow()

74 avg_latency = 0.0	172 avg_latency = 0.0

75 count = 0	173 count = 0

	174 dist = gae_ts_mon.Distribution(gae_ts_mon.GeometricBucketer())

76 for e in q.iter(projection=[model.Build.create_time]):	175 for e in q.iter(projection=[model.Build.create_time]):

77 avg_latency += (now - e.create_time).total_seconds()	176 latency = (now - e.create_time).total_seconds()

	177 dist.add(latency)

	178 avg_latency += latency

78 count += 1	179 count += 1

79 if count > 0:	180 if count > 0:

80 avg_latency /= count	181 avg_latency /= count

81 set_gauge(buf, bucket, metric, avg_latency)	182 set_gauge(buf, bucket, metric, avg_latency)

	183 DISTRIBUTION_OF_CLOUD_METRIC[metric].set(dist, {FIELD_BUCKET: bucket})

82	184

83	185

84 def send_all_metrics():	186 def send_all_metrics():

85 buf = metrics.Buffer()	187 buf = metrics.Buffer()

86 futures = []	188 futures = []

87 for b in config.get_buckets_async().get_result():	189 for b in config.get_buckets_async().get_result():

88 futures.extend([	190 futures.extend([

89 send_build_status_metric(	191 send_build_status_metric(

90 buf, b.name, METRIC_PENDING_BUILDS, model.BuildStatus.SCHEDULED),	192 buf, b.name, METRIC_PENDING_BUILDS, model.BuildStatus.SCHEDULED),

91 send_build_status_metric(	193 send_build_status_metric(

92 buf, b.name, METRIC_RUNNING_BUILDS, model.BuildStatus.STARTED),	194 buf, b.name, METRIC_RUNNING_BUILDS, model.BuildStatus.STARTED),

93 send_build_latency(buf, METRIC_LEASE_BUILD_LATENCY, b.name, True),	195 send_build_latency(buf, METRIC_LEASE_BUILD_LATENCY, b.name, True),

94 send_build_latency(buf, METRIC_SCHEDULING_LATENCY, b.name, False),	196 send_build_latency(buf, METRIC_SCHEDULING_LATENCY, b.name, False),

95 ])	197 ])

96 ndb.Future.wait_all(futures)	198 ndb.Future.wait_all(futures)

97 buf.flush()	199 buf.flush()

98 for f in futures:	200 for f in futures:

99 f.check_success()	201 f.check_success()

OLD	NEW

« no previous file with comments | « appengine/cr-buildbucket/main.py ('k') | appengine/cr-buildbucket/module-backend.yaml » ('j') | appengine/cr-buildbucket/service.py » ('J')