Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1028)

Unified Diff: scripts/master/monitoring_status_receiver.py

Issue 2102383003: Report the /json/varz metrics directly from buildbot. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@buildbot-tsmon-2
Patch Set: Rebase Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: scripts/master/monitoring_status_receiver.py
diff --git a/scripts/master/monitoring_status_receiver.py b/scripts/master/monitoring_status_receiver.py
index db096b93ab8e38ca9e3afe077d6474b7e52d3d87..81279ba2c529fa65666ff95c2296e6e9d7c86550 100644
--- a/scripts/master/monitoring_status_receiver.py
+++ b/scripts/master/monitoring_status_receiver.py
@@ -2,12 +2,41 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+import collections
+import time
+
from buildbot.status.base import StatusReceiverMultiService
-from twisted.internet import task
+from twisted.internet import defer, reactor, task
from twisted.python import log, threadpool
from infra_libs import ts_mon
# Master-wide metrics.
uptime = ts_mon.FloatMetric(
    'buildbot/master/uptime',
    description='Time (in seconds) since the master was started')
accepting_builds = ts_mon.BooleanMetric(
    'buildbot/master/accepting_builds',
    description='Whether the master\'s BuildRequestDistributor is running')

# Per-builder metrics; updateMetrics() sets each of these with a 'builder'
# field holding the builder name.
connected = ts_mon.GaugeMetric(
    'buildbot/master/builders/connected_slaves',
    description='Number of slaves currently connected, per builder')
current_builds = ts_mon.GaugeMetric(
    'buildbot/master/builders/current_builds',
    description='Number of builds currently running, per builder')
pending_builds = ts_mon.GaugeMetric(
    'buildbot/master/builders/pending_builds',
    description='Number of builds pending, per builder')
state = ts_mon.StringMetric(
    'buildbot/master/builders/state',
    description='State of this builder - building, idle, or offline')
total = ts_mon.GaugeMetric(
    'buildbot/master/builders/total_slaves',
    description='Number of slaves configured on this builder - connected or '
                'not')

# Database thread pool metrics.
pool_queue = ts_mon.GaugeMetric(
    'buildbot/master/thread_pool/queue',
    description='Number of runnables queued in the database thread pool')
pool_waiting = ts_mon.GaugeMetric(
    'buildbot/master/thread_pool/waiting',
    description='Number of idle workers for the database thread pool')
pool_working = ts_mon.GaugeMetric(
    'buildbot/master/thread_pool/working',
    description='Number of running workers for the database thread pool')

# Timestamp taken when this module is first imported; the uptime metric is
# reported relative to it.
SERVER_STARTED = time.time()

+
class MonitoringStatusReceiver(StatusReceiverMultiService):
"""Flushes ts_mon metrics once per minute."""
@@ -16,7 +45,7 @@ class MonitoringStatusReceiver(StatusReceiverMultiService):
StatusReceiverMultiService.__init__(self)
self.status = None
self.thread_pool = threadpool.ThreadPool(1, 1)
- self.loop = task.LoopingCall(self._flush)
+ self.loop = task.LoopingCall(self.updateMetricsAndFlush)
def startService(self):
StatusReceiverMultiService.startService(self)
@@ -31,8 +60,53 @@ class MonitoringStatusReceiver(StatusReceiverMultiService):
self.thread_pool.stop()
return StatusReceiverMultiService.stopService(self)
- def _flush(self):
- self.thread_pool.callInThread(self._flush_and_log_exceptions)
@defer.inlineCallbacks
def updateMetricsAndFlush(self):
  """Refreshes the metric values, then schedules a flush on the thread pool.

  This is the function driven by the LoopingCall created in __init__. The
  flush is scheduled whether or not updateMetrics() succeeds.

  Returns:
    A Deferred (via inlineCallbacks) that fires once updateMetrics() has
    finished and the flush has been handed to self.thread_pool.
  """
  try:
    yield self.updateMetrics()
  except Exception as ex:
    # A failure escaping from here would errback the Deferred returned to
    # LoopingCall, which stops the loop for good. Log it instead so a
    # transient error only costs one update cycle.
    log.err(ex, 'updateMetrics failed; flushing previously set metrics')
  finally:
    self.thread_pool.callInThread(self._flush_and_log_exceptions)
+
@defer.inlineCallbacks
def updateMetrics(self):
  """Sets every buildbot master metric from the current master state.

  Master-wide, thread pool and per-builder values are read synchronously
  from self.status. Pending build counts come from an asynchronous
  getBuildRequests(claimed=False) database query, which is cancelled if it
  has not completed after 5 seconds. If that query fails or is cancelled the
  error is logged and pending_builds is left unchanged for this cycle.

  Returns:
    A Deferred (via inlineCallbacks) that fires once all metrics are set.
  """
  master = self.status.master

  uptime.set(time.time() - SERVER_STARTED)
  accepting_builds.set(bool(master.botmaster.brd.running))

  pool = master.db.pool
  pool_queue.set(pool.q.qsize())
  pool_waiting.set(len(pool.waiters))
  pool_working.set(len(pool.working))

  # Materialize the names once; they are needed again below to seed the
  # pending build counts.
  builder_names = list(self.status.getBuilderNames())
  for builder_name in builder_names:
    fields = {'builder': builder_name}
    builder = self.status.getBuilder(builder_name)
    slaves = builder.getSlaves()

    connected.set(sum(1 for x in slaves if x.connected), fields=fields)
    current_builds.set(len(builder.getCurrentBuilds()), fields=fields)
    state.set(builder.currentBigState, fields=fields)
    total.set(len(slaves), fields=fields)

  # Get pending build requests directly from the db for all builders at
  # once.
  d = master.db.buildrequests.getBuildRequests(claimed=False)

  # Time out the database request after 5 seconds.
  def timeout():
    if not d.called:
      d.cancel()
  timer = reactor.callLater(5, timeout)

  try:
    brdicts = yield d
  except Exception as ex:
    log.err(ex, 'getBuildRequests failed while populating metrics')
  else:
    # Start every known builder at zero so a builder whose queue has drained
    # reports 0 instead of keeping its previous non-zero value.
    pending_per_builder = collections.defaultdict(int)
    for builder_name in builder_names:
      pending_per_builder[builder_name] = 0
    for brdict in brdicts:
      pending_per_builder[brdict['buildername']] += 1

    for builder_name, count in pending_per_builder.iteritems():
      pending_builds.set(count, fields={'builder': builder_name})
  finally:
    # The query has completed one way or another; don't leave the timeout
    # call pending in the reactor.
    if timer.active():
      timer.cancel()
def _flush_and_log_exceptions(self):
try:
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698