| OLD | NEW |
| (Empty) |
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 # Use of this source code is governed by a BSD-style license that can be | |
| 3 # found in the LICENSE file. | |
| 4 | |
| 5 import json | |
| 6 import logging | |
| 7 import os | |
| 8 import socket | |
| 9 import sys | |
| 10 import urlparse | |
| 11 import re | |
| 12 | |
| 13 import requests | |
| 14 | |
| 15 from infra_libs.ts_mon.common import interface | |
| 16 from infra_libs.ts_mon.common import metric_store | |
| 17 from infra_libs.ts_mon.common import monitors | |
| 18 from infra_libs.ts_mon.common import standard_metrics | |
| 19 from infra_libs.ts_mon.common import targets | |
| 20 | |
| 21 | |
def load_machine_config(filename):
  """Read the machine-wide ts_mon JSON configuration file.

  Args:
    filename (str): path to the JSON configuration file.

  Returns:
    The decoded JSON content of the file, or an empty dict when the file does
    not exist.

  Raises:
    Exception: whatever opening or decoding the file raised. The failure is
      logged and then re-raised unchanged.
  """
  if os.path.exists(filename):
    try:
      with open(filename) as config_file:
        return json.load(config_file)
    except Exception:
      logging.error('Configuration file couldn\'t be read: %s', filename)
      raise

  # A missing file is not an error: callers get an empty configuration.
  logging.info('Configuration file does not exist, ignoring: %s', filename)
  return {}
| 33 | |
| 34 | |
def _default_region(fqdn):
  """Guess the region this machine is running in.

  The GCE metadata server is asked for the instance zone first. When it
  answers with an OK status, the last slash-separated component of the reply
  is returned. Otherwise the second dot-separated label of `fqdn` is used.

  Args:
    fqdn (str): fully qualified domain name of this machine.

  Returns:
    The zone reported by the metadata server, or the second label of `fqdn`,
    or '' when `fqdn` has fewer than two labels.
  """
  # Check if we're running in a GCE instance.
  try:
    response = requests.get(
        'http://metadata.google.internal/computeMetadata/v1/instance/zone',
        headers={'Metadata-Flavor': 'Google'},
        timeout=1.0)
  except requests.exceptions.RequestException:
    # The request failed; fall back to the hostname-based guess below.
    response = None

  if response is not None and response.status_code == requests.codes.ok:
    # The zone is the last slash-separated component.
    return response.text.rsplit('/', 1)[-1]

  labels = fqdn.split('.')
  if len(labels) < 2:
    return ''
  return labels[1]  # [chrome|golo]
| 53 | |
| 54 | |
| 55 def _default_network(host): | |
| 56 try: | |
| 57 # Regular expression that matches the vast majority of our host names. | |
| 58 # Matches everything of the form 'masterN', 'masterNa', and 'foo-xN'. | |
| 59 return re.match(r'^([\w-]*?-[acm]|master)(\d+)a?$', host).group(2) # N | |
| 60 except AttributeError: | |
| 61 return '' | |
| 62 | |
| 63 | |
def add_argparse_options(parser):
  """Add monitoring related flags to a process' argument parser.

  All flags are registered in a dedicated 'Timeseries Monitoring Options'
  argument group. Defaults for the hostname, region and network flags are
  derived from this machine's fully qualified domain name; computing the
  region default calls _default_region(), which issues an HTTP request.

  Args:
    parser (argparse.ArgumentParser): the parser for the main process.
  """
  if sys.platform != 'win32':  # pragma: no cover
    default_config_file = '/etc/chrome-infra/ts-mon.json'
  else:  # pragma: no cover
    default_config_file = 'C:\\chrome-infra\\ts-mon.json'

  group = parser.add_argument_group('Timeseries Monitoring Options')
  add_flag = group.add_argument

  # Machine-wide configuration and the flags that override it.
  add_flag(
      '--ts-mon-config-file',
      default=default_config_file,
      help='path to a JSON config file that contains suitable values for '
           '"endpoint" and "credentials" for this machine. This config file is '
           'intended to be shared by all processes on the machine, as the '
           'values depend on the machine\'s position in the network, IP '
           'whitelisting and deployment of credentials. (default: %(default)s)')
  add_flag(
      '--ts-mon-endpoint',
      help='url (including file://, pubsub://project/topic, https://) to post '
           'monitoring metrics to. If set, overrides the value in '
           '--ts-mon-config-file')
  add_flag(
      '--ts-mon-credentials',
      help='path to a pkcs8 json credential file. If set, overrides the value '
           'in --ts-mon-config-file')

  # Flushing behaviour.
  add_flag(
      '--ts-mon-flush',
      default='auto',
      choices=('manual', 'auto'),
      help=('metric push behavior: manual (only send when flush() is called), '
            'or auto (send automatically every --ts-mon-flush-interval-secs '
            'seconds). (default: %(default)s)'))
  add_flag(
      '--ts-mon-flush-interval-secs',
      default=60,
      type=int,
      help=('automatically push metrics on this interval if '
            '--ts-mon-flush=auto.'))
  add_flag(
      '--ts-mon-autogen-hostname',
      action="store_true",
      help=('Indicate that the hostname is autogenerated. '
            'This option must be set on autoscaled GCE VMs, Kubernetes pods, '
            'or any other hosts with dynamically generated names.'))

  add_flag(
      '--ts-mon-target-type',
      default='device',
      choices=('device', 'task'),
      help='the type of target that is being monitored ("device" or "task").'
           ' (default: %(default)s)')

  # Derive the target defaults from this machine's name.
  fqdn = socket.getfqdn().lower()  # foo-[a|m]N.[chrome|golo].chromium.org
  host = fqdn.split('.')[0]  # foo-[a|m]N
  region = _default_region(fqdn)
  network = _default_network(host)

  # Fields of a 'device' target.
  add_flag(
      '--ts-mon-device-hostname',
      default=host,
      help='name of this device, (default: %(default)s)')
  add_flag(
      '--ts-mon-device-region',
      default=region,
      help='name of the region this devices lives in. (default: %(default)s)')
  add_flag(
      '--ts-mon-device-role',
      default='default',
      help='Role of the device. (default: %(default)s)')
  add_flag(
      '--ts-mon-device-network',
      default=network,
      help='name of the network this device is connected to. '
           '(default: %(default)s)')

  # Fields of a 'task' target.
  add_flag(
      '--ts-mon-task-service-name',
      help='name of the service being monitored')
  add_flag(
      '--ts-mon-task-job-name',
      help='name of this job instance of the task')
  add_flag(
      '--ts-mon-task-region',
      default=region,
      help='name of the region in which this task is running '
           '(default: %(default)s)')
  add_flag(
      '--ts-mon-task-hostname',
      default=host,
      help='name of the host on which this task is running '
           '(default: %(default)s)')
  add_flag(
      '--ts-mon-task-number',
      default=0,
      type=int,
      help='number (e.g. for replication) of this instance of this task '
           '(default: %(default)s)')

  add_flag(
      '--ts-mon-metric-name-prefix',
      default='/chrome/infra/',
      help='metric name prefix for all metrics (default: %(default)s)')
| 167 | |
def process_argparse_options(args):
  """Process command line arguments to initialize the global monitor.

  Also initializes the default target.

  Starts a background thread to automatically flush monitoring metrics if not
  disabled by command line arguments.

  The endpoint and credentials come from the machine config file named by
  args.ts_mon_config_file, unless the matching command-line flag was given.
  When the target type is "task" and the service name or job name is missing,
  a message is written to stderr and the process exits with status 2.

  Args:
    args (argparse.Namespace): the result of parsing the command line arguments
  """
  # Parse the config file if it exists.
  config = load_machine_config(args.ts_mon_config_file)
  # Treat a JSON null endpoint like a missing one so the startswith() checks
  # below report an invalid endpoint instead of raising AttributeError.
  endpoint = config.get('endpoint') or ''
  credentials = config.get('credentials', '')
  autogen_hostname = config.get('autogen_hostname', False)

  # Command-line args override the values in the config file.
  if args.ts_mon_endpoint is not None:
    endpoint = args.ts_mon_endpoint
  if args.ts_mon_credentials is not None:
    credentials = args.ts_mon_credentials

  # Autogenerated hostnames are marked with a prefix; either the flag or the
  # config file can request it, and it applies to both target types.
  hostname_prefix = ''
  if args.ts_mon_autogen_hostname or autogen_hostname:
    hostname_prefix = 'autogen:'

  if args.ts_mon_target_type == 'device':
    interface.state.target = targets.DeviceTarget(
        args.ts_mon_device_region,
        args.ts_mon_device_role,
        args.ts_mon_device_network,
        hostname_prefix + args.ts_mon_device_hostname)
  elif args.ts_mon_target_type == 'task':
    # Reimplement ArgumentParser.error, since we don't have access to the
    # parser. sys.stderr.write is used rather than the print statement so the
    # message is emitted identically on Python 2 and Python 3.
    required_flags = (
        ('--ts-mon-task-service-name', args.ts_mon_task_service_name),
        ('--ts-mon-task-job-name', args.ts_mon_task_job_name),
    )
    for flag, value in required_flags:
      if not value:
        sys.stderr.write(
            'Argument %s must be provided when the target type is "task".\n'
            % flag)
        sys.exit(2)
    interface.state.target = targets.TaskTarget(
        args.ts_mon_task_service_name,
        args.ts_mon_task_job_name,
        args.ts_mon_task_region,
        hostname_prefix + args.ts_mon_task_hostname,
        args.ts_mon_task_number)

  interface.state.metric_name_prefix = args.ts_mon_metric_name_prefix
  # Start from a NullMonitor; it is replaced below only when the endpoint is
  # recognised (and, for pubsub, credentials are available).
  interface.state.global_monitor = monitors.NullMonitor()

  if endpoint.startswith('file://'):
    interface.state.global_monitor = monitors.DebugMonitor(
        endpoint[len('file://'):])
  elif endpoint.startswith('pubsub://'):
    if credentials:
      # pubsub://project/topic: the project is the netloc, the topic the path.
      url = urlparse.urlparse(endpoint)
      project = url.netloc
      topic = url.path.strip('/')
      interface.state.global_monitor = monitors.PubSubMonitor(
          credentials, project, topic, use_instrumented_http=True)
    else:
      logging.error('ts_mon monitoring is disabled because credentials are not '
                    'available')
  elif endpoint.startswith('https://'):
    interface.state.global_monitor = monitors.HttpsMonitor(endpoint,
                                                           credentials)
  elif endpoint.lower() == 'none':
    logging.info('ts_mon monitoring has been explicitly disabled')
  else:
    logging.error('ts_mon monitoring is disabled because the endpoint provided'
                  ' is invalid or not supported: %s', endpoint)

  interface.state.flush_mode = args.ts_mon_flush

  if args.ts_mon_flush == 'auto':
    interface.state.flush_thread = interface._FlushThread(
        args.ts_mon_flush_interval_secs)
    interface.state.flush_thread.start()

  standard_metrics.init()
| OLD | NEW |