third_party/grpc/tools/run_tests/stress_test/run_stress_tests_on_gke.py - Issue 1932353002: Initial checkin of gRPC to third_party/

Unified Diff: third_party/grpc/tools/run_tests/stress_test/run_stress_tests_on_gke.py

Issue 1932353002: Initial checkin of gRPC to third_party/ Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/grpc/tools/run_tests/stress_test/run_stress_tests_on_gke.py

diff --git a/third_party/grpc/tools/run_tests/stress_test/run_stress_tests_on_gke.py b/third_party/grpc/tools/run_tests/stress_test/run_stress_tests_on_gke.py

new file mode 100755

index 0000000000000000000000000000000000000000..634eb1aca53265dc4fe6f2842f089d86c21b71bb

--- /dev/null

+++ b/third_party/grpc/tools/run_tests/stress_test/run_stress_tests_on_gke.py

@@ -0,0 +1,556 @@

+#!/usr/bin/env python2.7

+# Redistribution and use in source and binary forms, with or without

+# modification, are permitted provided that the following conditions are

+# met:

+# * Redistributions of source code must retain the above copyright

+# notice, this list of conditions and the following disclaimer.

+# * Redistributions in binary form must reproduce the above

+# copyright notice, this list of conditions and the following disclaimer

+# in the documentation and/or other materials provided with the

+# distribution.

+# * Neither the name of Google Inc. nor the names of its

+# contributors may be used to endorse or promote products derived from

+# this software without specific prior written permission.

+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+import argparse

+import datetime

+import os

+import subprocess

+import sys

+import time

+stress_test_utils_dir = os.path.abspath(os.path.join(

+ os.path.dirname(__file__), '../../gcp/stress_test'))

+sys.path.append(stress_test_utils_dir)

+from stress_test_utils import BigQueryHelper

+kubernetes_api_dir = os.path.abspath(os.path.join(

+ os.path.dirname(__file__), '../../gcp/utils'))

+sys.path.append(kubernetes_api_dir)

+import kubernetes_api

+_GRPC_ROOT = os.path.abspath(os.path.join(

+ os.path.dirname(sys.argv[0]), '../../..'))

+os.chdir(_GRPC_ROOT)

+# num of seconds to wait for the GKE image to start and warmup

+_GKE_IMAGE_WARMUP_WAIT_SECS = 60

+_SERVER_POD_NAME = 'stress-server'

+_CLIENT_POD_NAME_PREFIX = 'stress-client'

+_DATASET_ID_PREFIX = 'stress_test'

+_SUMMARY_TABLE_ID = 'summary'

+_QPS_TABLE_ID = 'qps'

+_DEFAULT_DOCKER_IMAGE_NAME = 'grpc_stress_test'

+# The default port on which the kubernetes proxy server is started on localhost

+# (i.e kubectl proxy --port=<port>)

+_DEFAULT_KUBERNETES_PROXY_PORT = 8001

+# How frequently should the stress client wrapper script (running inside a GKE

+# container) poll the health of the stress client (also running inside the GKE

+# container) and upload metrics to BigQuery

+_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS = 60

+# The default setting for stress test server and client

+_DEFAULT_STRESS_SERVER_PORT = 8080

+_DEFAULT_METRICS_PORT = 8081

+_DEFAULT_TEST_CASES_STR = 'empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1'

+_DEFAULT_NUM_CHANNELS_PER_SERVER = 5

+_DEFAULT_NUM_STUBS_PER_CHANNEL = 10

+_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS = 30

+# Number of stress client instances to launch

+_DEFAULT_NUM_CLIENTS = 3

+# How frequently should this test monitor the health of Stress clients and

+# Servers running in GKE

+_DEFAULT_TEST_POLL_INTERVAL_SECS = 60

+# Default run time for this test (2 hour)

+_DEFAULT_TEST_DURATION_SECS = 7200

+# The number of seconds it would take a GKE pod to warm up (i.e get to 'Running'

+# state from the time of creation). Ideally this is something the test should

+# automatically determine by using Kubernetes API to poll the pods status.

+_DEFAULT_GKE_WARMUP_SECS = 60

+class KubernetesProxy:

+ """ Class to start a proxy on localhost to the Kubernetes API server """

+ def __init__(self, api_port):

+ self.port = api_port

+ self.p = None

+ self.started = False

+ def start(self):

+ cmd = ['kubectl', 'proxy', '--port=%d' % self.port]

+ self.p = subprocess.Popen(args=cmd)

+ self.started = True

+ time.sleep(2)

+ print '..Started'

+ def get_port(self):

+ return self.port

+ def is_started(self):

+ return self.started

+ def __del__(self):

+ if self.p is not None:

+ print 'Shutting down Kubernetes proxy..'

+ self.p.kill()

+class TestSettings:

+ def __init__(self, build_docker_image, test_poll_interval_secs,

+ test_duration_secs, kubernetes_proxy_port):

+ self.build_docker_image = build_docker_image

+ self.test_poll_interval_secs = test_poll_interval_secs

+ self.test_duration_secs = test_duration_secs

+ self.kubernetes_proxy_port = kubernetes_proxy_port

+class GkeSettings:

+ def __init__(self, project_id, docker_image_name):

+ self.project_id = project_id

+ self.docker_image_name = docker_image_name

+ self.tag_name = 'gcr.io/%s/%s' % (project_id, docker_image_name)

+class BigQuerySettings:

+ def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id):

+ self.run_id = run_id

+ self.dataset_id = dataset_id

+ self.summary_table_id = summary_table_id

+ self.qps_table_id = qps_table_id

+class StressServerSettings:

+ def __init__(self, server_pod_name, server_port):

+ self.server_pod_name = server_pod_name

+ self.server_port = server_port

+class StressClientSettings:

+ def __init__(self, num_clients, client_pod_name_prefix, server_pod_name,

+ server_port, metrics_port, metrics_collection_interval_secs,

+ stress_client_poll_interval_secs, num_channels_per_server,

+ num_stubs_per_channel, test_cases_str):

+ self.num_clients = num_clients

+ self.client_pod_name_prefix = client_pod_name_prefix

+ self.server_pod_name = server_pod_name

+ self.server_port = server_port

+ self.metrics_port = metrics_port

+ self.metrics_collection_interval_secs = metrics_collection_interval_secs

+ self.stress_client_poll_interval_secs = stress_client_poll_interval_secs

+ self.num_channels_per_server = num_channels_per_server

+ self.num_stubs_per_channel = num_stubs_per_channel

+ self.test_cases_str = test_cases_str

+ # == Derived properties ==

+ # Note: Client can accept a list of server addresses (a comma separated list

+ # of 'server_name:server_port'). In this case, we only have one server

+ # address to pass

+ self.server_addresses = '%s.default.svc.cluster.local:%d' % (

+ server_pod_name, server_port)

+ self.client_pod_names_list = ['%s-%d' % (client_pod_name_prefix, i)

+ for i in range(1, num_clients + 1)]

+def _build_docker_image(image_name, tag_name):

+ """ Build the docker image and add tag it to the GKE repository """

+ print 'Building docker image: %s' % image_name

+ os.environ['INTEROP_IMAGE'] = image_name

+ os.environ['INTEROP_IMAGE_REPOSITORY_TAG'] = tag_name

+ # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script

+ # build_interop_stress_image.sh invokes the following script:

+ # tools/dockerfile/$BASE_NAME/build_interop_stress.sh

+ os.environ['BASE_NAME'] = 'grpc_interop_stress_cxx'

+ cmd = ['tools/jenkins/build_interop_stress_image.sh']

+ retcode = subprocess.call(args=cmd)

+ if retcode != 0:

+ print 'Error in building docker image'

+ return False

+ return True

+def _push_docker_image_to_gke_registry(docker_tag_name):

+ """Executes 'gcloud docker push <docker_tag_name>' to push the image to GKE registry"""

+ cmd = ['gcloud', 'docker', 'push', docker_tag_name]

+ print 'Pushing %s to GKE registry..' % docker_tag_name

+ retcode = subprocess.call(args=cmd)

+ if retcode != 0:

+ print 'Error in pushing docker image %s to the GKE registry' % docker_tag_name

+ return False

+ return True

+def _launch_server(gke_settings, stress_server_settings, bq_settings,

+ kubernetes_proxy):

+ """ Launches a stress test server instance in GKE cluster """

+ if not kubernetes_proxy.is_started:

+ print 'Kubernetes proxy must be started before calling this function'

+ return False

+ # This is the wrapper script that is run in the container. This script runs

+ # the actual stress test server

+ server_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_server.py']

+ # run_server.py does not take any args from the command line. The args are

+ # instead passed via environment variables (see server_env below)

+ server_arg_list = []

+ # The parameters to the script run_server.py are injected into the container

+ # via environment variables

+ server_env = {

+ 'STRESS_TEST_IMAGE_TYPE': 'SERVER',

+ 'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server',

+ 'STRESS_TEST_ARGS_STR': '--port=%s' % stress_server_settings.server_port,

+ 'RUN_ID': bq_settings.run_id,

+ 'POD_NAME': stress_server_settings.server_pod_name,

+ 'GCP_PROJECT_ID': gke_settings.project_id,

+ 'DATASET_ID': bq_settings.dataset_id,

+ 'SUMMARY_TABLE_ID': bq_settings.summary_table_id,

+ 'QPS_TABLE_ID': bq_settings.qps_table_id

+ }

+ # Launch Server

+ is_success = kubernetes_api.create_pod_and_service(

+ 'localhost',

+ kubernetes_proxy.get_port(),

+ 'default', # Use 'default' namespace

+ stress_server_settings.server_pod_name,

+ gke_settings.tag_name,

+ [stress_server_settings.server_port], # Port that should be exposed

+ server_cmd_list,

+ server_arg_list,

+ server_env,

+ True # Headless = True for server. Since we want DNS records to be created by GKE

+ )

+ return is_success

+def _launch_client(gke_settings, stress_server_settings, stress_client_settings,

+ bq_settings, kubernetes_proxy):

+ """ Launches a configurable number of stress test clients on GKE cluster """

+ if not kubernetes_proxy.is_started:

+ print 'Kubernetes proxy must be started before calling this function'

+ return False

+ stress_client_arg_list = [

+ '--server_addresses=%s' % stress_client_settings.server_addresses,

+ '--test_cases=%s' % stress_client_settings.test_cases_str,

+ '--num_stubs_per_channel=%d' %

+ stress_client_settings.num_stubs_per_channel

+ ]

+ # This is the wrapper script that is run in the container. This script runs

+ # the actual stress client

+ client_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_client.py']

+ # run_client.py takes no args. All args are passed as env variables (see

+ # client_env)

+ client_arg_list = []

+ metrics_server_address = 'localhost:%d' % stress_client_settings.metrics_port

+ metrics_client_arg_list = [

+ '--metrics_server_address=%s' % metrics_server_address,

+ '--total_only=true'

+ ]

+ # The parameters to the script run_client.py are injected into the container

+ # via environment variables

+ client_env = {

+ 'STRESS_TEST_IMAGE_TYPE': 'CLIENT',

+ 'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test',

+ 'STRESS_TEST_ARGS_STR': ' '.join(stress_client_arg_list),

+ 'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client',

+ 'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list),

+ 'RUN_ID': bq_settings.run_id,

+ 'POLL_INTERVAL_SECS':

+ str(stress_client_settings.stress_client_poll_interval_secs),

+ 'GCP_PROJECT_ID': gke_settings.project_id,

+ 'DATASET_ID': bq_settings.dataset_id,

+ 'SUMMARY_TABLE_ID': bq_settings.summary_table_id,

+ 'QPS_TABLE_ID': bq_settings.qps_table_id

+ }

+ for pod_name in stress_client_settings.client_pod_names_list:

+ client_env['POD_NAME'] = pod_name

+ is_success = kubernetes_api.create_pod_and_service(

+ 'localhost', # Since proxy is running on localhost

+ kubernetes_proxy.get_port(),

+ 'default', # default namespace

+ pod_name,

+ gke_settings.tag_name,

+ [stress_client_settings.metrics_port

+ ], # Client pods expose metrics port

+ client_cmd_list,

+ client_arg_list,

+ client_env,

+ False # Client is not a headless service

+ )

+ if not is_success:

+ print 'Error in launching client %s' % pod_name

+ return False

+ return True

+def _launch_server_and_client(gke_settings, stress_server_settings,

+ stress_client_settings, bq_settings,

+ kubernetes_proxy_port):

+ # Start kubernetes proxy

+ print 'Kubernetes proxy'

+ kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)

+ kubernetes_proxy.start()

+ print 'Launching server..'

+ is_success = _launch_server(gke_settings, stress_server_settings, bq_settings,

+ kubernetes_proxy)

+ if not is_success:

+ print 'Error in launching server'

+ return False

+ # Server takes a while to start.

+ # TODO(sree) Use Kubernetes API to query the status of the server instead of

+ # sleeping

+ print 'Waiting for %s seconds for the server to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS

+ time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)

+ # Launch client

+ client_pod_name_prefix = 'stress-client'

+ is_success = _launch_client(gke_settings, stress_server_settings,

+ stress_client_settings, bq_settings,

+ kubernetes_proxy)

+ if not is_success:

+ print 'Error in launching client(s)'

+ return False

+ print 'Waiting for %s seconds for the client images to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS

+ time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)

+ return True

+def _delete_server_and_client(stress_server_settings, stress_client_settings,

+ kubernetes_proxy_port):

+ kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)

+ kubernetes_proxy.start()

+ # Delete clients first

+ is_success = True

+ for pod_name in stress_client_settings.client_pod_names_list:

+ is_success = kubernetes_api.delete_pod_and_service(

+ 'localhost', kubernetes_proxy_port, 'default', pod_name)

+ if not is_success:

+ return False

+ # Delete server

+ is_success = kubernetes_api.delete_pod_and_service(

+ 'localhost', kubernetes_proxy_port, 'default',

+ stress_server_settings.server_pod_name)

+ return is_success

+def run_test_main(test_settings, gke_settings, stress_server_settings,

+ stress_client_clients):

+ is_success = True

+ if test_settings.build_docker_image:

+ is_success = _build_docker_image(gke_settings.docker_image_name,

+ gke_settings.tag_name)

+ if not is_success:

+ return False

+ is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)

+ if not is_success:

+ return False

+ # Create a unique id for this run (Note: Using timestamp instead of UUID to

+ # make it easier to deduce the date/time of the run just by looking at the run

+ # run id. This is useful in debugging when looking at records in Biq query)

+ run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

+ dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)

+ # Big Query settings (common for both Stress Server and Client)

+ bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,

+ _QPS_TABLE_ID)

+ bq_helper = BigQueryHelper(run_id, '', '', args.project_id, dataset_id,

+ _SUMMARY_TABLE_ID, _QPS_TABLE_ID)

+ bq_helper.initialize()

+ try:

+ is_success = _launch_server_and_client(gke_settings, stress_server_settings,

+ stress_client_settings, bq_settings,

+ test_settings.kubernetes_proxy_port)

+ if not is_success:

+ return False

+ start_time = datetime.datetime.now()

+ end_time = start_time + datetime.timedelta(

+ seconds=test_settings.test_duration_secs)

+ print 'Running the test until %s' % end_time.isoformat()

+ while True:

+ if datetime.datetime.now() > end_time:

+ print 'Test was run for %d seconds' % test_settings.test_duration_secs

+ break

+ # Check if either stress server or clients have failed

+ if bq_helper.check_if_any_tests_failed():

+ is_success = False

+ print 'Some tests failed.'

+ break

+ # Things seem to be running fine. Wait until next poll time to check the

+ # status

+ print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs

+ time.sleep(test_settings.test_poll_interval_secs)

+ # Print BiqQuery tables

+ bq_helper.print_summary_records()

+ bq_helper.print_qps_records()

+ finally:

+ # If is_success is False at this point, it means that the stress tests were

+ # started successfully but failed while running the tests. In this case we

+ # do should not delete the pods (since they contain all the failure

+ # information)

+ if is_success:

+ _delete_server_and_client(stress_server_settings, stress_client_settings,

+ test_settings.kubernetes_proxy_port)

+ return is_success

+argp = argparse.ArgumentParser(

+ description='Launch stress tests in GKE',

+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)

+argp.add_argument('--project_id',

+ required=True,

+ help='The Google Cloud Platform Project Id')

+argp.add_argument('--num_clients',

+ default=1,

+ type=int,

+ help='Number of client instances to start')

+argp.add_argument('--docker_image_name',

+ default=_DEFAULT_DOCKER_IMAGE_NAME,

+ help='The name of the docker image containing stress client '

+ 'and stress servers')

+argp.add_argument('--build_docker_image',

+ dest='build_docker_image',

+ action='store_true',

+ help='Build a docker image and push to Google Container '

+ 'Registry')

+argp.add_argument('--do_not_build_docker_image',

+ dest='build_docker_image',

+ action='store_false',

+ help='Do not build and push docker image to Google Container '

+ 'Registry')

+argp.set_defaults(build_docker_image=True)

+argp.add_argument('--test_poll_interval_secs',

+ default=_DEFAULT_TEST_POLL_INTERVAL_SECS,

+ type=int,

+ help='How frequently should this script should monitor the '

+ 'health of stress clients and servers running in the GKE '

+ 'cluster')

+argp.add_argument('--test_duration_secs',

+ default=_DEFAULT_TEST_DURATION_SECS,

+ type=int,

+ help='How long should this test be run')

+argp.add_argument('--kubernetes_proxy_port',

+ default=_DEFAULT_KUBERNETES_PROXY_PORT,

+ type=int,

+ help='The port on which the kubernetes proxy (on localhost)'

+ ' is started')

+argp.add_argument('--stress_server_port',

+ default=_DEFAULT_STRESS_SERVER_PORT,

+ type=int,

+ help='The port on which the stress server (in GKE '

+ 'containers) listens')

+argp.add_argument('--stress_client_metrics_port',

+ default=_DEFAULT_METRICS_PORT,

+ type=int,

+ help='The port on which the stress clients (in GKE '

+ 'containers) expose metrics')

+argp.add_argument('--stress_client_poll_interval_secs',

+ default=_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS,

+ type=int,

+ help='How frequently should the stress client wrapper script'

+ ' running inside GKE should monitor health of the actual '

+ ' stress client process and upload the metrics to BigQuery')

+argp.add_argument('--stress_client_metrics_collection_interval_secs',

+ default=_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS,

+ type=int,

+ help='How frequently should metrics be collected in-memory on'

+ ' the stress clients (running inside GKE containers). Note '

+ 'that this is NOT the same as the upload-to-BigQuery '

+ 'frequency. The metrics upload frequency is controlled by the'

+ ' --stress_client_poll_interval_secs flag')

+argp.add_argument('--stress_client_num_channels_per_server',

+ default=_DEFAULT_NUM_CHANNELS_PER_SERVER,

+ type=int,

+ help='The number of channels created to each server from a '

+ 'stress client')

+argp.add_argument('--stress_client_num_stubs_per_channel',

+ default=_DEFAULT_NUM_STUBS_PER_CHANNEL,

+ type=int,

+ help='The number of stubs created per channel. This number '

+ 'indicates the max number of RPCs that can be made in '

+ 'parallel on each channel at any given time')

+argp.add_argument('--stress_client_test_cases',

+ default=_DEFAULT_TEST_CASES_STR,

+ help='List of test cases (with weights) to be executed by the'

+ ' stress test client. The list is in the following format:\n'

+ ' <testcase_1:w_1,<test_case2:w_2>..<testcase_n:w_n>\n'

+ ' (Note: The weights do not have to add up to 100)')

+if __name__ == '__main__':

+ args = argp.parse_args()

+ test_settings = TestSettings(

+ args.build_docker_image, args.test_poll_interval_secs,

+ args.test_duration_secs, args.kubernetes_proxy_port)

+ gke_settings = GkeSettings(args.project_id, args.docker_image_name)

+ stress_server_settings = StressServerSettings(_SERVER_POD_NAME,

+ args.stress_server_port)

+ stress_client_settings = StressClientSettings(

+ args.num_clients, _CLIENT_POD_NAME_PREFIX, _SERVER_POD_NAME,

+ args.stress_server_port, args.stress_client_metrics_port,

+ args.stress_client_metrics_collection_interval_secs,

+ args.stress_client_poll_interval_secs,

+ args.stress_client_num_channels_per_server,

+ args.stress_client_num_stubs_per_channel, args.stress_client_test_cases)

+ run_test_main(test_settings, gke_settings, stress_server_settings,

+ stress_client_settings)

« no previous file with comments | « third_party/grpc/tools/run_tests/sources_and_headers.json ('k') | third_party/grpc/tools/run_tests/task_runner.py » ('j') | no next file with comments »