Chromium Code Reviews

Unified Diff: client/bin/cpuset.py

Issue 4823005: Merge remote branch 'cros/upstream' into tempbranch (Closed) Base URL: http://git.chromium.org/git/autotest.git@master
Patch Set: patch Created 10 years, 1 month ago
Index: client/bin/cpuset.py
diff --git a/client/bin/cpuset.py b/client/bin/cpuset.py
new file mode 100644
index 0000000000000000000000000000000000000000..68fe50a37e92ec0441e3f46fe891d607591f1008
--- /dev/null
+++ b/client/bin/cpuset.py
@@ -0,0 +1,544 @@
+# Copyright 2007-2010 Google Inc. Released under the GPL v2
+__author__ = "duanes (Duane Sand), pdahl (Peter Dahl)"
+
+# A basic cpuset/cgroup container manager for limiting memory use during tests
+# for use on kernels not running some site-specific container manager
+
+import os, sys, re, glob, fcntl, logging
+from autotest_lib.client.bin import utils
+from autotest_lib.client.common_lib import error
+
+SUPER_ROOT = '' # root of all containers or cgroups
+NO_LIMIT = (1 << 63) - 1 # containername/memory.limit_in_bytes if no limit
+
+# propio service classes:
+PROPIO_PRIO = 1
+PROPIO_NORMAL = 2
+PROPIO_IDLE = 3
+
+super_root_path = '' # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18
+cpuset_prefix = None # usually 'cpuset.'; '' on 2.6.18
+fake_numa_containers = False # container mem via numa=fake mem nodes, else pages
+mem_isolation_on = False
+node_mbytes = 0 # mbytes in one typical mem node
+root_container_bytes = 0 # squishy limit on effective size of root container
+
+
+def discover_container_style():
+ global super_root_path, cpuset_prefix
+ global mem_isolation_on, fake_numa_containers
+ global node_mbytes, root_container_bytes
+ if super_root_path != '':
+ return # already looked up
+ if os.path.exists('/dev/cgroup/tasks'):
+ # running on 2.6.26 or later kernel with containers on:
+ super_root_path = '/dev/cgroup'
+ cpuset_prefix = 'cpuset.'
+ if get_boot_numa():
+ mem_isolation_on = fake_numa_containers = True
+ else: # memcg containers IFF compiled-in & mounted & non-fakenuma boot
+ fake_numa_containers = False
+ mem_isolation_on = os.path.exists(
+ '/dev/cgroup/memory.limit_in_bytes')
+            # TODO: handle the case where memcg is mounted as its own
+            # cgroup hierarchy, separate from cpusets.
+ elif os.path.exists('/dev/cpuset/tasks'):
+ # running on 2.6.18 kernel with containers on:
+ super_root_path = '/dev/cpuset'
+ cpuset_prefix = ''
+ mem_isolation_on = fake_numa_containers = get_boot_numa() != ''
+ else:
+ # neither cpuset nor cgroup filesystem active:
+ super_root_path = None
+ cpuset_prefix = 'no_cpusets_or_cgroups_exist'
+ mem_isolation_on = fake_numa_containers = False
+
+ logging.debug('mem_isolation: %s', mem_isolation_on)
+ logging.debug('fake_numa_containers: %s', fake_numa_containers)
+ if fake_numa_containers:
+ node_mbytes = int(mbytes_per_mem_node())
+ elif mem_isolation_on: # memcg-style containers
+        # For now, limit the total of all containers to 98% of the system's
+        # visible total ram, to avoid OOM events at the system level and to
+        # avoid page-reclaim overhead above the kswapd high-water mark.
+ system_visible_pages = utils.memtotal() >> 2
+ usable_pages = int(system_visible_pages * 0.98)
+ root_container_bytes = usable_pages << 12
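+        # Illustrative arithmetic (assuming utils.memtotal() reports KB):
+        # on an 8 GB machine, memtotal() is ~8388608 KB, so >> 2 yields
+        # ~2097152 4KB pages; 98% of those pages, shifted << 12, gives
+        # ~7.84 GB usable by the root container.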
+ logging.debug('root_container_bytes: %s',
+ utils.human_format(root_container_bytes))
+
+
+def need_mem_containers():
+ discover_container_style()
+ if not mem_isolation_on:
+ raise error.AutotestError('Mem-isolation containers not enabled '
+ 'by latest reboot')
+
+def need_fake_numa():
+ discover_container_style()
+ if not fake_numa_containers:
+        raise error.AutotestError('numa=fake not enabled by latest reboot')
+
+
+def full_path(container_name):
+ discover_container_style()
+ return os.path.join(super_root_path, container_name)
+
+
+def unpath(container_path):
+ return container_path[len(super_root_path)+1:]
+
+
+def cpuset_attr(container_name, attr):
+ discover_container_style()
+ return os.path.join(super_root_path, container_name, cpuset_prefix+attr)
+
+
+def io_attr(container_name, attr):
+ discover_container_style()
+ # current version assumes shared cgroup hierarchy
+ return os.path.join(super_root_path, container_name, 'io.'+attr)
+
+
+def tasks_path(container_name):
+ return os.path.join(full_path(container_name), 'tasks')
+
+
+def mems_path(container_name):
+ return cpuset_attr(container_name, 'mems')
+
+
+def memory_path(container_name):
+ return os.path.join(super_root_path, container_name, 'memory')
+
+
+def cpus_path(container_name):
+ return cpuset_attr(container_name, 'cpus')
+
+
+def container_exists(name):
+ return name is not None and os.path.exists(tasks_path(name))
+
+
+def move_tasks_into_container(name, tasks):
+ task_file = tasks_path(name)
+ for task in tasks:
+ try:
+ logging.debug('moving task %s into container "%s"', task, name)
+ utils.write_one_line(task_file, task)
+ except Exception:
+ if utils.pid_is_alive(task):
+ raise # task exists but couldn't move it
+ # task is gone or zombie so ignore this exception
+
+
+def move_self_into_container(name):
+ me = str(os.getpid())
+ move_tasks_into_container(name, [me])
+ logging.debug('running self (pid %s) in container "%s"', me, name)
+
+
+def _avail_mbytes_via_nodes(parent):
+ # total mbytes of mem nodes available for new containers in parent
+ free_nodes = available_exclusive_mem_nodes(parent)
+ mbytes = nodes_avail_mbytes(free_nodes)
+    # We don't have an exact model of how the container manager measures
+    # memory space, so it is better to underestimate than overestimate here.
+ mbytes = max(mbytes - node_mbytes//2, 0)
+ return mbytes
+
+
+def _avail_bytes_via_pages(parent):
+ # Get memory bytes available to parent container which could
+ # be allocated exclusively to new child containers.
+ # This excludes mem previously allocated to existing children.
+ available = container_bytes(parent)
+ mem_files_pattern = os.path.join(full_path(parent),
+ '*', 'memory.limit_in_bytes')
+ for mem_file in glob.glob(mem_files_pattern):
+ child_container = unpath(os.path.dirname(mem_file))
+ available -= container_bytes(child_container)
+ return available
+
+
+def avail_mbytes(parent=SUPER_ROOT):
+ # total mbytes available in parent, for exclusive use in new containers
+ if fake_numa_containers:
+ return _avail_mbytes_via_nodes(parent)
+ else:
+ return _avail_bytes_via_pages(parent) >> 20
+
+
+def delete_leftover_test_containers():
+ # recover mems and cores tied up by containers of prior failed tests:
+ for child in inner_containers_of(SUPER_ROOT):
+ _release_container_nest(child)
+
+
+def my_lock(lockname):
+ # lockname is 'inner'
+ lockdir = os.environ['AUTODIR']
+ lockname = os.path.join(lockdir, '.cpuset.lock.'+lockname)
+ lockfile = open(lockname, 'w')
+ fcntl.flock(lockfile, fcntl.LOCK_EX)
+ return lockfile
+
+
+def my_unlock(lockfile):
+ fcntl.flock(lockfile, fcntl.LOCK_UN)
+ lockfile.close()
+
+
+# Convert '1-3,7,9-12' to the set {1, 2, 3, 7, 9, 10, 11, 12}
+def rangelist_to_set(rangelist):
+ result = set()
+ if not rangelist:
+ return result
+ for x in rangelist.split(','):
+ if re.match(r'^(\d+)$', x):
+ result.add(int(x))
+ continue
+ m = re.match(r'^(\d+)-(\d+)$', x)
+ if m:
+ start = int(m.group(1))
+ end = int(m.group(2))
+ result.update(set(range(start, end+1)))
+ continue
+ msg = 'Cannot understand data input: %s %s' % (x, rangelist)
+ raise ValueError(msg)
+ return result
+
+
+def my_container_name():
+ # Get current process's inherited or self-built container name
+ # within /dev/cpuset or /dev/cgroup. Is '' for root container.
+ name = utils.read_one_line('/proc/%i/cpuset' % os.getpid())
+ return name[1:] # strip leading /
+
+
+def get_mem_nodes(container_name):
+ # all mem nodes now available to a container, both exclusive & shared
+ file_name = mems_path(container_name)
+ if os.path.exists(file_name):
+ return rangelist_to_set(utils.read_one_line(file_name))
+ else:
+ return set()
+
+
+def _busy_mem_nodes(parent_container):
+ # Get set of numa memory nodes now used (exclusively or shared)
+ # by existing children of parent container
+ busy = set()
+ mem_files_pattern = os.path.join(full_path(parent_container),
+ '*', cpuset_prefix+'mems')
+ for mem_file in glob.glob(mem_files_pattern):
+ child_container = os.path.dirname(mem_file)
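+        # child_container here is a full path rather than an unpath()'d name;
+        # get_mem_nodes() still resolves it because os.path.join() ignores
+        # earlier components when handed an absolute path.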
+ busy |= get_mem_nodes(child_container)
+ return busy
+
+
+def available_exclusive_mem_nodes(parent_container):
+ # Get subset of numa memory nodes of parent container which could
+ # be allocated exclusively to new child containers.
+ # This excludes nodes now allocated to existing children.
+ need_fake_numa()
+ available = get_mem_nodes(parent_container)
+ available -= _busy_mem_nodes(parent_container)
+ return available
+
+
+def my_mem_nodes():
+ # Get set of numa memory nodes owned by current process's container.
+ discover_container_style()
+ if not mem_isolation_on:
+ return set() # as expected by vmstress
+ return get_mem_nodes(my_container_name())
+
+
+def my_available_exclusive_mem_nodes():
+ # Get subset of numa memory nodes owned by current process's
+ # container, which could be allocated exclusively to new child
+ # containers. This excludes any nodes now allocated
+ # to existing children.
+ return available_exclusive_mem_nodes(my_container_name())
+
+
+def node_avail_kbytes(node):
+ return node_mbytes << 10 # crude; fixed numa node size
+
+
+def nodes_avail_mbytes(nodes):
+ # nodes' combined user+avail size, in Mbytes
+ return sum(node_avail_kbytes(n) for n in nodes) // 1024
+
+
+def container_bytes(name):
+ if fake_numa_containers:
+ return nodes_avail_mbytes(get_mem_nodes(name)) << 20
+ else:
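+        # Illustrative: a memcg child at 'outer/inner' whose own limit file
+        # still holds NO_LIMIT falls back to 'outer', then to SUPER_ROOT,
+        # where root_container_bytes is returned if no explicit limit exists.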
+ while True:
+            limit_file = memory_path(name) + '.limit_in_bytes'
+            limit = int(utils.read_one_line(limit_file))
+ if limit < NO_LIMIT:
+ return limit
+ if name == SUPER_ROOT:
+ return root_container_bytes
+ name = os.path.dirname(name)
+
+
+def container_mbytes(name):
+ return container_bytes(name) >> 20
+
+
+def mbytes_per_mem_node():
+ # Get mbyte size of standard numa mem node, as float
+ # (some nodes are bigger than this)
+ # Replaces utils.node_size().
+ numa = get_boot_numa()
+ if numa.endswith('M'):
+ return float(numa[:-1]) # mbyte size of fake nodes
+ elif numa:
+ nodecnt = int(numa) # fake numa mem nodes for container isolation
+ else:
+ nodecnt = len(utils.numa_nodes()) # phys mem-controller nodes
+ # Use guessed total physical mem size, not kernel's
+ # lesser 'available memory' after various system tables.
+ return utils.rounded_memtotal() / (nodecnt * 1024.0)
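+# Illustrative example (hypothetical numbers, assuming rounded_memtotal()
+# reports KB): booting an 8 GB machine with numa=fake=32 gives
+# 8388608 / (32 * 1024.0) = 256.0 mbytes per fake node, while numa=fake=128M
+# returns 128.0 directly.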
+
+
+def get_cpus(container_name):
+ file_name = cpus_path(container_name)
+ if os.path.exists(file_name):
+ return rangelist_to_set(utils.read_one_line(file_name))
+ else:
+ return set()
+
+
+def get_tasks(container_name):
+ file_name = tasks_path(container_name)
+ try:
+ tasks = [x.rstrip() for x in open(file_name).readlines()]
+ except IOError:
+ if os.path.exists(file_name):
+ raise
+ tasks = [] # container doesn't exist anymore
+ return tasks
+
+
+def inner_containers_of(parent):
+ pattern = os.path.join(full_path(parent), '*/tasks')
+ return [unpath(os.path.dirname(task_file))
+ for task_file in glob.glob(pattern)]
+
+
+def _release_container_nest(nest):
+ # Destroy a container, and any nested sub-containers
+ nest_path = full_path(nest)
+ if os.path.exists(nest_path):
+
+ # bottom-up walk of tree, releasing all nested sub-containers
+ for child in inner_containers_of(nest):
+ _release_container_nest(child)
+
+ logging.debug("releasing container %s", nest)
+
+ # Transfer any survivor tasks (e.g. self) to parent container
+ parent = os.path.dirname(nest)
+ move_tasks_into_container(parent, get_tasks(nest))
+
+ # remove the now-empty outermost container of this nest
+ if os.path.exists(nest_path):
+ os.rmdir(nest_path) # nested, or dead manager
+
+
+def release_container(container_name=None):
+ # Destroy a container
+ my_container = my_container_name()
+ if container_name is None:
+ container_name = my_container
+ _release_container_nest(container_name)
+ displaced = my_container_name()
+ if displaced != my_container:
+ logging.debug('now running self (pid %d) in container "%s"',
+ os.getpid(), displaced)
+
+
+def remove_empty_prio_classes(prios):
+ # remove prio classes whose set of allowed priorities is empty
+    # e.g. 'no:3;rt:;be:3;id:' --> 'no:3;be:3'
+ return ';'.join(p for p in prios.split(';') if p.split(':')[1])
+
+
+def all_drive_names():
+ # list of all disk drives sda,sdb,...
+ paths = glob.glob('/sys/block/sd*')
+ if not paths:
+ paths = glob.glob('/sys/block/hd*')
+ return [os.path.basename(path) for path in paths]
+
+
+def set_io_controls(container_name, disks=[], ioprio_classes=[PROPIO_NORMAL],
+ io_shares=[95], io_limits=[0]):
+ # set the propio controls for one container, for selected disks
+ # writing directly to /dev/cgroup/container_name/io.io_service_level
+ # without using containerd or container.py
+ # See wiki ProportionalIOScheduler for definitions
+ # ioprio_classes: list of service classes, one per disk
+ # using numeric propio service classes as used by kernel API, namely
+ # 1: RT, Real Time, aka PROPIO_PRIO
+ # 2: BE, Best Effort, aka PROPIO_NORMAL
+ # 3: PROPIO_IDLE
+ # io_shares: list of disk-time-fractions, one per disk,
+ # as percentage integer 0..100
+ # io_limits: list of limit on/off, one per disk
+ # 0: no limit, shares use of other containers' unused disk time
+ # 1: limited, container's use of disk time is capped to given DTF
+ # ioprio_classes defaults to best-effort
+ # io_limit defaults to no limit, use slack time
+ if not disks: # defaults to all drives
+ disks = all_drive_names()
+        io_shares = [io_shares[0]] * len(disks)
+        ioprio_classes = [ioprio_classes[0]] * len(disks)
+        io_limits = [io_limits[0]] * len(disks)
+ if not (len(disks) == len(ioprio_classes) and len(disks) == len(io_shares)
+ and len(disks) == len(io_limits)):
+ raise error.AutotestError('Unequal number of values for io controls')
+ service_level = io_attr(container_name, 'io_service_level')
+ if not os.path.exists(service_level):
+ return # kernel predates propio features
+ # or io cgroup is mounted separately from cpusets
+ disk_infos = []
+ for disk,ioclass,limit,share in zip(disks, ioprio_classes,
+ io_limits, io_shares):
+ parts = (disk, str(ioclass), str(limit), str(share))
+ disk_info = ' '.join(parts)
+ utils.write_one_line(service_level, disk_info)
+ disk_infos.append(disk_info)
+ logging.debug('set_io_controls of %s to %s',
+ container_name, ', '.join(disk_infos))
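+# Illustrative call (hypothetical disk and values), assuming a propio-enabled
+# kernel with the cgroup hierarchy mounted as described above:
+#   set_io_controls('mytest', disks=['sda'], ioprio_classes=[PROPIO_NORMAL],
+#                   io_shares=[50], io_limits=[0])
+# writes the line "sda 2 0 50" to /dev/cgroup/mytest/io.io_service_level.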
+
+
+def abbrev_list(vals):
+ """Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'."""
+ ranges = []
+ lower = 0
+ upper = -2
+ for val in sorted(vals)+[-1]:
+ if val != upper+1:
+ if lower == upper:
+ ranges.append(str(lower))
+ elif lower <= upper:
+ ranges.append('%d-%d' % (lower, upper))
+ lower = val
+ upper = val
+ return ','.join(ranges)
+
+
+def create_container_with_specific_mems_cpus(name, mems, cpus):
+ need_fake_numa()
+ os.mkdir(full_path(name))
+ utils.write_one_line(cpuset_attr(name, 'mem_hardwall'), '1')
+ utils.write_one_line(mems_path(name), ','.join(map(str, mems)))
+ utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
+ logging.debug('container %s has %d cpus and %d nodes totalling %s bytes',
+ name, len(cpus), len(get_mem_nodes(name)),
+ utils.human_format(container_bytes(name)) )
+
+
+def create_container_via_memcg(name, parent, bytes, cpus):
+ # create container via direct memcg cgroup writes
+ os.mkdir(full_path(name))
+ nodes = utils.read_one_line(mems_path(parent))
+ utils.write_one_line(mems_path(name), nodes) # inherit parent's nodes
+ utils.write_one_line(memory_path(name)+'.limit_in_bytes', str(bytes))
+ utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
+ logging.debug('Created container %s directly via memcg,'
+ ' has %d cpus and %s bytes',
+ name, len(cpus), utils.human_format(container_bytes(name)))
+
+
+def _create_fake_numa_container_directly(name, parent, mbytes, cpus):
+ need_fake_numa()
+ lockfile = my_lock('inner') # serialize race between parallel tests
+ try:
+ # Pick specific mem nodes for new cpuset's exclusive use
+ # For now, arbitrarily pick highest available node numbers
+ needed_kbytes = mbytes * 1024
+ nodes = sorted(list(available_exclusive_mem_nodes(parent)))
+ kbytes = 0
+ nodecnt = 0
+ while kbytes < needed_kbytes and nodecnt < len(nodes):
+ nodecnt += 1
+ kbytes += node_avail_kbytes(nodes[-nodecnt])
+ if kbytes < needed_kbytes:
+ parent_mbytes = container_mbytes(parent)
+ if mbytes > parent_mbytes:
+ raise error.AutotestError(
+ "New container's %d Mbytes exceeds "
+ "parent container's %d Mbyte size"
+ % (mbytes, parent_mbytes) )
+ else:
+ raise error.AutotestError(
+ "Existing sibling containers hold "
+ "%d Mbytes needed by new container"
+ % ((needed_kbytes - kbytes)//1024) )
+ mems = nodes[-nodecnt:]
+
+ create_container_with_specific_mems_cpus(name, mems, cpus)
+ finally:
+ my_unlock(lockfile)
+
+
+def create_container_directly(name, mbytes, cpus):
+ parent = os.path.dirname(name)
+ if fake_numa_containers:
+ _create_fake_numa_container_directly(name, parent, mbytes, cpus)
+ else:
+ create_container_via_memcg(name, parent, mbytes<<20, cpus)
+
+
+def create_container_with_mbytes_and_specific_cpus(name, mbytes,
+ cpus=None, root=SUPER_ROOT, io={}, move_in=True, timeout=0):
+ """\
+ Create a cpuset container and move job's current pid into it
+ Allocate the list "cpus" of cpus to that container
+
+ name = arbitrary string tag
+    mbytes = requested memory for job in megabytes
+    cpus = list of cpu indices to associate with the cpuset
+        defaults to all cpus available within the given root
+ root = the parent cpuset to nest this new set within
+ '': unnested top-level container
+ io = arguments for proportional IO containers
+ move_in = True: Move current process into the new container now.
+ timeout = must be 0: persist until explicitly deleted.
+ """
+ need_mem_containers()
+ if not container_exists(root):
+ raise error.AutotestError('Parent container "%s" does not exist'
+ % root)
+ if cpus is None:
+ # default to biggest container we can make under root
+ cpus = get_cpus(root)
+ else:
+ cpus = set(cpus) # interface uses list
+ if not cpus:
+ raise error.AutotestError('Creating container with no cpus')
+ name = os.path.join(root, name) # path relative to super_root
+ if os.path.exists(full_path(name)):
+ raise error.AutotestError('Container %s already exists' % name)
+ create_container_directly(name, mbytes, cpus)
+ set_io_controls(name, **io)
+ if move_in:
+ move_self_into_container(name)
+ return name
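+# Typical usage (illustrative; the container name, size, and cpu list are
+# arbitrary):
+#   name = create_container_with_mbytes_and_specific_cpus('vmtest', 512,
+#                                                          cpus=[0, 1])
+#   ...run the memory-limited workload...
+#   release_container(name)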
+
+
+def get_boot_numa():
+ # get boot-time numa=fake=xyz option for current boot
+ # eg numa=fake=nnn, numa=fake=nnnM, or nothing
+ label = 'numa=fake='
+ for arg in utils.read_one_line('/proc/cmdline').split():
+ if arg.startswith(label):
+ return arg[len(label):]
+ return ''
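+# Example (illustrative): a kernel booted with "numa=fake=64" on its command
+# line makes get_boot_numa() return '64'; a boot without the option returns ''.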