Index: client/bin/cpuset.py |
diff --git a/client/bin/cpuset.py b/client/bin/cpuset.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..68fe50a37e92ec0441e3f46fe891d607591f1008 |
--- /dev/null |
+++ b/client/bin/cpuset.py |
@@ -0,0 +1,544 @@ |
+# Copyright 2007-2010 Google Inc. Released under the GPL v2 |
+__author__ = "duanes (Duane Sand), pdahl (Peter Dahl)" |
+ |
+# A basic cpuset/cgroup container manager for limiting memory use during tests |
+# for use on kernels that are not running a site-specific container manager |
+ |
+import os, sys, re, glob, fcntl, logging |
+from autotest_lib.client.bin import utils |
+from autotest_lib.client.common_lib import error |
+ |
+SUPER_ROOT = '' # root of all containers or cgroups |
+NO_LIMIT = (1 << 63) - 1      # a container's memory.limit_in_bytes when unlimited |
+ |
+# propio service classes: |
+PROPIO_PRIO = 1 |
+PROPIO_NORMAL = 2 |
+PROPIO_IDLE = 3 |
+ |
+super_root_path = '' # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18 |
+cpuset_prefix = None # usually 'cpuset.'; '' on 2.6.18 |
+fake_numa_containers = False # container mem via numa=fake mem nodes, else pages |
+mem_isolation_on = False |
+node_mbytes = 0 # mbytes in one typical mem node |
+root_container_bytes = 0 # squishy limit on effective size of root container |
+ |
+ |
+def discover_container_style(): |
+ global super_root_path, cpuset_prefix |
+ global mem_isolation_on, fake_numa_containers |
+ global node_mbytes, root_container_bytes |
+ if super_root_path != '': |
+ return # already looked up |
+ if os.path.exists('/dev/cgroup/tasks'): |
+ # running on 2.6.26 or later kernel with containers on: |
+ super_root_path = '/dev/cgroup' |
+ cpuset_prefix = 'cpuset.' |
+ if get_boot_numa(): |
+ mem_isolation_on = fake_numa_containers = True |
+ else: # memcg containers IFF compiled-in & mounted & non-fakenuma boot |
+ fake_numa_containers = False |
+ mem_isolation_on = os.path.exists( |
+ '/dev/cgroup/memory.limit_in_bytes') |
+            # TODO: handle the case where memcg is mounted as its own |
+            # cgroup hierarchy, separate from cpuset |
+ elif os.path.exists('/dev/cpuset/tasks'): |
+ # running on 2.6.18 kernel with containers on: |
+ super_root_path = '/dev/cpuset' |
+ cpuset_prefix = '' |
+ mem_isolation_on = fake_numa_containers = get_boot_numa() != '' |
+ else: |
+ # neither cpuset nor cgroup filesystem active: |
+ super_root_path = None |
+ cpuset_prefix = 'no_cpusets_or_cgroups_exist' |
+ mem_isolation_on = fake_numa_containers = False |
+ |
+ logging.debug('mem_isolation: %s', mem_isolation_on) |
+ logging.debug('fake_numa_containers: %s', fake_numa_containers) |
+ if fake_numa_containers: |
+ node_mbytes = int(mbytes_per_mem_node()) |
+ elif mem_isolation_on: # memcg-style containers |
+ # For now, limit total of all containers to using just 98% of system's |
+ # visible total ram, to avoid oom events at system level, and avoid |
+ # page reclaim overhead from going above kswapd highwater mark. |
+ system_visible_pages = utils.memtotal() >> 2 |
+ usable_pages = int(system_visible_pages * 0.98) |
+ root_container_bytes = usable_pages << 12 |
+ logging.debug('root_container_bytes: %s', |
+ utils.human_format(root_container_bytes)) |
+ |
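+# A worked example of the memcg sizing above (illustrative; assumes a |
+# hypothetical machine with 32 GiB of visible ram; utils.memtotal() is kbytes): |
+#   33554432 kbytes >> 2   -> 8388608 4-kbyte pages |
+#   8388608 * 0.98         -> 8220835 usable pages |
+#   8220835 << 12          -> ~31.4 GiB for root_container_bytes |
+ |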
+ |
+def need_mem_containers(): |
+ discover_container_style() |
+ if not mem_isolation_on: |
+ raise error.AutotestError('Mem-isolation containers not enabled ' |
+ 'by latest reboot') |
+ |
+def need_fake_numa(): |
+ discover_container_style() |
+ if not fake_numa_containers: |
+        raise error.AutotestError('numa=fake not enabled by latest reboot') |
+ |
+ |
+def full_path(container_name): |
+ discover_container_style() |
+ return os.path.join(super_root_path, container_name) |
+ |
+ |
+def unpath(container_path): |
+ return container_path[len(super_root_path)+1:] |
+ |
+ |
+def cpuset_attr(container_name, attr): |
+ discover_container_style() |
+ return os.path.join(super_root_path, container_name, cpuset_prefix+attr) |
+ |
+ |
+def io_attr(container_name, attr): |
+ discover_container_style() |
+ # current version assumes shared cgroup hierarchy |
+ return os.path.join(super_root_path, container_name, 'io.'+attr) |
+ |
+ |
+def tasks_path(container_name): |
+ return os.path.join(full_path(container_name), 'tasks') |
+ |
+ |
+def mems_path(container_name): |
+ return cpuset_attr(container_name, 'mems') |
+ |
+ |
+def memory_path(container_name): |
+ return os.path.join(super_root_path, container_name, 'memory') |
+ |
+ |
+def cpus_path(container_name): |
+ return cpuset_attr(container_name, 'cpus') |
+ |
+ |
+def container_exists(name): |
+ return name is not None and os.path.exists(tasks_path(name)) |
+ |
+ |
+def move_tasks_into_container(name, tasks): |
+ task_file = tasks_path(name) |
+ for task in tasks: |
+ try: |
+ logging.debug('moving task %s into container "%s"', task, name) |
+ utils.write_one_line(task_file, task) |
+ except Exception: |
+ if utils.pid_is_alive(task): |
+ raise # task exists but couldn't move it |
+ # task is gone or zombie so ignore this exception |
+ |
+ |
+def move_self_into_container(name): |
+ me = str(os.getpid()) |
+ move_tasks_into_container(name, [me]) |
+ logging.debug('running self (pid %s) in container "%s"', me, name) |
+ |
+ |
+def _avail_mbytes_via_nodes(parent): |
+ # total mbytes of mem nodes available for new containers in parent |
+ free_nodes = available_exclusive_mem_nodes(parent) |
+ mbytes = nodes_avail_mbytes(free_nodes) |
+    # We have no exact model of how the container manager measures mem space, |
+    # so it is better to underestimate than overestimate here |
+ mbytes = max(mbytes - node_mbytes//2, 0) |
+ return mbytes |
+ |
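+# Example (hypothetical): with three free 512-mbyte fake-numa nodes, |
+# _avail_mbytes_via_nodes() reports 3*512 - 512//2 = 1280 mbytes, deliberately |
+# leaving half a node's worth of slack. |
+ |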
+ |
+def _avail_bytes_via_pages(parent): |
+ # Get memory bytes available to parent container which could |
+ # be allocated exclusively to new child containers. |
+ # This excludes mem previously allocated to existing children. |
+ available = container_bytes(parent) |
+ mem_files_pattern = os.path.join(full_path(parent), |
+ '*', 'memory.limit_in_bytes') |
+ for mem_file in glob.glob(mem_files_pattern): |
+ child_container = unpath(os.path.dirname(mem_file)) |
+ available -= container_bytes(child_container) |
+ return available |
+ |
+ |
+def avail_mbytes(parent=SUPER_ROOT): |
+ # total mbytes available in parent, for exclusive use in new containers |
+ if fake_numa_containers: |
+ return _avail_mbytes_via_nodes(parent) |
+ else: |
+ return _avail_bytes_via_pages(parent) >> 20 |
+ |
+ |
+def delete_leftover_test_containers(): |
+ # recover mems and cores tied up by containers of prior failed tests: |
+ for child in inner_containers_of(SUPER_ROOT): |
+ _release_container_nest(child) |
+ |
+ |
+def my_lock(lockname): |
+ # lockname is 'inner' |
+ lockdir = os.environ['AUTODIR'] |
+ lockname = os.path.join(lockdir, '.cpuset.lock.'+lockname) |
+ lockfile = open(lockname, 'w') |
+ fcntl.flock(lockfile, fcntl.LOCK_EX) |
+ return lockfile |
+ |
+ |
+def my_unlock(lockfile): |
+ fcntl.flock(lockfile, fcntl.LOCK_UN) |
+ lockfile.close() |
+ |
+ |
+# Convert '1-3,7,9-12' to set([1,2,3,7,9,10,11,12]) |
+def rangelist_to_set(rangelist): |
+ result = set() |
+ if not rangelist: |
+ return result |
+ for x in rangelist.split(','): |
+ if re.match(r'^(\d+)$', x): |
+ result.add(int(x)) |
+ continue |
+ m = re.match(r'^(\d+)-(\d+)$', x) |
+ if m: |
+ start = int(m.group(1)) |
+ end = int(m.group(2)) |
+ result.update(set(range(start, end+1))) |
+ continue |
+ msg = 'Cannot understand data input: %s %s' % (x, rangelist) |
+ raise ValueError(msg) |
+ return result |
+ |
+ |
+def my_container_name(): |
+ # Get current process's inherited or self-built container name |
+ # within /dev/cpuset or /dev/cgroup. Is '' for root container. |
+ name = utils.read_one_line('/proc/%i/cpuset' % os.getpid()) |
+ return name[1:] # strip leading / |
+ |
+ |
+def get_mem_nodes(container_name): |
+ # all mem nodes now available to a container, both exclusive & shared |
+ file_name = mems_path(container_name) |
+ if os.path.exists(file_name): |
+ return rangelist_to_set(utils.read_one_line(file_name)) |
+ else: |
+ return set() |
+ |
+ |
+def _busy_mem_nodes(parent_container): |
+ # Get set of numa memory nodes now used (exclusively or shared) |
+ # by existing children of parent container |
+ busy = set() |
+ mem_files_pattern = os.path.join(full_path(parent_container), |
+ '*', cpuset_prefix+'mems') |
+ for mem_file in glob.glob(mem_files_pattern): |
+        child_container = unpath(os.path.dirname(mem_file)) |
+ busy |= get_mem_nodes(child_container) |
+ return busy |
+ |
+ |
+def available_exclusive_mem_nodes(parent_container): |
+ # Get subset of numa memory nodes of parent container which could |
+ # be allocated exclusively to new child containers. |
+ # This excludes nodes now allocated to existing children. |
+ need_fake_numa() |
+ available = get_mem_nodes(parent_container) |
+ available -= _busy_mem_nodes(parent_container) |
+ return available |
+ |
+ |
+def my_mem_nodes(): |
+ # Get set of numa memory nodes owned by current process's container. |
+ discover_container_style() |
+ if not mem_isolation_on: |
+ return set() # as expected by vmstress |
+ return get_mem_nodes(my_container_name()) |
+ |
+ |
+def my_available_exclusive_mem_nodes(): |
+ # Get subset of numa memory nodes owned by current process's |
+ # container, which could be allocated exclusively to new child |
+ # containers. This excludes any nodes now allocated |
+ # to existing children. |
+ return available_exclusive_mem_nodes(my_container_name()) |
+ |
+ |
+def node_avail_kbytes(node): |
+ return node_mbytes << 10 # crude; fixed numa node size |
+ |
+ |
+def nodes_avail_mbytes(nodes): |
+    # combined size (user + avail) of the given mem nodes, in mbytes |
+ return sum(node_avail_kbytes(n) for n in nodes) // 1024 |
+ |
+ |
+def container_bytes(name): |
+ if fake_numa_containers: |
+ return nodes_avail_mbytes(get_mem_nodes(name)) << 20 |
+ else: |
+ while True: |
+            limit_file = memory_path(name) + '.limit_in_bytes' |
+            limit = int(utils.read_one_line(limit_file)) |
+ if limit < NO_LIMIT: |
+ return limit |
+ if name == SUPER_ROOT: |
+ return root_container_bytes |
+ name = os.path.dirname(name) |
+ |
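+# Under memcg, container_bytes() walks up the hierarchy: for a hypothetical |
+# container 'g1/sub' whose own memory.limit_in_bytes is NO_LIMIT, the |
+# effective size is g1's limit, or root_container_bytes once the root is hit. |
+ |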
+ |
+def container_mbytes(name): |
+ return container_bytes(name) >> 20 |
+ |
+ |
+def mbytes_per_mem_node(): |
+ # Get mbyte size of standard numa mem node, as float |
+ # (some nodes are bigger than this) |
+ # Replaces utils.node_size(). |
+ numa = get_boot_numa() |
+ if numa.endswith('M'): |
+ return float(numa[:-1]) # mbyte size of fake nodes |
+ elif numa: |
+ nodecnt = int(numa) # fake numa mem nodes for container isolation |
+ else: |
+ nodecnt = len(utils.numa_nodes()) # phys mem-controller nodes |
+ # Use guessed total physical mem size, not kernel's |
+ # lesser 'available memory' after various system tables. |
+ return utils.rounded_memtotal() / (nodecnt * 1024.0) |
+ |
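+# Worked examples (hypothetical boots): numa=fake=128M returns 128.0 directly; |
+# numa=fake=32 on a machine with rounded_memtotal() of 16777216 kbytes gives |
+# 16777216 / (32 * 1024.0) = 512.0 mbytes per fake node. |
+ |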
+ |
+def get_cpus(container_name): |
+ file_name = cpus_path(container_name) |
+ if os.path.exists(file_name): |
+ return rangelist_to_set(utils.read_one_line(file_name)) |
+ else: |
+ return set() |
+ |
+ |
+def get_tasks(container_name): |
+ file_name = tasks_path(container_name) |
+ try: |
+ tasks = [x.rstrip() for x in open(file_name).readlines()] |
+ except IOError: |
+ if os.path.exists(file_name): |
+ raise |
+ tasks = [] # container doesn't exist anymore |
+ return tasks |
+ |
+ |
+def inner_containers_of(parent): |
+ pattern = os.path.join(full_path(parent), '*/tasks') |
+ return [unpath(os.path.dirname(task_file)) |
+ for task_file in glob.glob(pattern)] |
+ |
+ |
+def _release_container_nest(nest): |
+ # Destroy a container, and any nested sub-containers |
+ nest_path = full_path(nest) |
+ if os.path.exists(nest_path): |
+ |
+ # bottom-up walk of tree, releasing all nested sub-containers |
+ for child in inner_containers_of(nest): |
+ _release_container_nest(child) |
+ |
+ logging.debug("releasing container %s", nest) |
+ |
+ # Transfer any survivor tasks (e.g. self) to parent container |
+ parent = os.path.dirname(nest) |
+ move_tasks_into_container(parent, get_tasks(nest)) |
+ |
+ # remove the now-empty outermost container of this nest |
+ if os.path.exists(nest_path): |
+ os.rmdir(nest_path) # nested, or dead manager |
+ |
+ |
+def release_container(container_name=None): |
+ # Destroy a container |
+ my_container = my_container_name() |
+ if container_name is None: |
+ container_name = my_container |
+ _release_container_nest(container_name) |
+ displaced = my_container_name() |
+ if displaced != my_container: |
+ logging.debug('now running self (pid %d) in container "%s"', |
+ os.getpid(), displaced) |
+ |
+ |
+def remove_empty_prio_classes(prios): |
+ # remove prio classes whose set of allowed priorities is empty |
+    # e.g. 'no:3;rt:;be:3;id:' --> 'no:3;be:3' |
+ return ';'.join(p for p in prios.split(';') if p.split(':')[1]) |
+ |
+ |
+def all_drive_names(): |
+ # list of all disk drives sda,sdb,... |
+ paths = glob.glob('/sys/block/sd*') |
+ if not paths: |
+ paths = glob.glob('/sys/block/hd*') |
+ return [os.path.basename(path) for path in paths] |
+ |
+ |
+def set_io_controls(container_name, disks=[], ioprio_classes=[PROPIO_NORMAL], |
+ io_shares=[95], io_limits=[0]): |
+ # set the propio controls for one container, for selected disks |
+ # writing directly to /dev/cgroup/container_name/io.io_service_level |
+ # without using containerd or container.py |
+ # See wiki ProportionalIOScheduler for definitions |
+ # ioprio_classes: list of service classes, one per disk |
+ # using numeric propio service classes as used by kernel API, namely |
+ # 1: RT, Real Time, aka PROPIO_PRIO |
+ # 2: BE, Best Effort, aka PROPIO_NORMAL |
+ # 3: PROPIO_IDLE |
+ # io_shares: list of disk-time-fractions, one per disk, |
+ # as percentage integer 0..100 |
+ # io_limits: list of limit on/off, one per disk |
+ # 0: no limit, shares use of other containers' unused disk time |
+ # 1: limited, container's use of disk time is capped to given DTF |
+    # ioprio_classes defaults to best-effort |
+    # io_limits defaults to no limit, i.e. share slack disk time |
+ if not disks: # defaults to all drives |
+ disks = all_drive_names() |
+ io_shares = [io_shares [0]] * len(disks) |
+ ioprio_classes = [ioprio_classes[0]] * len(disks) |
+ io_limits = [io_limits [0]] * len(disks) |
+ if not (len(disks) == len(ioprio_classes) and len(disks) == len(io_shares) |
+ and len(disks) == len(io_limits)): |
+ raise error.AutotestError('Unequal number of values for io controls') |
+ service_level = io_attr(container_name, 'io_service_level') |
+ if not os.path.exists(service_level): |
+ return # kernel predates propio features |
+ # or io cgroup is mounted separately from cpusets |
+ disk_infos = [] |
+ for disk,ioclass,limit,share in zip(disks, ioprio_classes, |
+ io_limits, io_shares): |
+ parts = (disk, str(ioclass), str(limit), str(share)) |
+ disk_info = ' '.join(parts) |
+ utils.write_one_line(service_level, disk_info) |
+ disk_infos.append(disk_info) |
+ logging.debug('set_io_controls of %s to %s', |
+ container_name, ', '.join(disk_infos)) |
+ |
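+# Illustrative call (hypothetical container and disk names): |
+#   set_io_controls('c1', disks=['sda'], ioprio_classes=[PROPIO_NORMAL], |
+#                   io_shares=[95], io_limits=[0]) |
+# writes the line 'sda 2 0 95' into c1's io.io_service_level file. |
+ |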
+ |
+def abbrev_list(vals): |
+ """Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'.""" |
+ ranges = [] |
+ lower = 0 |
+ upper = -2 |
+ for val in sorted(vals)+[-1]: |
+ if val != upper+1: |
+ if lower == upper: |
+ ranges.append(str(lower)) |
+ elif lower <= upper: |
+ ranges.append('%d-%d' % (lower, upper)) |
+ lower = val |
+ upper = val |
+ return ','.join(ranges) |
+ |
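+# abbrev_list() inverts rangelist_to_set() above: for any set s of unsigned |
+# ints, rangelist_to_set(abbrev_list(s)) == s. |
+ |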
+ |
+def create_container_with_specific_mems_cpus(name, mems, cpus): |
+ need_fake_numa() |
+ os.mkdir(full_path(name)) |
+ utils.write_one_line(cpuset_attr(name, 'mem_hardwall'), '1') |
+ utils.write_one_line(mems_path(name), ','.join(map(str, mems))) |
+ utils.write_one_line(cpus_path(name), ','.join(map(str, cpus))) |
+ logging.debug('container %s has %d cpus and %d nodes totalling %s bytes', |
+ name, len(cpus), len(get_mem_nodes(name)), |
+ utils.human_format(container_bytes(name)) ) |
+ |
+ |
+def create_container_via_memcg(name, parent, mem_bytes, cpus): |
+ # create container via direct memcg cgroup writes |
+ os.mkdir(full_path(name)) |
+ nodes = utils.read_one_line(mems_path(parent)) |
+ utils.write_one_line(mems_path(name), nodes) # inherit parent's nodes |
+    utils.write_one_line(memory_path(name)+'.limit_in_bytes', str(mem_bytes)) |
+ utils.write_one_line(cpus_path(name), ','.join(map(str, cpus))) |
+ logging.debug('Created container %s directly via memcg,' |
+ ' has %d cpus and %s bytes', |
+ name, len(cpus), utils.human_format(container_bytes(name))) |
+ |
+ |
+def _create_fake_numa_container_directly(name, parent, mbytes, cpus): |
+ need_fake_numa() |
+ lockfile = my_lock('inner') # serialize race between parallel tests |
+ try: |
+ # Pick specific mem nodes for new cpuset's exclusive use |
+ # For now, arbitrarily pick highest available node numbers |
+ needed_kbytes = mbytes * 1024 |
+ nodes = sorted(list(available_exclusive_mem_nodes(parent))) |
+ kbytes = 0 |
+ nodecnt = 0 |
+ while kbytes < needed_kbytes and nodecnt < len(nodes): |
+ nodecnt += 1 |
+ kbytes += node_avail_kbytes(nodes[-nodecnt]) |
+ if kbytes < needed_kbytes: |
+ parent_mbytes = container_mbytes(parent) |
+ if mbytes > parent_mbytes: |
+ raise error.AutotestError( |
+ "New container's %d Mbytes exceeds " |
+ "parent container's %d Mbyte size" |
+ % (mbytes, parent_mbytes) ) |
+ else: |
+ raise error.AutotestError( |
+ "Existing sibling containers hold " |
+ "%d Mbytes needed by new container" |
+ % ((needed_kbytes - kbytes)//1024) ) |
+ mems = nodes[-nodecnt:] |
+ |
+ create_container_with_specific_mems_cpus(name, mems, cpus) |
+ finally: |
+ my_unlock(lockfile) |
+ |
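+# Sizing example (hypothetical): requesting mbytes=1500 with 512-mbyte nodes |
+# and nodes 0-7 free accumulates the three highest-numbered nodes, since |
+# 2*512 < 1500 <= 3*512, so the new container gets mems = [5, 6, 7]. |
+ |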
+ |
+def create_container_directly(name, mbytes, cpus): |
+ parent = os.path.dirname(name) |
+ if fake_numa_containers: |
+ _create_fake_numa_container_directly(name, parent, mbytes, cpus) |
+ else: |
+ create_container_via_memcg(name, parent, mbytes<<20, cpus) |
+ |
+ |
+def create_container_with_mbytes_and_specific_cpus(name, mbytes, |
+ cpus=None, root=SUPER_ROOT, io={}, move_in=True, timeout=0): |
+ """\ |
+ Create a cpuset container and move job's current pid into it |
+ Allocate the list "cpus" of cpus to that container |
+ |
+ name = arbitrary string tag |
+    mbytes = requested memory for job in megabytes |
+    cpus = list of cpu indices to associate with the cpuset |
+           defaults to all cpus available within the given root |
+ root = the parent cpuset to nest this new set within |
+ '': unnested top-level container |
+ io = arguments for proportional IO containers |
+ move_in = True: Move current process into the new container now. |
+ timeout = must be 0: persist until explicitly deleted. |
+ """ |
+ need_mem_containers() |
+ if not container_exists(root): |
+ raise error.AutotestError('Parent container "%s" does not exist' |
+ % root) |
+ if cpus is None: |
+ # default to biggest container we can make under root |
+ cpus = get_cpus(root) |
+ else: |
+ cpus = set(cpus) # interface uses list |
+ if not cpus: |
+ raise error.AutotestError('Creating container with no cpus') |
+ name = os.path.join(root, name) # path relative to super_root |
+ if os.path.exists(full_path(name)): |
+ raise error.AutotestError('Container %s already exists' % name) |
+ create_container_directly(name, mbytes, cpus) |
+ set_io_controls(name, **io) |
+ if move_in: |
+ move_self_into_container(name) |
+ return name |
+ |
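+# Typical test usage (illustrative; the names here are hypothetical): |
+#   name = create_container_with_mbytes_and_specific_cpus( |
+#           'memhog_test', mbytes=1024, cpus=[0, 1]) |
+#   ... run the memory-limited workload ... |
+#   release_container(name) |
+ |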
+ |
+def get_boot_numa(): |
+ # get boot-time numa=fake=xyz option for current boot |
+    # e.g. numa=fake=nnn, numa=fake=nnnM, or '' if the option is absent |
+ label = 'numa=fake=' |
+ for arg in utils.read_one_line('/proc/cmdline').split(): |
+ if arg.startswith(label): |
+ return arg[len(label):] |
+ return '' |
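+# e.g. a /proc/cmdline containing 'numa=fake=24M' yields '24M'; a boot without |
+# any numa=fake option yields ''. |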