OLD | NEW |
(Empty) | |
| 1 # Copyright 2007-2010 Google Inc. Released under the GPL v2 |
| 2 __author__ = "duanes (Duane Sand), pdahl (Peter Dahl)" |
| 3 |
| 4 # A basic cpuset/cgroup container manager for limiting memory use during tests |
| 5 # for use on kernels not running some site-specific container manager |
| 6 |
| 7 import os, sys, re, glob, fcntl, logging |
| 8 from autotest_lib.client.bin import utils |
| 9 from autotest_lib.client.common_lib import error |
| 10 |
SUPER_ROOT = ''      # root of all containers or cgroups
NO_LIMIT = (1 << 63) - 1  # containername/memory.limit_in_bytes if no limit

# propio service classes (numeric values used by the kernel API,
# see set_io_controls below):
PROPIO_PRIO = 1      # RT, Real Time
PROPIO_NORMAL = 2    # BE, Best Effort
PROPIO_IDLE = 3

# The globals below are lazily filled in by discover_container_style():
super_root_path = ''    # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18
cpuset_prefix = None    # usually 'cpuset.'; '' on 2.6.18
fake_numa_containers = False # container mem via numa=fake mem nodes, else pages
mem_isolation_on = False  # True when fake-numa or memcg mem limits are usable
node_mbytes = 0         # mbytes in one typical mem node
root_container_bytes = 0  # squishy limit on effective size of root container
| 25 |
| 26 |
def discover_container_style():
    """Probe the running kernel once and configure this module's globals.

    Determines whether containers are cgroup-based (2.6.26+, /dev/cgroup)
    or cpuset-based (2.6.18, /dev/cpuset), and whether memory isolation is
    done via numa=fake mem nodes or via memcg page limits.  Results are
    cached in module globals; later calls return immediately.
    """
    global super_root_path, cpuset_prefix
    global mem_isolation_on, fake_numa_containers
    global node_mbytes, root_container_bytes
    if super_root_path != '':
        return  # already looked up
    if os.path.exists('/dev/cgroup/tasks'):
        # running on 2.6.26 or later kernel with containers on:
        super_root_path = '/dev/cgroup'
        cpuset_prefix = 'cpuset.'
        if get_boot_numa():
            mem_isolation_on = fake_numa_containers = True
        else:  # memcg containers IFF compiled-in & mounted & non-fakenuma boot
            fake_numa_containers = False
            mem_isolation_on = os.path.exists(
                    '/dev/cgroup/memory.limit_in_bytes')
            # TODO: handle possibility of where memcg is mounted as its own
            #       cgroup hierarchy, separate from cpuset??
    elif os.path.exists('/dev/cpuset/tasks'):
        # running on 2.6.18 kernel with containers on:
        super_root_path = '/dev/cpuset'
        cpuset_prefix = ''
        mem_isolation_on = fake_numa_containers = get_boot_numa() != ''
    else:
        # neither cpuset nor cgroup filesystem active:
        super_root_path = None
        cpuset_prefix = 'no_cpusets_or_cgroups_exist'
        mem_isolation_on = fake_numa_containers = False

    logging.debug('mem_isolation: %s', mem_isolation_on)
    logging.debug('fake_numa_containers: %s', fake_numa_containers)
    if fake_numa_containers:
        node_mbytes = int(mbytes_per_mem_node())
    elif mem_isolation_on:  # memcg-style containers
        # For now, limit total of all containers to using just 98% of system's
        # visible total ram, to avoid oom events at system level, and avoid
        # page reclaim overhead from going above kswapd highwater mark.
        # NOTE(review): assumes utils.memtotal() returns KB, so >> 2 yields
        # 4KiB pages and << 12 converts pages back to bytes -- TODO confirm.
        system_visible_pages = utils.memtotal() >> 2
        usable_pages = int(system_visible_pages * 0.98)
        root_container_bytes = usable_pages << 12
        logging.debug('root_container_bytes: %s',
                      utils.human_format(root_container_bytes))
| 69 |
| 70 |
def need_mem_containers():
    """Raise AutotestError unless memory-isolation containers are enabled."""
    discover_container_style()
    if mem_isolation_on:
        return
    raise error.AutotestError('Mem-isolation containers not enabled '
                              'by latest reboot')
| 76 |
def need_fake_numa():
    """Raise AutotestError unless fake-numa mem nodes are enabled."""
    discover_container_style()
    if fake_numa_containers:
        return
    raise error.AutotestError('fake=numa not enabled by latest reboot')
| 81 |
| 82 |
def full_path(container_name):
    """Absolute path of a container beneath the cgroup/cpuset super-root."""
    discover_container_style()
    path = os.path.join(super_root_path, container_name)
    return path
| 86 |
| 87 |
def unpath(container_path):
    """Inverse of full_path(): strip the super-root prefix plus its '/'."""
    prefix_length = len(super_root_path) + 1
    return container_path[prefix_length:]
| 90 |
| 91 |
def cpuset_attr(container_name, attr):
    """Path of one cpuset attribute file ('cpuset.'-prefixed on cgroups)."""
    discover_container_style()
    attr_file = cpuset_prefix + attr
    return os.path.join(super_root_path, container_name, attr_file)
| 95 |
| 96 |
def io_attr(container_name, attr):
    """Path of one proportional-IO attribute file within a container."""
    discover_container_style()
    # current version assumes shared cgroup hierarchy
    attr_file = 'io.' + attr
    return os.path.join(super_root_path, container_name, attr_file)
| 101 |
| 102 |
def tasks_path(container_name):
    """Path of the 'tasks' file listing pids running in a container."""
    container_dir = full_path(container_name)
    return os.path.join(container_dir, 'tasks')
| 105 |
| 106 |
def mems_path(container_name):
    """Path of the attribute file listing a container's mem nodes."""
    return cpuset_attr(container_name, 'mems')
| 109 |
| 110 |
def memory_path(container_name):
    """Base path for a container's memcg 'memory*' attribute files."""
    discover_container_style()
    container_dir = os.path.join(super_root_path, container_name)
    return os.path.join(container_dir, 'memory')
| 113 |
| 114 |
def cpus_path(container_name):
    """Path of the attribute file listing a container's cpus."""
    return cpuset_attr(container_name, 'cpus')
| 117 |
| 118 |
def container_exists(name):
    """True if the named container currently exists on this system."""
    if name is None:
        return False
    return os.path.exists(tasks_path(name))
| 121 |
| 122 |
def move_tasks_into_container(name, tasks):
    """Reassign each pid in tasks (strings) into the named container.

    Tasks that died or became zombies during the move are silently skipped;
    a failed move of a still-live task is re-raised.
    """
    task_file = tasks_path(name)
    for task in tasks:
        try:
            logging.debug('moving task %s into container "%s"', task, name)
            utils.write_one_line(task_file, task)
        except Exception:
            if not utils.pid_is_alive(task):
                continue  # task is gone or zombie so ignore this exception
            raise  # task exists but couldn't move it
| 133 |
| 134 |
def move_self_into_container(name):
    """Move the current process into the named container."""
    my_pid = str(os.getpid())
    move_tasks_into_container(name, [my_pid])
    logging.debug('running self (pid %s) in container "%s"', my_pid, name)
| 139 |
| 140 |
def _avail_mbytes_via_nodes(parent):
    """Total mbytes of mem nodes available for new containers in parent."""
    unclaimed_nodes = available_exclusive_mem_nodes(parent)
    total_mbytes = nodes_avail_mbytes(unclaimed_nodes)
    # We don't have an exact model for how the container manager measures
    # mem space; better here to underestimate than overestimate.
    return max(total_mbytes - node_mbytes//2, 0)
| 149 |
| 150 |
def _avail_bytes_via_pages(parent):
    """Memory bytes of parent still allocatable exclusively to new children.

    Excludes mem previously allocated to parent's existing child containers.
    """
    remaining = container_bytes(parent)
    limits_pattern = os.path.join(full_path(parent),
                                  '*', 'memory.limit_in_bytes')
    for limit_file in glob.glob(limits_pattern):
        child = unpath(os.path.dirname(limit_file))
        remaining -= container_bytes(child)
    return remaining
| 162 |
| 163 |
def avail_mbytes(parent=SUPER_ROOT):
    """Total mbytes available in parent, for exclusive use in new containers.

    Args:
        parent: container name relative to the super-root; defaults to
                the top-level container.
    Returns:
        Whole mbytes not yet claimed by parent's existing children.
    """
    # Fix: ensure the style-discovery globals are initialized before
    # branching on fake_numa_containers; previously a call made before any
    # other container operation read the stale False default and always
    # took the memcg path.
    discover_container_style()
    if fake_numa_containers:
        return _avail_mbytes_via_nodes(parent)
    else:
        return _avail_bytes_via_pages(parent) >> 20
| 170 |
| 171 |
def delete_leftover_test_containers():
    """Recover mems and cores tied up by containers of prior failed tests."""
    for leftover in inner_containers_of(SUPER_ROOT):
        _release_container_nest(leftover)
| 176 |
| 177 |
def my_lock(lockname):
    """Take an exclusive flock named lockname (e.g. 'inner') under AUTODIR.

    Returns the open lock file; release it with my_unlock().
    """
    lockdir = os.environ['AUTODIR']
    lock_path = os.path.join(lockdir, '.cpuset.lock.' + lockname)
    lockfile = open(lock_path, 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    return lockfile
| 185 |
| 186 |
def my_unlock(lockfile):
    """Release the flock held on lockfile (from my_lock()) and close it."""
    fcntl.flock(lockfile.fileno(), fcntl.LOCK_UN)
    lockfile.close()
| 190 |
| 191 |
def rangelist_to_set(rangelist):
    """Expand a kernel range-list string into a set of ints.

    e.g. '1-3,7,9-12' -> set([1,2,3,7,9,10,11,12]).
    '' or None expand to the empty set; malformed pieces raise ValueError.
    """
    members = set()
    if not rangelist:
        return members
    for piece in rangelist.split(','):
        span = re.match(r'^(\d+)-(\d+)$', piece)
        if span:
            low = int(span.group(1))
            high = int(span.group(2))
            members.update(range(low, high+1))
        elif re.match(r'^(\d+)$', piece):
            members.add(int(piece))
        else:
            raise ValueError('Cannot understand data input: %s %s'
                             % (piece, rangelist))
    return members
| 210 |
| 211 |
def my_container_name():
    """Name of the current process's container; '' for the root container.

    Read from /proc/<pid>/cpuset, covering both /dev/cpuset and /dev/cgroup
    style hierarchies.
    """
    cpuset_line = utils.read_one_line('/proc/%i/cpuset' % os.getpid())
    return cpuset_line[1:]  # strip leading /
| 217 |
| 218 |
def get_mem_nodes(container_name):
    """All mem nodes now available to a container, both exclusive & shared.

    Returns an empty set when the container's mems file doesn't exist.
    """
    mems_file = mems_path(container_name)
    if not os.path.exists(mems_file):
        return set()
    return rangelist_to_set(utils.read_one_line(mems_file))
| 226 |
| 227 |
def _busy_mem_nodes(parent_container):
    """Set of numa mem nodes now used (exclusively or shared) by existing
    children of parent_container."""
    busy = set()
    mem_files_pattern = os.path.join(full_path(parent_container),
                                     '*', cpuset_prefix+'mems')
    for mem_file in glob.glob(mem_files_pattern):
        # Consistency fix: convert the absolute child directory back to a
        # container name via unpath(), as _avail_bytes_via_pages() does.
        # Passing the absolute path only worked by accident, because
        # os.path.join discards earlier components when a later component
        # is absolute.
        child_container = unpath(os.path.dirname(mem_file))
        busy |= get_mem_nodes(child_container)
    return busy
| 238 |
| 239 |
def available_exclusive_mem_nodes(parent_container):
    """Subset of parent's numa mem nodes allocatable exclusively to new
    child containers, excluding nodes already held by existing children."""
    need_fake_numa()
    parent_nodes = get_mem_nodes(parent_container)
    return parent_nodes - _busy_mem_nodes(parent_container)
| 248 |
| 249 |
def my_mem_nodes():
    """Numa mem nodes owned by the current process's container."""
    discover_container_style()
    if mem_isolation_on:
        return get_mem_nodes(my_container_name())
    return set()  # as expected by vmstress
| 256 |
| 257 |
def my_available_exclusive_mem_nodes():
    """Numa mem nodes of the current process's container which could be
    allocated exclusively to new child containers (excludes nodes already
    allocated to existing children)."""
    my_name = my_container_name()
    return available_exclusive_mem_nodes(my_name)
| 264 |
| 265 |
def node_avail_kbytes(node):
    """Usable kbytes of one numa mem node (crude; assumes fixed node size)."""
    kbytes = node_mbytes << 10
    return kbytes
| 268 |
| 269 |
def nodes_avail_mbytes(nodes):
    """Combined user+avail size of the given mem nodes, in mbytes."""
    total_kbytes = sum(node_avail_kbytes(n) for n in nodes)
    return total_kbytes // 1024
| 273 |
| 274 |
def container_bytes(name):
    """Effective memory size of a container, in bytes.

    For fake-numa containers: the total size of the container's mem nodes.
    For memcg containers: the nearest enclosing memory.limit_in_bytes that
    is a real limit, walking up toward the root; an unlimited root falls
    back to root_container_bytes.
    """
    if fake_numa_containers:
        return nodes_avail_mbytes(get_mem_nodes(name)) << 20
    else:
        while True:
            # renamed from 'file', which shadowed the Python 2 builtin
            limit_file = memory_path(name) + '.limit_in_bytes'
            limit = int(utils.read_one_line(limit_file))
            if limit < NO_LIMIT:
                return limit
            if name == SUPER_ROOT:
                return root_container_bytes
            name = os.path.dirname(name)  # walk up to parent container
| 287 |
| 288 |
def container_mbytes(name):
    """Effective memory size of a container, in mbytes."""
    size_bytes = container_bytes(name)
    return size_bytes >> 20
| 291 |
| 292 |
def mbytes_per_mem_node():
    """Mbyte size of a standard numa mem node, as float.

    (Some nodes are bigger than this.)  Replaces utils.node_size().
    """
    numa = get_boot_numa()
    if numa.endswith('M'):
        # numa=fake=nnnM gives the fake node size directly
        return float(numa[:-1])
    if numa:
        node_count = int(numa)  # fake numa mem nodes for container isolation
    else:
        node_count = len(utils.numa_nodes())  # phys mem-controller nodes
    # Use guessed total physical mem size, not kernel's lesser
    # 'available memory' after various system tables.
    return utils.rounded_memtotal() / (node_count * 1024.0)
| 307 |
| 308 |
def get_cpus(container_name):
    """Set of cpu ids allotted to a container; empty if it doesn't exist."""
    cpus_file = cpus_path(container_name)
    if not os.path.exists(cpus_file):
        return set()
    return rangelist_to_set(utils.read_one_line(cpus_file))
| 315 |
| 316 |
def get_tasks(container_name):
    """List of pids (as strings) now running inside a container.

    Returns [] when the container has vanished; other read errors re-raise.
    """
    file_name = tasks_path(container_name)
    try:
        tasks_file = open(file_name)
        try:
            # fix: close the file deterministically instead of leaking the
            # handle until garbage collection
            tasks = [line.rstrip() for line in tasks_file.readlines()]
        finally:
            tasks_file.close()
    except IOError:
        if os.path.exists(file_name):
            raise
        tasks = []  # container doesn't exist anymore
    return tasks
| 326 |
| 327 |
def inner_containers_of(parent):
    """Names of all existing direct child containers of parent."""
    pattern = os.path.join(full_path(parent), '*/tasks')
    children = []
    for task_file in glob.glob(pattern):
        children.append(unpath(os.path.dirname(task_file)))
    return children
| 332 |
| 333 |
def _release_container_nest(nest):
    """Destroy a container, and any nested sub-containers, bottom-up.

    Surviving tasks (e.g. the current process) are moved up to the parent
    container before each directory is removed.
    """
    nest_path = full_path(nest)
    if os.path.exists(nest_path):

        # bottom-up walk of tree, releasing all nested sub-containers
        for child in inner_containers_of(nest):
            _release_container_nest(child)

        logging.debug("releasing container %s", nest)

        # Transfer any survivor tasks (e.g. self) to parent container
        parent = os.path.dirname(nest)
        move_tasks_into_container(parent, get_tasks(nest))

        # remove the now-empty outermost container of this nest
        # (re-check existence: presumably another manager may have removed
        # it concurrently -- see the note below)
        if os.path.exists(nest_path):
            os.rmdir(nest_path)  # nested, or dead manager
| 352 |
| 353 |
def release_container(container_name=None):
    """Destroy a container; defaults to the current process's container."""
    starting_container = my_container_name()
    if container_name is None:
        container_name = starting_container
    _release_container_nest(container_name)
    ending_container = my_container_name()
    if ending_container != starting_container:
        logging.debug('now running self (pid %d) in container "%s"',
                      os.getpid(), ending_container)
| 364 |
| 365 |
def remove_empty_prio_classes(prios):
    """Drop prio classes whose set of allowed priorities is empty.

    e.g 'no:3;rt:;be:3;id:' --> 'no:3;be:3'
    """
    kept = []
    for prio_class in prios.split(';'):
        if prio_class.split(':')[1]:
            kept.append(prio_class)
    return ';'.join(kept)
| 370 |
| 371 |
def all_drive_names():
    """Basenames of all disk drives: sda,sdb,... (falls back to hd* IDE)."""
    drive_paths = glob.glob('/sys/block/sd*') or glob.glob('/sys/block/hd*')
    return [os.path.basename(p) for p in drive_paths]
| 378 |
| 379 |
def set_io_controls(container_name, disks=None, ioprio_classes=None,
                    io_shares=None, io_limits=None):
    """Set the propio controls for one container, for selected disks,
    writing directly to /dev/cgroup/container_name/io.io_service_level
    without using containerd or container.py.
    See wiki ProportionalIOScheduler for definitions.

    ioprio_classes: list of service classes, one per disk
        using numeric propio service classes as used by kernel API, namely
        1: RT, Real Time, aka PROPIO_PRIO
        2: BE, Best Effort, aka PROPIO_NORMAL
        3: PROPIO_IDLE
    io_shares: list of disk-time-fractions, one per disk,
        as percentage integer 0..100
    io_limits: list of limit on/off, one per disk
        0: no limit, shares use of other containers' unused disk time
        1: limited, container's use of disk time is capped to given DTF
    Defaults (None): all drives, best-effort class, 95% share, no limit.
    """
    # Fix: use None sentinels rather than mutable list default arguments,
    # which are shared across calls.  The effective defaults are unchanged.
    if ioprio_classes is None:
        ioprio_classes = [PROPIO_NORMAL]
    if io_shares is None:
        io_shares = [95]
    if io_limits is None:
        io_limits = [0]
    if not disks:  # None or [] defaults to all drives
        disks = all_drive_names()
        io_shares = [io_shares[0]] * len(disks)
        ioprio_classes = [ioprio_classes[0]] * len(disks)
        io_limits = [io_limits[0]] * len(disks)
    if not (len(disks) == len(ioprio_classes) and len(disks) == len(io_shares)
            and len(disks) == len(io_limits)):
        raise error.AutotestError('Unequal number of values for io controls')
    service_level = io_attr(container_name, 'io_service_level')
    if not os.path.exists(service_level):
        return  # kernel predates propio features
                # or io cgroup is mounted separately from cpusets
    disk_infos = []
    for disk, ioclass, limit, share in zip(disks, ioprio_classes,
                                           io_limits, io_shares):
        parts = (disk, str(ioclass), str(limit), str(share))
        disk_info = ' '.join(parts)
        utils.write_one_line(service_level, disk_info)
        disk_infos.append(disk_info)
    logging.debug('set_io_controls of %s to %s',
                  container_name, ', '.join(disk_infos))
| 419 |
| 420 |
def abbrev_list(vals):
    """Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'."""
    def render(start, end):
        # one piece: single value or inclusive span
        if start == end:
            return str(start)
        return '%d-%d' % (start, end)

    pieces = []
    start = end = None
    for val in sorted(vals):
        if end is not None and val == end + 1:
            end = val  # extend the current run
            continue
        if end is not None:
            pieces.append(render(start, end))
        start = end = val  # begin a new run
    if end is not None:
        pieces.append(render(start, end))
    return ','.join(pieces)
| 435 |
| 436 |
def create_container_with_specific_mems_cpus(name, mems, cpus):
    """Create a fake-numa cpuset container owning exactly the given mem
    nodes and cpus (iterables of ints)."""
    need_fake_numa()
    os.mkdir(full_path(name))
    utils.write_one_line(cpuset_attr(name, 'mem_hardwall'), '1')
    utils.write_one_line(mems_path(name), ','.join(str(m) for m in mems))
    utils.write_one_line(cpus_path(name), ','.join(str(c) for c in cpus))
    logging.debug('container %s has %d cpus and %d nodes totalling %s bytes',
                  name, len(cpus), len(get_mem_nodes(name)),
                  utils.human_format(container_bytes(name)))
| 446 |
| 447 |
def create_container_via_memcg(name, parent, bytes, cpus):
    """Create a container via direct memcg cgroup writes (no containerd)."""
    os.mkdir(full_path(name))
    # inherit parent's mem nodes
    parent_nodes = utils.read_one_line(mems_path(parent))
    utils.write_one_line(mems_path(name), parent_nodes)
    utils.write_one_line(memory_path(name) + '.limit_in_bytes', str(bytes))
    utils.write_one_line(cpus_path(name), ','.join(str(c) for c in cpus))
    logging.debug('Created container %s directly via memcg,'
                  ' has %d cpus and %s bytes',
                  name, len(cpus), utils.human_format(container_bytes(name)))
| 458 |
| 459 |
def _create_fake_numa_container_directly(name, parent, mbytes, cpus):
    """Create a fake-numa cpuset container of at least mbytes, claiming
    specific mem nodes from parent for its exclusive use.

    Raises AutotestError when the request cannot be satisfied, with a
    message distinguishing "parent too small" from "siblings hold the mem".
    """
    need_fake_numa()
    lockfile = my_lock('inner')   # serialize race between parallel tests
    try:
        # Pick specific mem nodes for new cpuset's exclusive use
        # For now, arbitrarily pick highest available node numbers
        needed_kbytes = mbytes * 1024
        nodes = sorted(list(available_exclusive_mem_nodes(parent)))
        kbytes = 0
        nodecnt = 0
        # take nodes from the high end of the list until request is covered
        while kbytes < needed_kbytes and nodecnt < len(nodes):
            nodecnt += 1
            kbytes += node_avail_kbytes(nodes[-nodecnt])
        if kbytes < needed_kbytes:
            parent_mbytes = container_mbytes(parent)
            if mbytes > parent_mbytes:
                raise error.AutotestError(
                      "New container's %d Mbytes exceeds "
                      "parent container's %d Mbyte size"
                      % (mbytes, parent_mbytes) )
            else:
                raise error.AutotestError(
                      "Existing sibling containers hold "
                      "%d Mbytes needed by new container"
                      % ((needed_kbytes - kbytes)//1024) )
        mems = nodes[-nodecnt:]

        create_container_with_specific_mems_cpus(name, mems, cpus)
    finally:
        my_unlock(lockfile)
| 490 |
| 491 |
def create_container_directly(name, mbytes, cpus):
    """Create a container by direct cgroup/cpuset writes, choosing the
    fake-numa or memcg mechanism to match the booted kernel."""
    parent = os.path.dirname(name)
    if not fake_numa_containers:
        create_container_via_memcg(name, parent, mbytes << 20, cpus)
    else:
        _create_fake_numa_container_directly(name, parent, mbytes, cpus)
| 498 |
| 499 |
def create_container_with_mbytes_and_specific_cpus(name, mbytes,
        cpus=None, root=SUPER_ROOT, io=None, move_in=True, timeout=0):
    """\
    Create a cpuset container and move job's current pid into it
    Allocate the list "cpus" of cpus to that container

    name = arbitrary string tag
    mbytes = requested memory for job in megabytes
    cpus = list of cpu indices to associate with the cpuset
          defaults to all cpus avail with given root
    root = the parent cpuset to nest this new set within
           '': unnested top-level container
    io = arguments for proportional IO containers (None: none)
    move_in = True: Move current process into the new container now.
    timeout = must be 0: persist until explicitly deleted.
    """
    # Fix: io previously defaulted to the mutable dict {}, shared across
    # calls; use a None sentinel with identical effective behavior.
    if io is None:
        io = {}
    need_mem_containers()
    if not container_exists(root):
        raise error.AutotestError('Parent container "%s" does not exist'
                                  % root)
    if cpus is None:
        # default to biggest container we can make under root
        cpus = get_cpus(root)
    else:
        cpus = set(cpus)  # interface uses list
    if not cpus:
        raise error.AutotestError('Creating container with no cpus')
    name = os.path.join(root, name)  # path relative to super_root
    if os.path.exists(full_path(name)):
        raise error.AutotestError('Container %s already exists' % name)
    create_container_directly(name, mbytes, cpus)
    set_io_controls(name, **io)
    if move_in:
        move_self_into_container(name)
    return name
| 535 |
| 536 |
def get_boot_numa():
    """Boot-time numa=fake=xyz option for the current boot.

    Returns the value after 'numa=fake=' (e.g. 'nnn' or 'nnnM'), or ''
    when the kernel was not booted with fake numa.
    """
    label = 'numa=fake='
    cmdline = utils.read_one_line('/proc/cmdline')
    for arg in cmdline.split():
        if arg.startswith(label):
            return arg[len(label):]
    return ''
OLD | NEW |