diff options
author | Lennart Poettering <lennart@poettering.net> | 2015-09-01 19:22:36 +0200 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2015-09-01 23:52:27 +0200 |
commit | efdb02375beb0a940c3320865572913780b4d7de (patch) | |
tree | bffddfbb0344c1d7c2e1853f36b0acf3f1624d64 /src/core | |
parent | 92dcf85e11d24b60f099401c1865add607d0bf4a (diff) |
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/cgroup.c | 586 | ||||
-rw-r--r-- | src/core/cgroup.h | 32 | ||||
-rw-r--r-- | src/core/dbus-cgroup.c | 26 | ||||
-rw-r--r-- | src/core/dbus-unit.c | 3 | ||||
-rw-r--r-- | src/core/execute.h | 2 | ||||
-rw-r--r-- | src/core/manager.c | 6 | ||||
-rw-r--r-- | src/core/manager.h | 12 | ||||
-rw-r--r-- | src/core/mount-setup.c | 11 | ||||
-rw-r--r-- | src/core/scope.c | 67 | ||||
-rw-r--r-- | src/core/service.c | 16 | ||||
-rw-r--r-- | src/core/slice.c | 43 | ||||
-rw-r--r-- | src/core/unit.c | 168 | ||||
-rw-r--r-- | src/core/unit.h | 10 |
13 files changed, 678 insertions, 304 deletions
diff --git a/src/core/cgroup.c b/src/core/cgroup.c index e92d2cc850..a70b4d33ae 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -283,7 +283,7 @@ fail: return -errno; } -void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) { +void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) { bool is_root; int r; @@ -304,7 +304,7 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha * cgroup trees (assuming we are running in a container then), * and missing cgroups, i.e. EROFS and ENOENT. */ - if ((mask & CGROUP_CPU) && !is_root) { + if ((mask & CGROUP_MASK_CPU) && !is_root) { char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1]; sprintf(buf, "%lu\n", @@ -331,7 +331,7 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha "Failed to set cpu.cfs_quota_us on %s: %m", path); } - if (mask & CGROUP_BLKIO) { + if (mask & CGROUP_MASK_BLKIO) { char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1, DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1, DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)]; @@ -381,21 +381,30 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha } } - if ((mask & CGROUP_MEMORY) && !is_root) { + if ((mask & CGROUP_MASK_MEMORY) && !is_root) { if (c->memory_limit != (uint64_t) -1) { char buf[DECIMAL_STR_MAX(uint64_t) + 1]; sprintf(buf, "%" PRIu64 "\n", c->memory_limit); - r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf); - } else - r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1"); + + if (cg_unified() <= 0) + r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf); + else + r = cg_set_attribute("memory", path, "memory.max", buf); + + } else { + if (cg_unified() <= 0) + r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1"); + else + r = cg_set_attribute("memory", path, "memory.max", "max"); + } if (r < 0) log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set memory.limit_in_bytes on %s: %m", path); + "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path); } - if ((mask & CGROUP_DEVICE) && !is_root) { + if ((mask & CGROUP_MASK_DEVICE) && !is_root) { CGroupDeviceAllow *a; /* Changing the devices list of a populated cgroup @@ -459,8 +468,8 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha } } -CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) { - CGroupControllerMask mask = 0; +CGroupMask cgroup_context_get_mask(CGroupContext *c) { + CGroupMask mask = 0; /* Figure out which controllers we need */ @@ -468,29 +477,31 @@ CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) { c->cpu_shares != (unsigned long) -1 || c->startup_cpu_shares != (unsigned long) -1 || c->cpu_quota_per_sec_usec != USEC_INFINITY) - mask |= CGROUP_CPUACCT | CGROUP_CPU; + mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU; if (c->blockio_accounting || c->blockio_weight != (unsigned long) -1 || c->startup_blockio_weight != (unsigned long) -1 || c->blockio_device_weights || c->blockio_device_bandwidths) - mask |= CGROUP_BLKIO; + mask |= CGROUP_MASK_BLKIO; if (c->memory_accounting || c->memory_limit != (uint64_t) -1) - mask |= CGROUP_MEMORY; + mask |= CGROUP_MASK_MEMORY; if (c->device_allow || c->device_policy != CGROUP_AUTO) - mask |= CGROUP_DEVICE; + mask |= CGROUP_MASK_DEVICE; return mask; } -CGroupControllerMask unit_get_cgroup_mask(Unit *u) { +CGroupMask unit_get_own_mask(Unit *u) { CGroupContext *c; + /* Returns the mask of controllers the unit needs for itself */ + c = unit_get_cgroup_context(u); if (!c) return 0; @@ -505,15 +516,18 @@ CGroupControllerMask unit_get_cgroup_mask(Unit *u) { e = unit_get_exec_context(u); if (!e || exec_context_maintains_privileges(e)) - return _CGROUP_CONTROLLER_MASK_ALL; + return _CGROUP_MASK_ALL; } return cgroup_context_get_mask(c); } -CGroupControllerMask unit_get_members_mask(Unit *u) { +CGroupMask unit_get_members_mask(Unit *u) { assert(u); + /* Returns the mask of controllers all of the unit's children + * require, merged */ + if (u->cgroup_members_mask_valid) return u->cgroup_members_mask; @@ -532,7 +546,7 @@ CGroupControllerMask unit_get_members_mask(Unit *u) { continue; u->cgroup_members_mask |= - unit_get_cgroup_mask(member) | + unit_get_own_mask(member) | unit_get_members_mask(member); } } @@ -541,19 +555,52 @@ CGroupControllerMask unit_get_members_mask(Unit *u) { return u->cgroup_members_mask; } -CGroupControllerMask unit_get_siblings_mask(Unit *u) { +CGroupMask unit_get_siblings_mask(Unit *u) { assert(u); + /* Returns the mask of controllers all of the unit's siblings + * require, i.e. the members mask of the unit's parent slice + * if there is one. */ + if (UNIT_ISSET(u->slice)) return unit_get_members_mask(UNIT_DEREF(u->slice)); - return unit_get_cgroup_mask(u) | unit_get_members_mask(u); + return unit_get_own_mask(u) | unit_get_members_mask(u); } -CGroupControllerMask unit_get_target_mask(Unit *u) { - CGroupControllerMask mask; +CGroupMask unit_get_subtree_mask(Unit *u) { + + /* Returns the mask of this subtree, meaning of the group + * itself and its children. */ + + return unit_get_own_mask(u) | unit_get_members_mask(u); +} + +CGroupMask unit_get_target_mask(Unit *u) { + CGroupMask mask; + + /* This returns the cgroup mask of all controllers to enable + * for a specific cgroup, i.e. everything it needs itself, + * plus all that its children need, plus all that its siblings + * need. This is primarily useful on the legacy cgroup + * hierarchy, where we need to duplicate each cgroup in each + * hierarchy that shall be enabled for it. */ - mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u); + mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u); + mask &= u->manager->cgroup_supported; + + return mask; +} + +CGroupMask unit_get_enable_mask(Unit *u) { + CGroupMask mask; + + /* This returns the cgroup mask of all controllers to enable + * for the children of a specific cgroup. This is primarily + * useful for the unified cgroup hierarchy, where each cgroup + * controls which controllers are enabled for its children. */ + + mask = unit_get_members_mask(u); mask &= u->manager->cgroup_supported; return mask; @@ -562,13 +609,13 @@ CGroupControllerMask unit_get_target_mask(Unit *u) { /* Recurse from a unit up through its containing slices, propagating * mask bits upward. A unit is also member of itself. */ void unit_update_cgroup_members_masks(Unit *u) { - CGroupControllerMask m; + CGroupMask m; bool more; assert(u); /* Calculate subtree mask */ - m = unit_get_cgroup_mask(u) | unit_get_members_mask(u); + m = unit_get_subtree_mask(u); /* See if anything changed from the previous invocation. If * not, we're done. */ @@ -608,7 +655,7 @@ void unit_update_cgroup_members_masks(Unit *u) { } } -static const char *migrate_callback(CGroupControllerMask mask, void *userdata) { +static const char *migrate_callback(CGroupMask mask, void *userdata) { Unit *u = userdata; assert(mask != 0); @@ -626,7 +673,115 @@ static const char *migrate_callback(CGroupControllerMask mask, void *userdata) { return NULL; } -static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) { +char *unit_default_cgroup_path(Unit *u) { + _cleanup_free_ char *escaped = NULL, *slice = NULL; + int r; + + assert(u); + + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return strdup(u->manager->cgroup_root); + + if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) { + r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice); + if (r < 0) + return NULL; + } + + escaped = cg_escape(u->id); + if (!escaped) + return NULL; + + if (slice) + return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL); + else + return strjoin(u->manager->cgroup_root, "/", escaped, NULL); +} + +int unit_set_cgroup_path(Unit *u, const char *path) { + _cleanup_free_ char *p = NULL; + int r; + + assert(u); + + if (path) { + p = strdup(path); + if (!p) + return -ENOMEM; + } else + p = NULL; + + if (streq_ptr(u->cgroup_path, p)) + return 0; + + if (p) { + r = hashmap_put(u->manager->cgroup_unit, p, u); + if (r < 0) + return r; + } + + unit_release_cgroup(u); + + u->cgroup_path = p; + p = NULL; + + return 1; +} + +int unit_watch_cgroup(Unit *u) { + _cleanup_free_ char *populated = NULL; + int r; + + assert(u); + + if (!u->cgroup_path) + return 0; + + if (u->cgroup_inotify_wd >= 0) + return 0; + + /* Only applies to the unified hierarchy */ + r = cg_unified(); + if (r < 0) + return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m"); + if (r == 0) + return 0; + + /* Don't watch the root slice, it's pointless. */ + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops); + if (r < 0) + return log_oom(); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated); + if (r < 0) + return log_oom(); + + u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY); + if (u->cgroup_inotify_wd < 0) { + + if (errno == ENOENT) /* If the directory is already + * gone we don't need to track + * it, so this is not an error */ + return 0; + + return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path); + } + + r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m"); + + return 0; +} + +static int unit_create_cgroup( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + CGroupContext *c; int r; @@ -643,25 +798,29 @@ static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) { if (!path) return log_oom(); - r = hashmap_put(u->manager->cgroup_unit, path, u); - if (r < 0) { - log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r)); - return r; - } - if (r > 0) { - u->cgroup_path = path; - path = NULL; - } + r = unit_set_cgroup_path(u, path); + if (r == -EEXIST) + return log_unit_error_errno(u, r, "Control group %s exists already.", path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path); } /* First, create our own group */ - r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path); + r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path); + + /* Start watching it */ + (void) unit_watch_cgroup(u); + + /* Enable all controllers we need */ + r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path); if (r < 0) - return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path); + log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path); /* Keep track that this is now realized */ u->cgroup_realized = true; - u->cgroup_realized_mask = mask; + u->cgroup_realized_mask = target_mask; if (u->type != UNIT_SLICE && !c->delegate) { @@ -670,7 +829,7 @@ static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) { * for slice and delegation units. */ r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u); if (r < 0) - log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path); + log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path); } return 0; @@ -691,10 +850,10 @@ int unit_attach_pids_to_cgroup(Unit *u) { return 0; } -static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) { +static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) { assert(u); - return u->cgroup_realized && u->cgroup_realized_mask == mask; + return u->cgroup_realized && u->cgroup_realized_mask == target_mask; } /* Check if necessary controllers and attributes for a unit are in place. @@ -704,7 +863,7 @@ static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) { * * Returns 0 on success and < 0 on failure. */ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { - CGroupControllerMask mask; + CGroupMask target_mask, enable_mask; int r; assert(u); @@ -714,9 +873,8 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { u->in_cgroup_queue = false; } - mask = unit_get_target_mask(u); - - if (unit_has_mask_realized(u, mask)) + target_mask = unit_get_target_mask(u); + if (unit_has_mask_realized(u, target_mask)) return 0; /* First, realize parents */ @@ -727,12 +885,13 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { } /* And then do the real work */ - r = unit_create_cgroups(u, mask); + enable_mask = unit_get_enable_mask(u); + r = unit_create_cgroup(u, target_mask, enable_mask); if (r < 0) return r; /* Finally, apply the necessary attributes. */ - cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state); + cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state); return 0; } @@ -759,7 +918,7 @@ unsigned manager_dispatch_cgroup_queue(Manager *m) { r = unit_realize_cgroup_now(i, state); if (r < 0) - log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id); + log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id); n++; } @@ -829,39 +988,67 @@ int unit_realize_cgroup(Unit *u) { return unit_realize_cgroup_now(u, manager_state(u->manager)); } -void unit_destroy_cgroup_if_empty(Unit *u) { +void unit_release_cgroup(Unit *u) { + assert(u); + + /* Forgets all cgroup details for this cgroup */ + + if (u->cgroup_path) { + (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); + u->cgroup_path = mfree(u->cgroup_path); + } + + if (u->cgroup_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id); + + (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd)); + u->cgroup_inotify_wd = -1; + } +} + +void unit_prune_cgroup(Unit *u) { int r; + bool is_root_slice; assert(u); + /* Removes the cgroup, if empty and possible, and stops watching it. */ + if (!u->cgroup_path) return; - r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE)); + is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE); + + r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice); if (r < 0) { - log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path); + log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path); return; } - hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); + if (is_root_slice) + return; + + unit_release_cgroup(u); - free(u->cgroup_path); - u->cgroup_path = NULL; u->cgroup_realized = false; u->cgroup_realized_mask = 0; } -pid_t unit_search_main_pid(Unit *u) { +int unit_search_main_pid(Unit *u, pid_t *ret) { _cleanup_fclose_ FILE *f = NULL; pid_t pid = 0, npid, mypid; + int r; assert(u); + assert(ret); if (!u->cgroup_path) - return 0; + return -ENXIO; - if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0) - return 0; + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f); + if (r < 0) + return r; mypid = getpid(); while (cg_read_pid(f, &npid) > 0) { @@ -874,90 +1061,274 @@ pid_t unit_search_main_pid(Unit *u) { if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid) continue; - if (pid != 0) { + if (pid != 0) /* Dang, there's more than one daemonized PID in this group, so we don't know what process is the main process. */ - pid = 0; - break; - } + + return -ENODATA; pid = npid; } - return pid; + *ret = pid; + return 0; +} + +static int unit_watch_pids_in_path(Unit *u, const char *path) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_fclose_ FILE *f = NULL; + int ret = 0, r; + + assert(u); + assert(path); + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f); + if (r < 0) + ret = r; + else { + pid_t pid; + + while ((r = cg_read_pid(f, &pid)) > 0) { + r = unit_watch_pid(u, pid); + if (r < 0 && ret >= 0) + ret = r; + } + + if (r < 0 && ret >= 0) + ret = r; + } + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); + if (r < 0) { + if (ret >= 0) + ret = r; + } else { + char *fn; + + while ((r = cg_read_subgroup(d, &fn)) > 0) { + _cleanup_free_ char *p = NULL; + + p = strjoin(path, "/", fn, NULL); + free(fn); + + if (!p) + return -ENOMEM; + + r = unit_watch_pids_in_path(u, p); + if (r < 0 && ret >= 0) + ret = r; + } + + if (r < 0 && ret >= 0) + ret = r; + } + + return ret; +} + +int unit_watch_all_pids(Unit *u) { + assert(u); + + /* Adds all PIDs from our cgroup to the set of PIDs we + * watch. This is a fallback logic for cases where we do not + * get reliable cgroup empty notifications: we try to use + * SIGCHLD as replacement. */ + + if (!u->cgroup_path) + return -ENOENT; + + if (cg_unified() > 0) /* On unified we can use proper notifications */ + return 0; + + return unit_watch_pids_in_path(u, u->cgroup_path); +} + +int unit_notify_cgroup_empty(Unit *u) { + int r; + + assert(u); + + if (!u->cgroup_path) + return 0; + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); + if (r <= 0) + return r; + + unit_add_to_gc_queue(u); + + if (UNIT_VTABLE(u)->notify_cgroup_empty) + UNIT_VTABLE(u)->notify_cgroup_empty(u); + + return 0; +} + +static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + + assert(s); + assert(fd >= 0); + assert(m); + + for (;;) { + union inotify_event_buffer buffer; + struct inotify_event *e; + ssize_t l; + + l = read(fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (errno == EINTR || errno == EAGAIN) + return 0; + + return log_error_errno(errno, "Failed to read control group inotify events: %m"); + } + + FOREACH_INOTIFY_EVENT(e, buffer, l) { + Unit *u; + + if (e->wd < 0) + /* Queue overflow has no watch descriptor */ + continue; + + if (e->mask & IN_IGNORED) + /* The watch was just removed */ + continue; + + u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd)); + if (!u) /* Not that inotify might deliver + * events for a watch even after it + * was removed, because it was queued + * before the removal. Let's ignore + * this here safely. */ + continue; + + (void) unit_notify_cgroup_empty(u); + } + } } int manager_setup_cgroup(Manager *m) { _cleanup_free_ char *path = NULL; - int r; + CGroupController c; + int r, unified; + char *e; assert(m); /* 1. Determine hierarchy */ - free(m->cgroup_root); - m->cgroup_root = NULL; - + m->cgroup_root = mfree(m->cgroup_root); r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root); if (r < 0) return log_error_errno(r, "Cannot determine cgroup we are running in: %m"); - /* LEGACY: Already in /system.slice? If so, let's cut this - * off. This is to support live upgrades from older systemd - * versions where PID 1 was moved there. */ - if (m->running_as == MANAGER_SYSTEM) { - char *e; + /* Chop off the init scope, if we are already located in it */ + e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); + /* LEGACY: Also chop off the system slice if we are in + * it. This is to support live upgrades from older systemd + * versions where PID 1 was moved there. Also see + * cg_get_root_path(). */ + if (!e && m->running_as == MANAGER_SYSTEM) { e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE); if (!e) - e = endswith(m->cgroup_root, "/system"); - if (e) - *e = 0; + e = endswith(m->cgroup_root, "/system"); /* even more legacy */ } + if (e) + *e = 0; /* And make sure to store away the root value without trailing * slash, even for the root dir, so that we can easily prepend * it everywhere. */ - if (streq(m->cgroup_root, "/")) - m->cgroup_root[0] = 0; + while ((e = endswith(m->cgroup_root, "/"))) + *e = 0; /* 2. Show data */ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path); if (r < 0) return log_error_errno(r, "Cannot find cgroup mount point: %m"); - log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path); + unified = cg_unified(); + if (unified < 0) + return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m"); + if (unified > 0) + log_debug("Unified cgroup hierarchy is located at %s.", path); + else + log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path); + if (!m->test_run) { + const char *scope_path; /* 3. Install agent */ - if (m->running_as == MANAGER_SYSTEM) { + if (unified) { + + /* In the unified hierarchy we can can get + * cgroup empty notifications via inotify. */ + + m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); + safe_close(m->cgroup_inotify_fd); + + m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (m->cgroup_inotify_fd < 0) + return log_error_errno(errno, "Failed to create control group inotify object: %m"); + + r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m); + if (r < 0) + return log_error_errno(r, "Failed to watch control group inotify object: %m"); + + r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5); + if (r < 0) + return log_error_errno(r, "Failed to set priority of inotify event source: %m"); + + (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify"); + + } else if (m->running_as == MANAGER_SYSTEM) { + + /* On the legacy hierarchy we only get + * notifications via cgroup agents. (Which + * isn't really reliable, since it does not + * generate events when control groups with + * children run empty. */ + r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH); if (r < 0) log_warning_errno(r, "Failed to install release agent, ignoring: %m"); else if (r > 0) log_debug("Installed release agent."); - else + else if (r == 0) log_debug("Release agent already installed."); } - /* 4. Make sure we are in the root cgroup */ - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0); + /* 4. Make sure we are in the special "init.scope" unit in the root slice. */ + scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); if (r < 0) - return log_error_errno(r, "Failed to create root cgroup hierarchy: %m"); + return log_error_errno(r, "Failed to create %s control group: %m", scope_path); + + /* also, move all other userspace processes remaining + * in the root cgroup into that scope. */ + r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false); + if (r < 0) + log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m"); /* 5. And pin it, so that it cannot be unmounted */ safe_close(m->pin_cgroupfs_fd); - m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK); if (m->pin_cgroupfs_fd < 0) return log_error_errno(errno, "Failed to open pin file: %m"); /* 6. Always enable hierarchical support if it exists... */ - cg_set_attribute("memory", "/", "memory.use_hierarchy", "1"); + if (!unified) + (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1"); } /* 7. Figure out which controllers are supported */ - m->cgroup_supported = cg_mask_supported(); + r = cg_mask_supported(&m->cgroup_supported); + if (r < 0) + return log_error_errno(r, "Failed to determine supported controllers: %m"); + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) + log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c)); return 0; } @@ -968,12 +1339,16 @@ void manager_shutdown_cgroup(Manager *m, bool delete) { /* We can't really delete the group, since we are in it. But * let's trim it. */ if (delete && m->cgroup_root) - cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false); + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false); + + m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit); + + m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); + m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd); m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd); - free(m->cgroup_root); - m->cgroup_root = NULL; + m->cgroup_root = mfree(m->cgroup_root); } Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) { @@ -992,8 +1367,8 @@ Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) { char *e; e = strrchr(p, '/'); - if (e == p || !e) - return NULL; + if (!e || e == p) + return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE); *e = 0; @@ -1010,9 +1385,12 @@ Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) { assert(m); - if (pid <= 1) + if (pid <= 0) return NULL; + if (pid == 1) + return hashmap_get(m->units, SPECIAL_INIT_SCOPE); + u = hashmap_get(m->watch_pids1, LONG_TO_PTR(pid)); if (u) return u; @@ -1030,7 +1408,6 @@ Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) { int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { Unit *u; - int r; assert(m); assert(cgroup); @@ -1039,15 +1416,7 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { if (!u) return 0; - r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); - if (r <= 0) - return r; - - if (UNIT_VTABLE(u)->notify_cgroup_empty) - UNIT_VTABLE(u)->notify_cgroup_empty(u); - - unit_add_to_gc_queue(u); - return 0; + return unit_notify_cgroup_empty(u); } int unit_get_memory_current(Unit *u, uint64_t *ret) { @@ -1060,10 +1429,13 @@ int unit_get_memory_current(Unit *u, uint64_t *ret) { if (!u->cgroup_path) return -ENODATA; - if ((u->cgroup_realized_mask & CGROUP_MEMORY) == 0) + if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0) return -ENODATA; - r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v); + if (cg_unified() <= 0) + r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v); + else + r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v); if (r == -ENOENT) return -ENODATA; if (r < 0) @@ -1083,7 +1455,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { if (!u->cgroup_path) return -ENODATA; - if ((u->cgroup_realized_mask & CGROUP_CPUACCT) == 0) + if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0) return -ENODATA; r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 7b38d210fb..1ce21f43f2 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -96,22 +96,32 @@ struct CGroupContext { void cgroup_context_init(CGroupContext *c); void cgroup_context_done(CGroupContext *c); void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix); -void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state); +void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state); -CGroupControllerMask cgroup_context_get_mask(CGroupContext *c); +CGroupMask cgroup_context_get_mask(CGroupContext *c); void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a); void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w); void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b); -CGroupControllerMask unit_get_cgroup_mask(Unit *u); -CGroupControllerMask unit_get_siblings_mask(Unit *u); -CGroupControllerMask unit_get_members_mask(Unit *u); -CGroupControllerMask unit_get_target_mask(Unit *u); +CGroupMask unit_get_own_mask(Unit *u); +CGroupMask unit_get_siblings_mask(Unit *u); +CGroupMask unit_get_members_mask(Unit *u); +CGroupMask unit_get_subtree_mask(Unit *u); + +CGroupMask unit_get_target_mask(Unit *u); +CGroupMask unit_get_enable_mask(Unit *u); void unit_update_cgroup_members_masks(Unit *u); + +char *unit_default_cgroup_path(Unit *u); +int unit_set_cgroup_path(Unit *u, const char *path); + int unit_realize_cgroup(Unit *u); -void unit_destroy_cgroup_if_empty(Unit *u); +void unit_release_cgroup(Unit *u); +void unit_prune_cgroup(Unit *u); +int unit_watch_cgroup(Unit *u); + int unit_attach_pids_to_cgroup(Unit *u); int manager_setup_cgroup(Manager *m); @@ -122,9 +132,8 @@ unsigned manager_dispatch_cgroup_queue(Manager *m); Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup); Unit* manager_get_unit_by_pid(Manager *m, pid_t pid); -pid_t unit_search_main_pid(Unit *u); - -int manager_notify_cgroup_empty(Manager *m, const char *group); +int unit_search_main_pid(Unit *u, pid_t *ret); +int unit_watch_all_pids(Unit *u); int unit_get_memory_current(Unit *u, uint64_t *ret); int unit_get_cpu_usage(Unit *u, nsec_t *ret); @@ -132,5 +141,8 @@ int unit_reset_cpu_usage(Unit *u); bool unit_cgroup_delegate(Unit *u); +int unit_notify_cgroup_empty(Unit *u); +int manager_notify_cgroup_empty(Manager *m, const char *group); + const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_; CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_; diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 9dcc51f240..e8fd44e294 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -228,7 +228,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->cpu_accounting = b; - u->cgroup_realized_mask &= ~CGROUP_CPUACCT; + u->cgroup_realized_mask &= ~CGROUP_MASK_CPUACCT; unit_write_drop_in_private(u, mode, name, b ? "CPUAccounting=yes" : "CPUAccounting=no"); } @@ -252,7 +252,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->cpu_shares = ul; - u->cgroup_realized_mask &= ~CGROUP_CPU; + u->cgroup_realized_mask &= ~CGROUP_MASK_CPU; unit_write_drop_in_private_format(u, mode, name, "CPUShares=%lu", ul); } @@ -276,7 +276,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->startup_cpu_shares = ul; - u->cgroup_realized_mask &= ~CGROUP_CPU; + u->cgroup_realized_mask &= ~CGROUP_MASK_CPU; unit_write_drop_in_private_format(u, mode, name, "StartupCPUShares=%lu", ul); } @@ -294,7 +294,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->cpu_quota_per_sec_usec = u64; - u->cgroup_realized_mask &= ~CGROUP_CPU; + u->cgroup_realized_mask &= ~CGROUP_MASK_CPU; unit_write_drop_in_private_format(u, mode, "CPUQuota", "CPUQuota=%0.f%%", (double) (c->cpu_quota_per_sec_usec / 10000)); } @@ -309,7 +309,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->blockio_accounting = b; - u->cgroup_realized_mask &= ~CGROUP_BLKIO; + u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO; unit_write_drop_in_private(u, mode, name, b ? "BlockIOAccounting=yes" : "BlockIOAccounting=no"); } @@ -333,7 +333,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->blockio_weight = ul; - u->cgroup_realized_mask &= ~CGROUP_BLKIO; + u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO; unit_write_drop_in_private_format(u, mode, name, "BlockIOWeight=%lu", ul); } @@ -357,7 +357,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->startup_blockio_weight = ul; - u->cgroup_realized_mask &= ~CGROUP_BLKIO; + u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO; unit_write_drop_in_private_format(u, mode, name, "StartupBlockIOWeight=%lu", ul); } @@ -427,7 +427,7 @@ int bus_cgroup_set_property( cgroup_context_free_blockio_device_bandwidth(c, a); } - u->cgroup_realized_mask &= ~CGROUP_BLKIO; + u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO; f = open_memstream(&buf, &size); if (!f) @@ -510,7 +510,7 @@ int bus_cgroup_set_property( cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); } - u->cgroup_realized_mask &= ~CGROUP_BLKIO; + u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO; f = open_memstream(&buf, &size); if (!f) @@ -535,7 +535,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->memory_accounting = b; - u->cgroup_realized_mask &= ~CGROUP_MEMORY; + u->cgroup_realized_mask &= ~CGROUP_MASK_MEMORY; unit_write_drop_in_private(u, mode, name, b ? "MemoryAccounting=yes" : "MemoryAccounting=no"); } @@ -550,7 +550,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->memory_limit = limit; - u->cgroup_realized_mask &= ~CGROUP_MEMORY; + u->cgroup_realized_mask &= ~CGROUP_MASK_MEMORY; unit_write_drop_in_private_format(u, mode, name, "%s=%" PRIu64, name, limit); } @@ -572,7 +572,7 @@ int bus_cgroup_set_property( char *buf; c->device_policy = p; - u->cgroup_realized_mask &= ~CGROUP_DEVICE; + u->cgroup_realized_mask &= ~CGROUP_MASK_DEVICE; buf = strjoina("DevicePolicy=", policy); unit_write_drop_in_private(u, mode, name, buf); @@ -651,7 +651,7 @@ int bus_cgroup_set_property( cgroup_context_free_device_allow(c, c->device_allow); } - u->cgroup_realized_mask &= ~CGROUP_DEVICE; + u->cgroup_realized_mask &= ~CGROUP_MASK_DEVICE; f = open_memstream(&buf, &size); if (!f) diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index 1e6291e762..31016b6c4a 100644 --- a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -25,6 +25,7 @@ #include "cgroup-util.h" #include "strv.h" #include "bus-common-errors.h" +#include "special.h" #include "dbus.h" #include "dbus-unit.h" @@ -973,6 +974,8 @@ static int bus_unit_set_transient_property( return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "The slice property is only available for units with control groups."); if (u->type == UNIT_SLICE) return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Slice may not be set for slice units."); + if (unit_has_name(u, SPECIAL_INIT_SCOPE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set slice for init.scope"); r = sd_bus_message_read(message, "s", &s); if (r < 0) diff --git a/src/core/execute.h b/src/core/execute.h index 8d14fe23d0..a750246a89 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -214,7 +214,7 @@ struct ExecParameters { bool apply_tty_stdin; bool confirm_spawn; bool selinux_context_net; - CGroupControllerMask cgroup_supported; + CGroupMask cgroup_supported; const char *cgroup_path; bool cgroup_delegate; const char *runtime_prefix; diff --git a/src/core/manager.c b/src/core/manager.c index 14f069ba97..c3327e37f5 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -568,7 +568,9 @@ int manager_new(ManagerRunningAs running_as, bool test_run, Manager **_m) { m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1; - m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd = m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = -1; + m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd = + m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = + m->cgroup_inotify_fd = -1; m->current_job_id = 1; /* start as id #1, so that we can leave #0 around as "null-like" value */ m->ask_password_inotify_fd = -1; @@ -2722,7 +2724,7 @@ void manager_check_finished(Manager *m) { SET_FOREACH(u, m->startup_units, i) if (u->cgroup_path) - cgroup_context_apply(unit_get_cgroup_context(u), unit_get_cgroup_mask(u), u->cgroup_path, manager_state(m)); + cgroup_context_apply(unit_get_cgroup_context(u), unit_get_own_mask(u), u->cgroup_path, manager_state(m)); } static int create_generator_dir(Manager *m, char **generator, const char *name) { diff --git a/src/core/manager.h b/src/core/manager.h index 3f7fa24e58..9956cb7700 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -215,16 +215,22 @@ struct Manager { /* Data specific to the cgroup subsystem */ Hashmap *cgroup_unit; - CGroupControllerMask cgroup_supported; + CGroupMask cgroup_supported; char *cgroup_root; - int gc_marker; - unsigned n_in_gc_queue; + /* Notifications from cgroups, when the unified hierarchy is + * used is done via inotify. */ + int cgroup_inotify_fd; + sd_event_source *cgroup_inotify_event_source; + Hashmap *cgroup_inotify_wd_unit; /* Make sure the user cannot accidentally unmount our cgroup * file system */ int pin_cgroupfs_fd; + int gc_marker; + unsigned n_in_gc_queue; + /* Flags */ ManagerRunningAs running_as; ManagerExitCode exit_code:5; diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c index 1782d40720..c6f3569915 100644 --- a/src/core/mount-setup.c +++ b/src/core/mount-setup.c @@ -93,12 +93,14 @@ static const MountPoint mount_table[] = { #endif { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { "cgroup", "/sys/fs/cgroup", "cgroup", "__DEVEL__sane_behavior", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_unified_wanted, MNT_FATAL|MNT_IN_CONTAINER }, { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, - NULL, MNT_FATAL|MNT_IN_CONTAINER }, + cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV, - NULL, MNT_IN_CONTAINER }, + cg_is_legacy_wanted, MNT_IN_CONTAINER }, { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV, - NULL, MNT_FATAL|MNT_IN_CONTAINER }, + cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_NONE }, #ifdef ENABLE_EFI @@ -217,6 +219,9 @@ int mount_cgroup_controllers(char ***join_controllers) { _cleanup_set_free_free_ Set *controllers = NULL; int r; + if (!cg_is_legacy_wanted()) + return 0; + /* Mount all available cgroup controllers that are built into the kernel. */ controllers = set_new(&string_hash_ops); diff --git a/src/core/scope.c b/src/core/scope.c index 1e94d63561..44cd324f58 100644 --- a/src/core/scope.c +++ b/src/core/scope.c @@ -22,12 +22,13 @@ #include <errno.h> #include <unistd.h> -#include "unit.h" -#include "scope.h" #include "log.h" -#include "dbus-scope.h" +#include "strv.h" #include "special.h" #include "unit-name.h" +#include "unit.h" +#include "scope.h" +#include "dbus-scope.h" #include "load-dropin.h" static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = { @@ -136,7 +137,9 @@ static int scope_verify(Scope *s) { if (UNIT(s)->load_state != UNIT_LOADED) return 0; - if (set_isempty(UNIT(s)->pids) && UNIT(s)->manager->n_reloading <= 0) { + if (set_isempty(UNIT(s)->pids) && + !manager_is_reloading_or_reexecuting(UNIT(s)->manager) <= 0 && + !unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) { log_unit_error(UNIT(s), "Scope has no PIDs. Refusing."); return -EINVAL; } @@ -151,7 +154,7 @@ static int scope_load(Unit *u) { assert(s); assert(u->load_state == UNIT_STUB); - if (!u->transient && UNIT(s)->manager->n_reloading <= 0) + if (!u->transient && !manager_is_reloading_or_reexecuting(u->manager)) return -ENOENT; u->load_state = UNIT_LOADED; @@ -279,6 +282,9 @@ static int scope_start(Unit *u) { assert(s); + if (unit_has_name(u, SPECIAL_INIT_SCOPE)) + return -EPERM; + if (s->state == SCOPE_FAILED) return -EPERM; @@ -289,7 +295,7 @@ static int scope_start(Unit *u) { assert(s->state == SCOPE_DEAD); - if (!u->transient && UNIT(s)->manager->n_reloading <= 0) + if (!u->transient && !manager_is_reloading_or_reexecuting(u->manager)) return -ENOENT; (void) unit_realize_cgroup(u); @@ -464,6 +470,9 @@ static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *user int scope_abandon(Scope *s) { assert(s); + if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) + return -EPERM; + if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED)) return -ESTALE; @@ -499,6 +508,48 @@ _pure_ static const char *scope_sub_state_to_string(Unit *u) { return scope_state_to_string(SCOPE(u)->state); } +static int scope_enumerate(Manager *m) { + Unit *u; + int r; + + assert(m); + + /* Let's unconditionally add the "init.scope" special unit + * that encapsulates PID 1. Note that PID 1 already is in the + * cgroup for this, we hence just need to allocate the object + * for it and that's it. */ + + u = manager_get_unit(m, SPECIAL_INIT_SCOPE); + if (!u) { + u = unit_new(m, sizeof(Scope)); + if (!u) + return log_oom(); + + r = unit_add_name(u, SPECIAL_INIT_SCOPE); + if (r < 0) { + unit_free(u); + return log_error_errno(r, "Failed to add init.scope name"); + } + } + + u->transient = true; + u->default_dependencies = false; + u->no_gc = true; + SCOPE(u)->deserialized_state = SCOPE_RUNNING; + SCOPE(u)->kill_context.kill_signal = SIGRTMIN+14; + + /* Prettify things, if we can. */ + if (!u->description) + u->description = strdup("System and Service Manager"); + if (!u->documentation) + (void) strv_extend(&u->documentation, "man:systemd(1)"); + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); + + return 0; +} + static const char* const scope_state_table[_SCOPE_STATE_MAX] = { [SCOPE_DEAD] = "dead", [SCOPE_RUNNING] = "running", @@ -565,5 +616,7 @@ const UnitVTable scope_vtable = { .bus_set_property = bus_scope_set_property, .bus_commit_properties = bus_scope_commit_properties, - .can_transient = true + .can_transient = true, + + .enumerate = scope_enumerate, }; diff --git a/src/core/service.c b/src/core/service.c index 5a0a3aa867..292fe50de8 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -767,7 +767,7 @@ static int service_load_pid_file(Service *s, bool may_warn) { } static int service_search_main_pid(Service *s) { - pid_t pid; + pid_t pid = 0; int r; assert(s); @@ -782,9 +782,9 @@ static int service_search_main_pid(Service *s) { assert(s->main_pid <= 0); - pid = unit_search_main_pid(UNIT(s)); - if (pid <= 0) - return -ENOENT; + r = unit_search_main_pid(UNIT(s), &pid); + if (r < 0) + return r; log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid); r = service_set_main_pid(s, pid); @@ -860,7 +860,7 @@ static void service_set_state(Service *s, ServiceState state) { /* For the inactive states unit_notify() will trim the cgroup, * but for exit we have to do that ourselves... */ if (state == SERVICE_EXITED && UNIT(s)->manager->n_reloading <= 0) - unit_destroy_cgroup_if_empty(UNIT(s)); + unit_prune_cgroup(UNIT(s)); /* For remain_after_exit services, let's see if we can "release" the * hold on the console, since unit_notify() only does that in case of @@ -2644,7 +2644,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { break; } } else - service_search_main_pid(s); + (void) service_search_main_pid(s); service_enter_start_post(s); break; @@ -2666,7 +2666,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { break; } } else - service_search_main_pid(s); + (void) service_search_main_pid(s); service_enter_running(s, SERVICE_SUCCESS); break; @@ -2674,7 +2674,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { case SERVICE_RELOAD: if (f == SERVICE_SUCCESS) { service_load_pid_file(s, true); - service_search_main_pid(s); + (void) service_search_main_pid(s); } s->reload_result = f; diff --git a/src/core/slice.c b/src/core/slice.c index 7442d23391..b414462066 100644 --- a/src/core/slice.c +++ b/src/core/slice.c @@ -21,12 +21,13 @@ #include <errno.h> -#include "unit.h" -#include "slice.h" #include "log.h" -#include "dbus-slice.h" +#include "strv.h" #include "special.h" #include "unit-name.h" +#include "unit.h" +#include "slice.h" +#include "dbus-slice.h" static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = { [SLICE_DEAD] = UNIT_INACTIVE, @@ -252,6 +253,40 @@ _pure_ static const char *slice_sub_state_to_string(Unit *u) { return slice_state_to_string(SLICE(u)->state); } +static int slice_enumerate(Manager *m) { + Unit *u; + int r; + + assert(m); + + u = manager_get_unit(m, SPECIAL_ROOT_SLICE); + if (!u) { + u = unit_new(m, sizeof(Slice)); + if (!u) + return log_oom(); + + r = unit_add_name(u, SPECIAL_ROOT_SLICE); + if (r < 0) { + unit_free(u); + return log_error_errno(r, "Failed to add -.slice name"); + } + } + + u->default_dependencies = false; + u->no_gc = true; + SLICE(u)->deserialized_state = SLICE_ACTIVE; + + if (!u->description) + u->description = strdup("Root Slice"); + if (!u->documentation) + (void) strv_extend(&u->documentation, "man:systemd.special(7)"); + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); + + return 0; +} + static const char* const slice_state_table[_SLICE_STATE_MAX] = { [SLICE_DEAD] = "dead", [SLICE_ACTIVE] = "active" @@ -293,6 +328,8 @@ const UnitVTable slice_vtable = { .bus_set_property = bus_slice_set_property, .bus_commit_properties = bus_slice_commit_properties, + .enumerate = slice_enumerate, + .status_message_formats = { .finished_start_job = { [JOB_DONE] = "Created slice %s.", diff --git a/src/core/unit.c b/src/core/unit.c index 34d3adcd3b..8c07c6140d 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -91,6 +91,7 @@ Unit *unit_new(Manager *m, size_t size) { u->unit_file_state = _UNIT_FILE_STATE_INVALID; u->unit_file_preset = -1; u->on_failure_job_mode = JOB_REPLACE; + u->cgroup_inotify_wd = -1; RATELIMIT_INIT(u->auto_stop_ratelimit, 10 * USEC_PER_SEC, 16); @@ -525,10 +526,7 @@ void unit_free(Unit *u) { if (u->in_cgroup_queue) LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u); - if (u->cgroup_path) { - hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); - free(u->cgroup_path); - } + unit_release_cgroup(u); manager_update_failed_units(u->manager, u, false); set_remove(u->manager->startup_units, u); @@ -1801,7 +1799,7 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su /* Make sure the cgroup is always removed when we become inactive */ if (UNIT_IS_INACTIVE_OR_FAILED(ns)) - unit_destroy_cgroup_if_empty(u); + unit_prune_cgroup(u); /* Note that this doesn't apply to RemainAfterExit services exiting * successfully, since there's no change of state in that case. Which is @@ -2028,70 +2026,7 @@ void unit_unwatch_all_pids(Unit *u) { while (!set_isempty(u->pids)) unit_unwatch_pid(u, PTR_TO_LONG(set_first(u->pids))); - set_free(u->pids); - u->pids = NULL; -} - -static int unit_watch_pids_in_path(Unit *u, const char *path) { - _cleanup_closedir_ DIR *d = NULL; - _cleanup_fclose_ FILE *f = NULL; - int ret = 0, r; - - assert(u); - assert(path); - - /* Adds all PIDs from a specific cgroup path to the set of PIDs we watch. */ - - r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f); - if (r >= 0) { - pid_t pid; - - while ((r = cg_read_pid(f, &pid)) > 0) { - r = unit_watch_pid(u, pid); - if (r < 0 && ret >= 0) - ret = r; - } - if (r < 0 && ret >= 0) - ret = r; - - } else if (ret >= 0) - ret = r; - - r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); - if (r >= 0) { - char *fn; - - while ((r = cg_read_subgroup(d, &fn)) > 0) { - _cleanup_free_ char *p = NULL; - - p = strjoin(path, "/", fn, NULL); - free(fn); - - if (!p) - return -ENOMEM; - - r = unit_watch_pids_in_path(u, p); - if (r < 0 && ret >= 0) - ret = r; - } - if (r < 0 && ret >= 0) - ret = r; - - } else if (ret >= 0) - ret = r; - - return ret; -} - -int unit_watch_all_pids(Unit *u) { - assert(u); - - /* Adds all PIDs from our cgroup to the set of PIDs we watch */ - - if (!u->cgroup_path) - return -ENOENT; - - return unit_watch_pids_in_path(u, u->cgroup_path); + u->pids = set_free(u->pids); } void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) { @@ -2400,31 +2335,6 @@ char *unit_dbus_path(Unit *u) { return unit_dbus_path_from_name(u->id); } -char *unit_default_cgroup_path(Unit *u) { - _cleanup_free_ char *escaped = NULL, *slice = NULL; - int r; - - assert(u); - - if (unit_has_name(u, SPECIAL_ROOT_SLICE)) - return strdup(u->manager->cgroup_root); - - if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) { - r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice); - if (r < 0) - return NULL; - } - - escaped = cg_escape(u->id); - if (!escaped) - return NULL; - - if (slice) - return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL); - else - return strjoin(u->manager->cgroup_root, "/", escaped, NULL); -} - int unit_set_slice(Unit *u, Unit *slice) { assert(u); assert(slice); @@ -2447,6 +2357,10 @@ int unit_set_slice(Unit *u, Unit *slice) { if (slice->type != UNIT_SLICE) return -EINVAL; + if (unit_has_name(u, SPECIAL_INIT_SCOPE) && + !unit_has_name(slice, SPECIAL_ROOT_SLICE)) + return -EPERM; + if (UNIT_DEREF(u->slice) == slice) return 0; @@ -2495,7 +2409,7 @@ int unit_set_default_slice(Unit *u) { slice_name = b; } else slice_name = - u->manager->running_as == MANAGER_SYSTEM + u->manager->running_as == MANAGER_SYSTEM && !unit_has_name(u, SPECIAL_INIT_SCOPE) ? SPECIAL_SYSTEM_SLICE : SPECIAL_ROOT_SLICE; @@ -2704,40 +2618,6 @@ void unit_serialize_item(Unit *u, FILE *f, const char *key, const char *value) { fprintf(f, "%s=%s\n", key, value); } -static int unit_set_cgroup_path(Unit *u, const char *path) { - _cleanup_free_ char *p = NULL; - int r; - - assert(u); - - if (path) { - p = strdup(path); - if (!p) - return -ENOMEM; - } else - p = NULL; - - if (streq_ptr(u->cgroup_path, p)) - return 0; - - if (p) { - r = hashmap_put(u->manager->cgroup_unit, p, u); - if (r < 0) - return r; - } - - if (u->cgroup_path) { - log_unit_debug(u, "Changing cgroup path from %s to %s.", u->cgroup_path, strna(p)); - hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); - free(u->cgroup_path); - } - - u->cgroup_path = p; - p = NULL; - - return 0; -} - int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { ExecRuntime **rt = NULL; size_t offset; @@ -2868,6 +2748,8 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { if (r < 0) log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v); + (void) unit_watch_cgroup(u); + continue; } else if (streq(l, "cgroup-realized")) { int b; @@ -3600,18 +3482,22 @@ int unit_kill_context( } else if (r > 0) { - /* FIXME: For now, we will not wait for the - * cgroup members to die if we are running in - * a container or if this is a delegation - * unit, simply because cgroup notification is - * unreliable in these cases. It doesn't work - * at all in containers, and outside of - * containers it can be confused easily by - * left-over directories in the cgroup -- - * which however should not exist in - * non-delegated units. */ - - if (detect_container(NULL) == 0 && !unit_cgroup_delegate(u)) + /* FIXME: For now, on the legacy hierarchy, we + * will not wait for the cgroup members to die + * if we are running in a container or if this + * is a delegation unit, simply because cgroup + * notification is unreliable in these + * cases. It doesn't work at all in + * containers, and outside of containers it + * can be confused easily by left-over + * directories in the cgroup -- which however + * should not exist in non-delegated units. On + * the unified hierarchy that's different, + * there we get proper events. Hence rely on + * them.*/ + + if (cg_unified() > 0 || + (detect_container(NULL) == 0 && !unit_cgroup_delegate(u))) wait_for_exit = true; if (c->send_sighup && k != KILL_KILL) { diff --git a/src/core/unit.h b/src/core/unit.h index bc26653247..3c7684411b 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -184,9 +184,10 @@ struct Unit { /* Counterparts in the cgroup filesystem */ char *cgroup_path; - CGroupControllerMask cgroup_realized_mask; - CGroupControllerMask cgroup_subtree_mask; - CGroupControllerMask cgroup_members_mask; + CGroupMask cgroup_realized_mask; + CGroupMask cgroup_subtree_mask; + CGroupMask cgroup_members_mask; + int cgroup_inotify_wd; /* How to start OnFailure units */ JobMode on_failure_job_mode; @@ -522,7 +523,6 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su int unit_watch_pid(Unit *u, pid_t pid); void unit_unwatch_pid(Unit *u, pid_t pid); -int unit_watch_all_pids(Unit *u); void unit_unwatch_all_pids(Unit *u); void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2); @@ -567,8 +567,6 @@ bool unit_active_or_pending(Unit *u); int unit_add_default_target_dependency(Unit *u, Unit *target); -char *unit_default_cgroup_path(Unit *u); - void unit_start_on_failure(Unit *u); void unit_trigger_notify(Unit *u); |