From d8fdc62037b5b0a9fd603ad5efd6b49f956f86b5 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 4 May 2016 20:43:23 +0200 Subject: core: use an AF_UNIX/SOCK_DGRAM socket for cgroup agent notification dbus-daemon currently uses a backlog of 30 on its D-bus system bus socket. On overloaded systems this means that only 30 connections may be queued without dbus-daemon processing them before further connection attempts fail. Our cgroups-agent binary so far used D-Bus for its messaging, and hitting this limit hence may result in us losing cgroup empty messages. This patch adds a seperate cgroup agent socket of type AF_UNIX/SOCK_DGRAM. Since sockets of these types need no connection set up, no listen() backlog applies. Our cgroup-agent binary will hence simply block as long as it can't enqueue its datagram message, so that we won't lose cgroup empty messages as likely anymore. This also rearranges the ordering of the processing of SIGCHLD signals, service notification messages (sd_notify()...) and the two types of cgroup notifications (inotify for the unified hierarchy support, and agent for the classic hierarchy support). We now always process events for these in the following order: 1. service notification messages (SD_EVENT_PRIORITY_NORMAL-7) 2. SIGCHLD signals (SD_EVENT_PRIORITY_NORMAL-6) 3. cgroup inotify and cgroup agent (SD_EVENT_PRIORITY_NORMAL-5) This is because when receiving SIGCHLD we invalidate PID information, which we need to process the service notification messages which are bound to PIDs. Hence the order between the first two items. And we want to process SIGCHLD metadata to detect whether a service is gone, before using cgroup notifications, to decide when a service is gone, since the former carries more useful metadata. Related to this: https://bugs.freedesktop.org/show_bug.cgi?id=95264 https://github.com/systemd/systemd/issues/1961 --- src/core/cgroup.c | 6 ++- src/core/dbus.c | 73 ++++++++++--------------- src/core/dbus.h | 2 + src/core/manager.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++---- src/core/manager.h | 3 ++ 5 files changed, 184 insertions(+), 55 deletions(-) (limited to 'src/core') diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 25cc6962f9..1a94b188cb 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1312,7 +1312,9 @@ int manager_setup_cgroup(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to watch control group inotify object: %m"); - r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5); + /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also + * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */ + r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5); if (r < 0) return log_error_errno(r, "Failed to set priority of inotify event source: %m"); @@ -1458,6 +1460,8 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { assert(m); assert(cgroup); + log_debug("Got cgroup empty notification for: %s", cgroup); + u = manager_get_unit_by_cgroup(m, cgroup); if (!u) return 0; diff --git a/src/core/dbus.c b/src/core/dbus.c index 263955d874..c8375a0475 100644 --- a/src/core/dbus.c +++ b/src/core/dbus.c @@ -71,28 +71,42 @@ int bus_send_queued_message(Manager *m) { return 0; } +int bus_forward_agent_released(Manager *m, const char *path) { + int r; + + assert(m); + assert(path); + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + if (!m->system_bus) + return 0; + + /* If we are running a system instance we forward the agent message on the system bus, so that the user + * instances get notified about this, too */ + + r = sd_bus_emit_signal(m->system_bus, + "/org/freedesktop/systemd1/agent", + "org.freedesktop.systemd1.Agent", + "Released", + "s", path); + if (r < 0) + return log_warning_errno(r, "Failed to propagate agent release message: %m"); + + return 1; +} + static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus_error *error) { _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; - const char *cgroup, *me; Manager *m = userdata; + const char *cgroup; uid_t sender_uid; - sd_bus *bus; int r; assert(message); assert(m); - /* ignore recursive events sent by us on the system/user bus */ - bus = sd_bus_message_get_bus(message); - if (!sd_bus_is_server(bus)) { - r = sd_bus_get_unique_name(bus, &me); - if (r < 0) - return r; - - if (streq_ptr(sd_bus_message_get_sender(message), me)) - return 0; - } - /* only accept org.freedesktop.systemd1.Agent from UID=0 */ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); if (r < 0) @@ -110,16 +124,6 @@ static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus } manager_notify_cgroup_empty(m, cgroup); - - /* if running as system-instance, forward under our name */ - if (MANAGER_IS_SYSTEM(m) && m->system_bus) { - r = sd_bus_message_rewind(message, 1); - if (r >= 0) - r = sd_bus_send(m->system_bus, message, NULL); - if (r < 0) - log_warning_errno(r, "Failed to forward Released message: %m"); - } - return 0; } @@ -690,25 +694,6 @@ static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void return 0; } - if (MANAGER_IS_SYSTEM(m)) { - /* When we run as system instance we get the Released - * signal via a direct connection */ - - r = sd_bus_add_match( - bus, - NULL, - "type='signal'," - "interface='org.freedesktop.systemd1.Agent'," - "member='Released'," - "path='/org/freedesktop/systemd1/agent'", - signal_agent_released, m); - - if (r < 0) { - log_warning_errno(r, "Failed to register Released match on new connection bus: %m"); - return 0; - } - } - r = bus_setup_disconnected_match(m, bus); if (r < 0) return 0; @@ -906,8 +891,8 @@ static int bus_setup_system(Manager *m, sd_bus *bus) { assert(m); assert(bus); - /* On kdbus or if we are a user instance we get the Released message via the system bus */ - if (MANAGER_IS_USER(m) || m->kdbus_fd >= 0) { + /* if we are a user instance we get the Released message via the system bus */ + if (MANAGER_IS_USER(m)) { r = sd_bus_add_match( bus, NULL, diff --git a/src/core/dbus.h b/src/core/dbus.h index e16a84fbb8..6baaffbd75 100644 --- a/src/core/dbus.h +++ b/src/core/dbus.h @@ -40,3 +40,5 @@ int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error); int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error); int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error); + +int bus_forward_agent_released(Manager *m, const char *path); diff --git a/src/core/manager.c b/src/core/manager.c index bd00c224f4..17b940c11a 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -87,6 +87,7 @@ #include "watchdog.h" #define NOTIFY_RCVBUF_SIZE (8*1024*1024) +#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024) /* Initial delay and the interval for printing status messages about running jobs */ #define JOBS_IN_PROGRESS_WAIT_USEC (5*USEC_PER_SEC) @@ -94,6 +95,7 @@ #define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3 static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); @@ -484,11 +486,11 @@ static int manager_setup_signals(Manager *m) { (void) sd_event_source_set_description(m->signal_event_source, "manager-signal"); - /* Process signals a bit earlier than the rest of things, but - * later than notify_fd processing, so that the notify - * processing can still figure out to which process/service a - * message belongs, before we reap the process. */ - r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-5); + /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the + * notify processing can still figure out to which process/service a message belongs, before we reap the + * process. Also, process this before handling cgroup notifications, so that we always collect child exit + * status information before detecting that there's no process in a cgroup. */ + r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6); if (r < 0) return r; @@ -581,12 +583,12 @@ int manager_new(UnitFileScope scope, bool test_run, Manager **_m) { m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1; - m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd = - m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->cgroup_inotify_fd = -1; + m->pin_cgroupfs_fd = m->notify_fd = m->cgroups_agent_fd = m->signal_fd = m->time_change_fd = + m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->cgroup_inotify_fd = + m->ask_password_inotify_fd = -1; m->current_job_id = 1; /* start as id #1, so that we can leave #0 around as "null-like" value */ - m->ask_password_inotify_fd = -1; m->have_ask_password = -EINVAL; /* we don't know */ m->first_boot = -1; @@ -722,8 +724,8 @@ static int manager_setup_notify(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to allocate notify event source: %m"); - /* Process signals a bit earlier than SIGCHLD, so that we can - * still identify to which service an exit message belongs */ + /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which + * service an exit message belongs. */ r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-7); if (r < 0) return log_error_errno(r, "Failed to set priority of notify event source: %m"); @@ -734,6 +736,79 @@ static int manager_setup_notify(Manager *m) { return 0; } +static int manager_setup_cgroups_agent(Manager *m) { + + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/cgroups-agent", + }; + int r; + + /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering + * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and + * each instance of it needs a new D-Bus connection. Since D-Bus connections are SOCK_STREAM/AF_UNIX, on + * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number + * of D-Bus connections may be queued until the kernel will start dropping further incoming connections, + * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX + * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and + * we thus won't lose messages. + * + * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen + * to it. The system instance hence listens on this special socket, but the user instances listen on the system + * bus for these messages. */ + + if (m->test_run) + return 0; + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + if (cg_unified() > 0) /* We don't need this anymore on the unified hierarchy */ + return 0; + + if (m->cgroups_agent_fd < 0) { + _cleanup_close_ int fd = -1; + + /* First free all secondary fields */ + m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m"); + + fd_inc_rcvbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE); + + (void) unlink(sa.un.sun_path); + + /* Only allow root to connect to this socket */ + RUN_WITH_UMASK(0077) + r = bind(fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path)); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + m->cgroups_agent_fd = fd; + fd = -1; + } + + if (!m->cgroups_agent_event_source) { + r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate cgroups agent event source: %m"); + + /* Process cgroups notifications early, but after having processed service notification messages or + * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification, + * and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of + * cgroup inotify for the unified cgroup stuff. */ + r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-5); + if (r < 0) + return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m"); + + (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent"); + } + + return 0; +} + static int manager_setup_kdbus(Manager *m) { _cleanup_free_ char *p = NULL; @@ -944,12 +1019,14 @@ Manager* manager_free(Manager *m) { sd_event_source_unref(m->signal_event_source); sd_event_source_unref(m->notify_event_source); + sd_event_source_unref(m->cgroups_agent_event_source); sd_event_source_unref(m->time_change_event_source); sd_event_source_unref(m->jobs_in_progress_event_source); sd_event_source_unref(m->run_queue_event_source); safe_close(m->signal_fd); safe_close(m->notify_fd); + safe_close(m->cgroups_agent_fd); safe_close(m->time_change_fd); safe_close(m->kdbus_fd); @@ -1142,6 +1219,10 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds) { if (q < 0 && r == 0) r = q; + q = manager_setup_cgroups_agent(m); + if (q < 0 && r == 0) + r = q; + /* We might have deserialized the kdbus control fd, but if we * didn't, then let's create the bus now. */ manager_setup_kdbus(m); @@ -1479,6 +1560,35 @@ static unsigned manager_dispatch_dbus_queue(Manager *m) { return n; } +static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + char buf[PATH_MAX+1]; + ssize_t n; + + n = recv(fd, buf, sizeof(buf), 0); + if (n < 0) + return log_error_errno(errno, "Failed to read cgroups agent message: %m"); + if (n == 0) { + log_error("Got zero-length cgroups agent message, ignoring."); + return 0; + } + if ((size_t) n >= sizeof(buf)) { + log_error("Got overly long cgroups agent message, ignoring."); + return 0; + } + + if (memchr(buf, 0, n)) { + log_error("Got cgroups agent message with embedded NUL byte, ignoring."); + return 0; + } + buf[n] = 0; + + manager_notify_cgroup_empty(m, buf); + bus_forward_agent_released(m, buf); + + return 0; +} + static void manager_invoke_notify_message(Manager *m, Unit *u, pid_t pid, const char *buf, size_t n, FDSet *fds) { _cleanup_strv_free_ char **tags = NULL; @@ -2265,6 +2375,16 @@ int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root) { fprintf(f, "notify-socket=%s\n", m->notify_socket); } + if (m->cgroups_agent_fd >= 0) { + int copy; + + copy = fdset_put_dup(fds, m->cgroups_agent_fd); + if (copy < 0) + return copy; + + fprintf(f, "cgroups-agent-fd=%i\n", copy); + } + if (m->kdbus_fd >= 0) { int copy; @@ -2432,6 +2552,17 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { free(m->notify_socket); m->notify_socket = n; + } else if (startswith(l, "cgroups-agent-fd=")) { + int fd; + + if (safe_atoi(l + 17, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_debug("Failed to parse cgroups agent fd: %s", l + 10); + else { + m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); + safe_close(m->cgroups_agent_fd); + m->cgroups_agent_fd = fdset_remove(fds, fd); + } + } else if (startswith(l, "kdbus-fd=")) { int fd; @@ -2552,6 +2683,10 @@ int manager_reload(Manager *m) { if (q < 0 && r >= 0) r = q; + q = manager_setup_cgroups_agent(m); + if (q < 0 && r >= 0) + r = q; + /* Third, fire things up! */ manager_coldplug(m); diff --git a/src/core/manager.h b/src/core/manager.h index 17f84e6963..4bccca75cb 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -132,6 +132,9 @@ struct Manager { int notify_fd; sd_event_source *notify_event_source; + int cgroups_agent_fd; + sd_event_source *cgroups_agent_event_source; + int signal_fd; sd_event_source *signal_event_source; -- cgit v1.2.3-54-g00ecf