From 8d6e80343a1463afbaed1beca4b18c49ce056034 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 6 Dec 2016 15:51:26 +0100 Subject: util-lib: improve container detection logic Previously, systemd-detect-virt was unable to detect "systemd-nspawn -a" container environments, i.e. where PID 1 is a stub process running in host context, as in that case /proc/1/environ was inherited from the host. Let's improve that, and add an additional check for container environments where /proc/1/environ is not cleaned up and does not contain the $container environment variable: The /proc/1/sched file shows the host PID in the first line. if this is not 1, we know we are running in a PID namespace (but not which implementation). With these changes we should be able to detect container environments that don't set $container at all. --- src/basic/virt.c | 81 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 37 deletions(-) (limited to 'src') diff --git a/src/basic/virt.c b/src/basic/virt.c index 9b7eb71319..33641e6886 100644 --- a/src/basic/virt.c +++ b/src/basic/virt.c @@ -409,8 +409,7 @@ int detect_container(void) { if (cached_found >= 0) return cached_found; - /* /proc/vz exists in container and outside of the container, - * /proc/bc only outside of the container. */ + /* /proc/vz exists in container and outside of the container, /proc/bc only outside of the container. */ if (access("/proc/vz", F_OK) >= 0 && access("/proc/bc", F_OK) < 0) { r = VIRTUALIZATION_OPENVZ; @@ -418,50 +417,58 @@ int detect_container(void) { } if (getpid() == 1) { - /* If we are PID 1 we can just check our own - * environment variable */ + /* If we are PID 1 we can just check our own environment variable, and that's authoritative. */ e = getenv("container"); if (isempty(e)) { r = VIRTUALIZATION_NONE; goto finish; } - } else { - - /* Otherwise, PID 1 dropped this information into a - * file in /run. This is better than accessing - * /proc/1/environ, since we don't need CAP_SYS_PTRACE - * for that. */ - - r = read_one_line_file("/run/systemd/container", &m); - if (r == -ENOENT) { - - /* Fallback for cases where PID 1 was not - * systemd (for example, cases where - * init=/bin/sh is used. */ - - r = getenv_for_pid(1, "container", &m); - if (r <= 0) { - - /* If that didn't work, give up, - * assume no container manager. - * - * Note: This means we still cannot - * detect containers if init=/bin/sh - * is passed but privileges dropped, - * as /proc/1/environ is only readable - * with privileges. */ - - r = VIRTUALIZATION_NONE; - goto finish; - } - } - if (r < 0) - return r; + goto translate_name; + } + + /* Otherwise, PID 1 might have dropped this information into a file in /run. This is better than accessing + * /proc/1/environ, since we don't need CAP_SYS_PTRACE for that. */ + r = read_one_line_file("/run/systemd/container", &m); + if (r >= 0) { + e = m; + goto translate_name; + } + if (r != -ENOENT) + return log_debug_errno(r, "Failed to read /run/systemd/container: %m"); + + /* Fallback for cases where PID 1 was not systemd (for example, cases where init=/bin/sh is used. */ + r = getenv_for_pid(1, "container", &m); + if (r > 0) { e = m; + goto translate_name; } + if (r < 0) /* This only works if we have CAP_SYS_PTRACE, hence let's better ignore failures here */ + log_debug_errno(r, "Failed to read $container of PID 1, ignoring: %m"); + + /* Interestingly /proc/1/sched actually shows the host's PID for what we see as PID 1. Hence, if the PID shown + * there is not 1, we know we are in a PID namespace. and hence a container. */ + r = read_one_line_file("/proc/1/sched", &m); + if (r >= 0) { + const char *t; + + t = strrchr(m, '('); + if (!t) + return -EIO; + + if (!startswith(t, "(1,")) { + r = VIRTUALIZATION_CONTAINER_OTHER; + goto finish; + } + } else if (r != -ENOENT) + return r; + + /* If that didn't work, give up, assume no container manager. */ + r = VIRTUALIZATION_NONE; + goto finish; +translate_name: for (j = 0; j < ELEMENTSOF(value_table); j++) if (streq(e, value_table[j].value)) { r = value_table[j].id; @@ -471,7 +478,7 @@ int detect_container(void) { r = VIRTUALIZATION_CONTAINER_OTHER; finish: - log_debug("Found container virtualization %s", virtualization_to_string(r)); + log_debug("Found container virtualization %s.", virtualization_to_string(r)); cached_found = r; return r; } -- cgit v1.2.3-54-g00ecf From 75bf701f5c5ff4ce0679d7bcc7000aa497e1f6ad Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 6 Dec 2016 18:19:23 +0100 Subject: nspawn: flush out environment block of the -a stub init process The container detection code in virt.c we ship checks for /proc/1/environ, looking for "container=" in it. Let's make sure our "-a" init stub exposes that correctly. Without this "systemd-detect-virt" run in a "-a" container won't detect that it is being run in a container. --- src/nspawn/nspawn-stub-pid1.c | 29 ++++++++++++++++++++++++++++- src/nspawn/nspawn-stub-pid1.h | 4 +++- src/nspawn/nspawn.c | 2 +- 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/nspawn/nspawn-stub-pid1.c b/src/nspawn/nspawn-stub-pid1.c index 2de87e3c63..38ab37367e 100644 --- a/src/nspawn/nspawn-stub-pid1.c +++ b/src/nspawn/nspawn-stub-pid1.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "fd-util.h" #include "log.h" @@ -29,7 +30,22 @@ #include "time-util.h" #include "def.h" -int stub_pid1(void) { +static int reset_environ(const char *new_environment, size_t length) { + unsigned long start, end; + + start = (unsigned long) new_environment; + end = start + length; + + if (prctl(PR_SET_MM, PR_SET_MM_ENV_START, start, 0, 0) < 0) + return -errno; + + if (prctl(PR_SET_MM, PR_SET_MM_ENV_END, end, 0, 0) < 0) + return -errno; + + return 0; +} + +int stub_pid1(sd_id128_t uuid) { enum { STATE_RUNNING, STATE_REBOOT, @@ -41,6 +57,11 @@ int stub_pid1(void) { pid_t pid; int r; + /* The new environment we set up, on the stack. */ + char new_environment[] = + "container=systemd-nspawn\0" + "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"; + /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful * for allowing arbitrary processes run in a container, and still have all zombies reaped. */ @@ -64,6 +85,12 @@ int stub_pid1(void) { close_all_fds(NULL, 0); log_open(); + /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also, + * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ + * find them set set. */ + sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX); + reset_environ(new_environment, sizeof(new_environment)); + rename_process("STUBINIT"); assert_se(sigemptyset(&waitmask) >= 0); diff --git a/src/nspawn/nspawn-stub-pid1.h b/src/nspawn/nspawn-stub-pid1.h index 36c1aaf5dd..7ca83078c0 100644 --- a/src/nspawn/nspawn-stub-pid1.h +++ b/src/nspawn/nspawn-stub-pid1.h @@ -19,4 +19,6 @@ along with systemd; If not, see . ***/ -int stub_pid1(void); +#include "sd-id128.h" + +int stub_pid1(sd_id128_t uuid); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 080bd7c31e..dcc639f15c 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2278,7 +2278,7 @@ static int inner_child( return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir); if (arg_start_mode == START_PID2) { - r = stub_pid1(); + r = stub_pid1(arg_uuid); if (r < 0) return r; } -- cgit v1.2.3-54-g00ecf From 493097eecc03105892740b8b0fe9739296130703 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 6 Dec 2016 19:34:18 +0100 Subject: journalctl: improve wording in an errors message Fixes: #4660 --- src/journal/journalctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/journal/journalctl.c b/src/journal/journalctl.c index 10d3ff3b45..ecd1e94a33 100644 --- a/src/journal/journalctl.c +++ b/src/journal/journalctl.c @@ -2318,7 +2318,7 @@ int main(int argc, char *argv[]) { if (arg_boot_offset != 0 && sd_journal_has_runtime_files(j) > 0 && sd_journal_has_persistent_files(j) == 0) { - log_info("Specifying boot ID has no effect, no persistent journal was found"); + log_info("Specifying boot ID or boot offset has no effect, no persistent journal was found."); r = 0; goto finish; } -- cgit v1.2.3-54-g00ecf From 61f638e5446d0d4a5b5e7f81c174e4f072bd01f7 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 6 Dec 2016 19:35:31 +0100 Subject: machinectl: make "machinectl -E … shell" work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: #4823 --- src/machine/machinectl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/machine/machinectl.c b/src/machine/machinectl.c index 3294ea7821..36c2607ba9 100644 --- a/src/machine/machinectl.c +++ b/src/machine/machinectl.c @@ -2769,7 +2769,7 @@ static int parse_argv(int argc, char *argv[]) { assert(argv); for (;;) { - static const char option_string[] = "-hp:als:H:M:qn:o:"; + static const char option_string[] = "-hp:als:H:M:qn:o:E:"; c = getopt_long(argc, argv, option_string + reorder, options, NULL); if (c < 0) -- cgit v1.2.3-54-g00ecf From e932f5407ef5ad05d25d7dfefa4cda0fe81cc346 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 6 Dec 2016 19:36:30 +0100 Subject: sysv-generator: properly translate sysv facilities We used the wrong return value in one case, so that our translations were thrown away. While we are at it, make sure to always initialize *ret on successful function exits. Fixes: #4762 --- src/sysv-generator/sysv-generator.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/sysv-generator/sysv-generator.c b/src/sysv-generator/sysv-generator.c index 921fd478d0..9fde9b1884 100644 --- a/src/sysv-generator/sysv-generator.c +++ b/src/sysv-generator/sysv-generator.c @@ -292,8 +292,10 @@ static int sysv_translate_facility(SysvStub *s, unsigned line, const char *name, if (!streq(table[i], n)) continue; - if (!table[i+1]) + if (!table[i+1]) { + *ret = NULL; return 0; + } m = strdup(table[i+1]); if (!m) @@ -312,7 +314,7 @@ static int sysv_translate_facility(SysvStub *s, unsigned line, const char *name, if (r < 0) return log_error_errno(r, "[%s:%u] Could not build name for facility %s: %m", s->path, line, name); - return r; + return 1; } /* Strip ".sh" suffix from file name for comparison */ @@ -324,8 +326,10 @@ static int sysv_translate_facility(SysvStub *s, unsigned line, const char *name, } /* Names equaling the file name of the services are redundant */ - if (streq_ptr(n, filename)) + if (streq_ptr(n, filename)) { + *ret = NULL; return 0; + } /* Everything else we assume to be normal service names */ m = sysv_translate_name(n); -- cgit v1.2.3-54-g00ecf From 9bfaffd5a9c138b0bbdfb452b3f4435746f2d18b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 6 Dec 2016 20:29:07 +0100 Subject: util-lib: rework rename_process() to be able to make use of PR_SET_MM_ARG_START PR_SET_MM_ARG_START allows us to relatively cleanly implement process renaming. However, it's only available with privileges. Hence, let's try to make use of it, and if we can't fall back to the traditional way of overriding argv[0]. This removes size restrictions on the process name shown in argv[] at least for privileged processes. --- src/basic/process-util.c | 102 +++++++++++++++++++++++++++++++++++++------ src/basic/process-util.h | 2 +- src/test/test-process-util.c | 61 ++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/basic/process-util.c b/src/basic/process-util.c index 1f4c2e4e43..d5e7edb589 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -274,27 +275,100 @@ int get_process_cmdline(pid_t pid, size_t max_length, bool comm_fallback, char * return 0; } -void rename_process(const char name[8]) { - assert(name); +int rename_process(const char name[]) { + static size_t mm_size = 0; + static char *mm = NULL; + bool truncated = false; + size_t l; + + /* This is a like a poor man's setproctitle(). It changes the comm field, argv[0], and also the glibc's + * internally used name of the process. For the first one a limit of 16 chars applies; to the second one in + * many cases one of 10 (i.e. length of "/sbin/init") — however if we have CAP_SYS_RESOURCES it is unbounded; + * to the third one 7 (i.e. the length of "systemd". If you pass a longer string it will likely be + * truncated. + * + * Returns 0 if a name was set but truncated, > 0 if it was set but not truncated. */ + + if (isempty(name)) + return -EINVAL; /* let's not confuse users unnecessarily with an empty name */ - /* This is a like a poor man's setproctitle(). It changes the - * comm field, argv[0], and also the glibc's internally used - * name of the process. For the first one a limit of 16 chars - * applies, to the second one usually one of 10 (i.e. length - * of "/sbin/init"), to the third one one of 7 (i.e. length of - * "systemd"). If you pass a longer string it will be - * truncated */ + l = strlen(name); + /* First step, change the comm field. */ (void) prctl(PR_SET_NAME, name); + if (l > 15) /* Linux process names can be 15 chars at max */ + truncated = true; + + /* Second step, change glibc's ID of the process name. */ + if (program_invocation_name) { + size_t k; + + k = strlen(program_invocation_name); + strncpy(program_invocation_name, name, k); + if (l > k) + truncated = true; + } + + /* Third step, completely replace the argv[] array the kernel maintains for us. This requires privileges, but + * has the advantage that the argv[] array is exactly what we want it to be, and not filled up with zeros at + * the end. This is the best option for changing /proc/self/cmdline.*/ + if (mm_size < l+1) { + size_t nn_size; + char *nn; + + /* Let's not bother with this if we don't have euid == 0. Strictly speaking if people do weird stuff + * with capabilities this could work even for euid != 0, but our own code generally doesn't do that, + * hence let's use this as quick bypass check, to avoid calling mmap() if PR_SET_MM_ARG_START fails + * with EPERM later on anyway. After all geteuid() is dead cheap to call, but mmap() is not. */ + if (geteuid() != 0) { + log_debug("Skipping PR_SET_MM_ARG_START, as we don't have privileges."); + goto use_saved_argv; + } + + nn_size = PAGE_ALIGN(l+1); + nn = mmap(NULL, nn_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (nn == MAP_FAILED) { + log_debug_errno(errno, "mmap() failed: %m"); + goto use_saved_argv; + } + + strncpy(nn, name, nn_size); + + /* Now, let's tell the kernel about this new memory */ + if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long) nn, 0, 0) < 0) { + log_debug_errno(errno, "PR_SET_MM_ARG_START failed, proceeding without: %m"); + (void) munmap(nn, nn_size); + goto use_saved_argv; + } + + /* And update the end pointer to the new end, too. If this fails, we don't really know what to do, it's + * pretty unlikely that we can rollback, hence we'll just accept the failure, and continue. */ + if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long) nn + l + 1, 0, 0) < 0) + log_debug_errno(errno, "PR_SET_MM_ARG_END failed, proceeding without: %m"); - if (program_invocation_name) - strncpy(program_invocation_name, name, strlen(program_invocation_name)); + if (mm) + (void) munmap(mm, mm_size); + + mm = nn; + mm_size = nn_size; + } else + strncpy(mm, name, mm_size); + +use_saved_argv: + /* Fourth step: in all cases we'll also update the original argv[], so that our own code gets it right too if + * it still looks here */ if (saved_argc > 0) { int i; - if (saved_argv[0]) - strncpy(saved_argv[0], name, strlen(saved_argv[0])); + if (saved_argv[0]) { + size_t k; + + k = strlen(saved_argv[0]); + strncpy(saved_argv[0], name, k); + if (l > k) + truncated = true; + } for (i = 1; i < saved_argc; i++) { if (!saved_argv[i]) @@ -303,6 +377,8 @@ void rename_process(const char name[8]) { memzero(saved_argv[i], strlen(saved_argv[i])); } } + + return !truncated; } int is_kernel_thread(pid_t pid) { diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 89dfeb4d6a..d378901399 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -64,7 +64,7 @@ void sigkill_waitp(pid_t *pid); int kill_and_sigcont(pid_t pid, int sig); -void rename_process(const char name[8]); +int rename_process(const char name[]); int is_kernel_thread(pid_t pid); int getenv_for_pid(pid_t pid, const char *field, char **_value); diff --git a/src/test/test-process-util.c b/src/test/test-process-util.c index 7242b2c8b5..c5edbcc5d2 100644 --- a/src/test/test-process-util.c +++ b/src/test/test-process-util.c @@ -355,10 +355,70 @@ static void test_get_process_cmdline_harder(void) { _exit(0); } +static void test_rename_process_one(const char *p, int ret) { + _cleanup_free_ char *comm = NULL, *cmdline = NULL; + pid_t pid; + int r; + + pid = fork(); + assert_se(pid >= 0); + + if (pid > 0) { + siginfo_t si; + + assert_se(wait_for_terminate(pid, &si) >= 0); + assert_se(si.si_code == CLD_EXITED); + assert_se(si.si_status == EXIT_SUCCESS); + + return; + } + + /* child */ + r = rename_process(p); + + assert_se(r == ret || + (ret == 0 && r >= 0) || + (ret > 0 && r > 0)); + + if (r < 0) + goto finish; + +#ifdef HAVE_VALGRIND_VALGRIND_H + /* see above, valgrind is weird, we can't verify what we are doing here */ + if (RUNNING_ON_VALGRIND) + goto finish; +#endif + + assert_se(get_process_comm(0, &comm) >= 0); + log_info("comm = <%s>", comm); + assert_se(strneq(comm, p, 15)); + + assert_se(get_process_cmdline(0, 0, false, &cmdline) >= 0); + log_info("cmdline = <%s>", cmdline); + assert_se(strneq(p, cmdline, strlen("test-process-util"))); + assert_se(startswith(p, cmdline)); + +finish: + _exit(EXIT_SUCCESS); +} + +static void test_rename_process(void) { + test_rename_process_one(NULL, -EINVAL); + test_rename_process_one("", -EINVAL); + test_rename_process_one("foo", 1); /* should always fit */ + test_rename_process_one("this is a really really long process name, followed by some more words", 0); /* unlikely to fit */ + test_rename_process_one("1234567", 1); /* should always fit */ +} + int main(int argc, char *argv[]) { + + log_set_max_level(LOG_DEBUG); log_parse_environment(); log_open(); + saved_argc = argc; + saved_argv = argv; + if (argc > 1) { pid_t pid = 0; @@ -373,6 +433,7 @@ int main(int argc, char *argv[]) { test_pid_is_alive(); test_personality(); test_get_process_cmdline_harder(); + test_rename_process(); return 0; } -- cgit v1.2.3-54-g00ecf