diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/basic/fs-util.c | 187 | ||||
-rw-r--r-- | src/basic/fs-util.h | 2 | ||||
-rw-r--r-- | src/basic/mount-util.c | 71 | ||||
-rw-r--r-- | src/basic/mount-util.h | 2 | ||||
-rw-r--r-- | src/basic/user-util.c | 32 | ||||
-rw-r--r-- | src/basic/user-util.h | 1 | ||||
-rw-r--r-- | src/core/dbus-execute.c | 9 | ||||
-rw-r--r-- | src/core/execute.c | 392 | ||||
-rw-r--r-- | src/core/execute.h | 2 | ||||
-rw-r--r-- | src/core/load-fragment-gperf.gperf.m4 | 2 | ||||
-rw-r--r-- | src/core/main.c | 6 | ||||
-rw-r--r-- | src/core/namespace.c | 586 | ||||
-rw-r--r-- | src/core/namespace.h | 3 | ||||
-rw-r--r-- | src/core/unit.c | 6 | ||||
-rw-r--r-- | src/nspawn/nspawn-mount.c | 34 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 2 | ||||
-rw-r--r-- | src/run/run.c | 18 | ||||
-rw-r--r-- | src/shared/bus-unit-util.c | 2 | ||||
-rw-r--r-- | src/test/test-execute.c | 26 | ||||
-rw-r--r-- | src/test/test-fs-util.c | 96 | ||||
-rw-r--r-- | src/test/test-ns.c | 14 |
21 files changed, 1244 insertions, 249 deletions
diff --git a/src/basic/fs-util.c b/src/basic/fs-util.c index ce87257bc1..86d9ad7e36 100644 --- a/src/basic/fs-util.c +++ b/src/basic/fs-util.c @@ -597,3 +597,190 @@ int inotify_add_watch_fd(int fd, int what, uint32_t mask) { return r; } + +int chase_symlinks(const char *path, const char *_root, char **ret) { + _cleanup_free_ char *buffer = NULL, *done = NULL, *root = NULL; + _cleanup_close_ int fd = -1; + unsigned max_follow = 32; /* how many symlinks to follow before giving up and returning ELOOP */ + char *todo; + int r; + + assert(path); + + /* This is a lot like canonicalize_file_name(), but takes an additional "root" parameter, that allows following + * symlinks relative to a root directory, instead of the root of the host. + * + * Note that "root" matters only if we encounter an absolute symlink, it's unused otherwise. Most importantly + * this means the path parameter passed in is not prefixed by it. + * + * Algorithmically this operates on two path buffers: "done" are the components of the path we already + * processed and resolved symlinks, "." and ".." of. "todo" are the components of the path we still need to + * process. On each iteration, we move one component from "todo" to "done", processing it's special meaning + * each time. The "todo" path always starts with at least one slash, the "done" path always ends in no + * slash. We always keep an O_PATH fd to the component we are currently processing, thus keeping lookup races + * at a minimum. */ + + r = path_make_absolute_cwd(path, &buffer); + if (r < 0) + return r; + + if (_root) { + r = path_make_absolute_cwd(_root, &root); + if (r < 0) + return r; + } + + fd = open("/", O_CLOEXEC|O_NOFOLLOW|O_PATH); + if (fd < 0) + return -errno; + + todo = buffer; + for (;;) { + _cleanup_free_ char *first = NULL; + _cleanup_close_ int child = -1; + struct stat st; + size_t n, m; + + /* Determine length of first component in the path */ + n = strspn(todo, "/"); /* The slashes */ + m = n + strcspn(todo + n, "/"); /* The entire length of the component */ + + /* Extract the first component. */ + first = strndup(todo, m); + if (!first) + return -ENOMEM; + + todo += m; + + /* Just a single slash? Then we reached the end. */ + if (isempty(first) || path_equal(first, "/")) + break; + + /* Just a dot? Then let's eat this up. */ + if (path_equal(first, "/.")) + continue; + + /* Two dots? Then chop off the last bit of what we already found out. */ + if (path_equal(first, "/..")) { + _cleanup_free_ char *parent = NULL; + int fd_parent = -1; + + if (isempty(done) || path_equal(done, "/")) + return -EINVAL; + + parent = dirname_malloc(done); + if (!parent) + return -ENOMEM; + + /* Don't allow this to leave the root dir */ + if (root && + path_startswith(done, root) && + !path_startswith(parent, root)) + return -EINVAL; + + free(done); + done = parent; + parent = NULL; + + fd_parent = openat(fd, "..", O_CLOEXEC|O_NOFOLLOW|O_PATH); + if (fd_parent < 0) + return -errno; + + safe_close(fd); + fd = fd_parent; + + continue; + } + + /* Otherwise let's see what this is. */ + child = openat(fd, first + n, O_CLOEXEC|O_NOFOLLOW|O_PATH); + if (child < 0) + return -errno; + + if (fstat(child, &st) < 0) + return -errno; + + if (S_ISLNK(st.st_mode)) { + _cleanup_free_ char *destination = NULL; + + /* This is a symlink, in this case read the destination. But let's make sure we don't follow + * symlinks without bounds. */ + if (--max_follow <= 0) + return -ELOOP; + + r = readlinkat_malloc(fd, first + n, &destination); + if (r < 0) + return r; + if (isempty(destination)) + return -EINVAL; + + if (path_is_absolute(destination)) { + + /* An absolute destination. Start the loop from the beginning, but use the root + * directory as base. */ + + safe_close(fd); + fd = open(root ?: "/", O_CLOEXEC|O_NOFOLLOW|O_PATH); + if (fd < 0) + return -errno; + + free(buffer); + buffer = destination; + destination = NULL; + + todo = buffer; + free(done); + + /* Note that we do not revalidate the root, we take it as is. */ + if (isempty(root)) + done = NULL; + else { + done = strdup(root); + if (!done) + return -ENOMEM; + } + + } else { + char *joined; + + /* A relative destination. If so, this is what we'll prefix what's left to do with what + * we just read, and start the loop again, but remain in the current directory. */ + + joined = strjoin("/", destination, todo, NULL); + if (!joined) + return -ENOMEM; + + free(buffer); + todo = buffer = joined; + } + + continue; + } + + /* If this is not a symlink, then let's just add the name we read to what we already verified. */ + if (!done) { + done = first; + first = NULL; + } else { + if (!strextend(&done, first, NULL)) + return -ENOMEM; + } + + /* And iterate again, but go one directory further down. */ + safe_close(fd); + fd = child; + child = -1; + } + + if (!done) { + /* Special case, turn the empty string into "/", to indicate the root directory. */ + done = strdup("/"); + if (!done) + return -ENOMEM; + } + + *ret = done; + done = NULL; + + return 0; +} diff --git a/src/basic/fs-util.h b/src/basic/fs-util.h index 2c3b9a1c74..31df47cf1e 100644 --- a/src/basic/fs-util.h +++ b/src/basic/fs-util.h @@ -77,3 +77,5 @@ union inotify_event_buffer { }; int inotify_add_watch_fd(int fd, int what, uint32_t mask); + +int chase_symlinks(const char *path, const char *_root, char **ret); diff --git a/src/basic/mount-util.c b/src/basic/mount-util.c index bfa04394fe..b9affb4e70 100644 --- a/src/basic/mount-util.c +++ b/src/basic/mount-util.c @@ -36,6 +36,7 @@ #include "set.h" #include "stdio-util.h" #include "string-util.h" +#include "strv.h" static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *mnt_id) { char path[strlen("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)]; @@ -287,10 +288,12 @@ int umount_recursive(const char *prefix, int flags) { continue; if (umount2(p, flags) < 0) { - r = -errno; + r = log_debug_errno(errno, "Failed to umount %s: %m", p); continue; } + log_debug("Successfully unmounted %s", p); + again = true; n++; @@ -311,24 +314,21 @@ static int get_mount_flags(const char *path, unsigned long *flags) { return 0; } -int bind_remount_recursive(const char *prefix, bool ro) { +int bind_remount_recursive(const char *prefix, bool ro, char **blacklist) { _cleanup_set_free_free_ Set *done = NULL; _cleanup_free_ char *cleaned = NULL; int r; - /* Recursively remount a directory (and all its submounts) - * read-only or read-write. If the directory is already - * mounted, we reuse the mount and simply mark it - * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write - * operation). If it isn't we first make it one. Afterwards we - * apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to all - * submounts we can access, too. When mounts are stacked on - * the same mount point we only care for each individual - * "top-level" mount on each point, as we cannot - * influence/access the underlying mounts anyway. We do not - * have any effect on future submounts that might get - * propagated, they migt be writable. This includes future - * submounts that have been triggered via autofs. */ + /* Recursively remount a directory (and all its submounts) read-only or read-write. If the directory is already + * mounted, we reuse the mount and simply mark it MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write + * operation). If it isn't we first make it one. Afterwards we apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to + * all submounts we can access, too. When mounts are stacked on the same mount point we only care for each + * individual "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We + * do not have any effect on future submounts that might get propagated, they migt be writable. This includes + * future submounts that have been triggered via autofs. + * + * If the "blacklist" parameter is specified it may contain a list of subtrees to exclude from the + * remount operation. Note that we'll ignore the blacklist for the top-level path. */ cleaned = strdup(prefix); if (!cleaned) @@ -385,6 +385,33 @@ int bind_remount_recursive(const char *prefix, bool ro) { if (r < 0) return r; + if (!path_startswith(p, cleaned)) + continue; + + /* Ignore this mount if it is blacklisted, but only if it isn't the top-level mount we shall + * operate on. */ + if (!path_equal(cleaned, p)) { + bool blacklisted = false; + char **i; + + STRV_FOREACH(i, blacklist) { + + if (path_equal(*i, cleaned)) + continue; + + if (!path_startswith(*i, cleaned)) + continue; + + if (path_startswith(p, *i)) { + blacklisted = true; + log_debug("Not remounting %s, because blacklisted by %s, called for %s", p, *i, cleaned); + break; + } + } + if (blacklisted) + continue; + } + /* Let's ignore autofs mounts. If they aren't * triggered yet, we want to avoid triggering * them, as we don't make any guarantees for @@ -396,12 +423,9 @@ int bind_remount_recursive(const char *prefix, bool ro) { continue; } - if (path_startswith(p, cleaned) && - !set_contains(done, p)) { - + if (!set_contains(done, p)) { r = set_consume(todo, p); p = NULL; - if (r == -EEXIST) continue; if (r < 0) @@ -418,8 +442,7 @@ int bind_remount_recursive(const char *prefix, bool ro) { if (!set_contains(done, cleaned) && !set_contains(todo, cleaned)) { - /* The prefix directory itself is not yet a - * mount, make it one. */ + /* The prefix directory itself is not yet a mount, make it one. */ if (mount(cleaned, cleaned, NULL, MS_BIND|MS_REC, NULL) < 0) return -errno; @@ -430,6 +453,8 @@ int bind_remount_recursive(const char *prefix, bool ro) { if (mount(NULL, prefix, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0) return -errno; + log_debug("Made top-level directory %s a mount point.", prefix); + x = strdup(cleaned); if (!x) return -ENOMEM; @@ -447,8 +472,7 @@ int bind_remount_recursive(const char *prefix, bool ro) { if (r < 0) return r; - /* Deal with mount points that are obstructed by a - * later mount */ + /* Deal with mount points that are obstructed by a later mount */ r = path_is_mount_point(x, 0); if (r == -ENOENT || r == 0) continue; @@ -463,6 +487,7 @@ int bind_remount_recursive(const char *prefix, bool ro) { if (mount(NULL, x, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0) return -errno; + log_debug("Remounted %s read-only.", x); } } } diff --git a/src/basic/mount-util.h b/src/basic/mount-util.h index f46989ebb3..74730de663 100644 --- a/src/basic/mount-util.h +++ b/src/basic/mount-util.h @@ -35,7 +35,7 @@ int path_is_mount_point(const char *path, int flags); int repeat_unmount(const char *path, int flags); int umount_recursive(const char *target, int flags); -int bind_remount_recursive(const char *prefix, bool ro); +int bind_remount_recursive(const char *prefix, bool ro, char **blacklist); int mount_move_root(const char *path); diff --git a/src/basic/user-util.c b/src/basic/user-util.c index 122d9a0c7c..0522bce1d1 100644 --- a/src/basic/user-util.c +++ b/src/basic/user-util.c @@ -31,14 +31,15 @@ #include <unistd.h> #include <utmp.h> -#include "missing.h" #include "alloc-util.h" #include "fd-util.h" #include "formats-util.h" #include "macro.h" +#include "missing.h" #include "parse-util.h" #include "path-util.h" #include "string-util.h" +#include "strv.h" #include "user-util.h" #include "utf8.h" @@ -175,6 +176,35 @@ int get_user_creds( return 0; } +int get_user_creds_clean( + const char **username, + uid_t *uid, gid_t *gid, + const char **home, + const char **shell) { + + int r; + + /* Like get_user_creds(), but resets home/shell to NULL if they don't contain anything relevant. */ + + r = get_user_creds(username, uid, gid, home, shell); + if (r < 0) + return r; + + if (shell && + (isempty(*shell) || PATH_IN_SET(*shell, + "/bin/nologin", + "/sbin/nologin", + "/usr/bin/nologin", + "/usr/sbin/nologin"))) + *shell = NULL; + + if (home && + (isempty(*home) || path_equal(*home, "/"))) + *home = NULL; + + return 0; +} + int get_group_creds(const char **groupname, gid_t *gid) { struct group *g; gid_t id; diff --git a/src/basic/user-util.h b/src/basic/user-util.h index f569363811..6c61f63cae 100644 --- a/src/basic/user-util.h +++ b/src/basic/user-util.h @@ -40,6 +40,7 @@ char* getlogname_malloc(void); char* getusername_malloc(void); int get_user_creds(const char **username, uid_t *uid, gid_t *gid, const char **home, const char **shell); +int get_user_creds_clean(const char **username, uid_t *uid, gid_t *gid, const char **home, const char **shell); int get_group_creds(const char **groupname, gid_t *gid); char* uid_to_name(uid_t uid); diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 7e33a2d201..eec4500c8c 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -707,6 +707,8 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1072,7 +1074,8 @@ int bus_exec_context_set_transient_property( "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser", "RemoveIPC")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", + "ProtectControlGroups")) { int b; r = sd_bus_message_read(message, "b", &b); @@ -1106,6 +1109,10 @@ int bus_exec_context_set_transient_property( c->dynamic_user = b; else if (streq(name, "RemoveIPC")) c->remove_ipc = b; + else if (streq(name, "ProtectKernelTunables")) + c->protect_kernel_tunables = b; + else if (streq(name, "ProtectControlGroups")) + c->protect_control_groups = b; unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b)); } diff --git a/src/core/execute.c b/src/core/execute.c index 2026137721..3da7ef3be6 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -837,6 +837,8 @@ static int null_conv( return PAM_CONV_ERR; } +#endif + static int setup_pam( const char *name, const char *user, @@ -845,6 +847,8 @@ static int setup_pam( char ***env, int fds[], unsigned n_fds) { +#ifdef HAVE_PAM + static const struct pam_conv conv = { .conv = null_conv, .appdata_ptr = NULL @@ -1038,8 +1042,10 @@ fail: closelog(); return r; -} +#else + return 0; #endif +} static void rename_process_from_path(const char *path) { char process_name[11]; @@ -1273,6 +1279,10 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) if (!seccomp) return -ENOMEM; + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + r = seccomp_rule_add( seccomp, SCMP_ACT_ERRNO(EPERM), @@ -1322,6 +1332,10 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { if (!seccomp) return -ENOMEM; + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + /* Determine the highest policy constant we want to allow */ for (i = 0; i < ELEMENTSOF(permitted_policies); i++) if (permitted_policies[i] > max_policy) @@ -1375,12 +1389,121 @@ finish: return r; } +static int apply_protect_sysctl(Unit *u, const ExecContext *c) { + scmp_filter_ctx *seccomp; + int r; + + assert(c); + + /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but + * let's protect even those systems where this is left on in the kernel. */ + + if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) + return 0; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(_sysctl), + 0); + if (r < 0) + goto finish; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; +} + +static int apply_private_devices(Unit *u, const ExecContext *c) { + const SystemCallFilterSet *set; + scmp_filter_ctx *seccomp; + const char *sys; + bool syscalls_found = false; + int r; + + assert(c); + + /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */ + + if (skip_seccomp_unavailable(u, "PrivateDevices=")) + return 0; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + for (set = syscall_filter_sets; set->set_name; set++) + if (streq(set->set_name, "@raw-io")) { + syscalls_found = true; + break; + } + + /* We should never fail here */ + if (!syscalls_found) { + r = -EOPNOTSUPP; + goto finish; + } + + NULSTR_FOREACH(sys, set->value) { + int id; + bool add = true; + +#ifndef __NR_s390_pci_mmio_read + if (streq(sys, "s390_pci_mmio_read")) + add = false; +#endif +#ifndef __NR_s390_pci_mmio_write + if (streq(sys, "s390_pci_mmio_write")) + add = false; +#endif + + if (!add) + continue; + + id = seccomp_syscall_resolve_name(sys); + + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + id, 0); + if (r < 0) + goto finish; + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; +} + #endif static void do_idle_pipe_dance(int idle_pipe[4]) { assert(idle_pipe); - idle_pipe[1] = safe_close(idle_pipe[1]); idle_pipe[2] = safe_close(idle_pipe[2]); @@ -1581,7 +1704,9 @@ static bool exec_needs_mount_namespace( if (context->private_devices || context->protect_system != PROTECT_SYSTEM_NO || - context->protect_home != PROTECT_HOME_NO) + context->protect_home != PROTECT_HOME_NO || + context->protect_kernel_tunables || + context->protect_control_groups) return true; return false; @@ -1740,6 +1865,111 @@ static int setup_private_users(uid_t uid, gid_t gid) { return 0; } +static int setup_runtime_directory( + const ExecContext *context, + const ExecParameters *params, + uid_t uid, + gid_t gid) { + + char **rt; + int r; + + assert(context); + assert(params); + + STRV_FOREACH(rt, context->runtime_directory) { + _cleanup_free_ char *p; + + p = strjoin(params->runtime_prefix, "/", *rt, NULL); + if (!p) + return -ENOMEM; + + r = mkdir_p_label(p, context->runtime_directory_mode); + if (r < 0) + return r; + + r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid); + if (r < 0) + return r; + } + + return 0; +} + +static int setup_smack( + const ExecContext *context, + const ExecCommand *command) { + +#ifdef HAVE_SMACK + int r; + + assert(context); + assert(command); + + if (!mac_smack_use()) + return 0; + + if (context->smack_process_label) { + r = mac_smack_apply_pid(0, context->smack_process_label); + if (r < 0) + return r; + } +#ifdef SMACK_DEFAULT_PROCESS_LABEL + else { + _cleanup_free_ char *exec_label = NULL; + + r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label); + if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP) + return r; + + r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL); + if (r < 0) + return r; + } +#endif +#endif + + return 0; +} + +static int compile_read_write_paths( + const ExecContext *context, + const ExecParameters *params, + char ***ret) { + + _cleanup_strv_free_ char **l = NULL; + char **rt; + + /* Compile the list of writable paths. This is the combination of the explicitly configured paths, plus all + * runtime directories. */ + + if (strv_isempty(context->read_write_paths) && + strv_isempty(context->runtime_directory)) { + *ret = NULL; /* NOP if neither is set */ + return 0; + } + + l = strv_copy(context->read_write_paths); + if (!l) + return -ENOMEM; + + STRV_FOREACH(rt, context->runtime_directory) { + char *s; + + s = strjoin(params->runtime_prefix, "/", *rt, NULL); + if (!s) + return -ENOMEM; + + if (strv_consume(&l, s) < 0) + return -ENOMEM; + } + + *ret = l; + l = NULL; + + return 0; +} + static void append_socket_pair(int *array, unsigned *n, int pair[2]) { assert(array); assert(n); @@ -1796,6 +2026,37 @@ static int close_remaining_fds( return close_all_fds(dont_close, n_dont_close); } +static bool context_has_address_families(const ExecContext *c) { + assert(c); + + return c->address_families_whitelist || + !set_isempty(c->address_families); +} + +static bool context_has_syscall_filters(const ExecContext *c) { + assert(c); + + return c->syscall_whitelist || + !set_isempty(c->syscall_filter) || + !set_isempty(c->syscall_archs); +} + +static bool context_has_no_new_privileges(const ExecContext *c) { + assert(c); + + if (c->no_new_privileges) + return true; + + if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */ + return false; + + return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */ + c->memory_deny_write_execute || + c->restrict_realtime || + c->protect_kernel_tunables || + context_has_syscall_filters(c); +} + static int send_user_lookup( Unit *unit, int user_lookup_fd, @@ -1940,22 +2201,14 @@ static int exec_child( } else { if (context->user) { username = context->user; - r = get_user_creds(&username, &uid, &gid, &home, &shell); + r = get_user_creds_clean(&username, &uid, &gid, &home, &shell); if (r < 0) { *exit_status = EXIT_USER; return r; } - /* Don't set $HOME or $SHELL if they are are not particularly enlightening anyway. */ - if (isempty(home) || path_equal(home, "/")) - home = NULL; - - if (isempty(shell) || PATH_IN_SET(shell, - "/bin/nologin", - "/sbin/nologin", - "/usr/bin/nologin", - "/usr/sbin/nologin")) - shell = NULL; + /* Note that we don't set $HOME or $SHELL if they are are not particularly enlightening anyway + * (i.e. are "/" or "/bin/nologin"). */ } if (context->group) { @@ -2108,28 +2361,10 @@ static int exec_child( } if (!strv_isempty(context->runtime_directory) && params->runtime_prefix) { - char **rt; - - STRV_FOREACH(rt, context->runtime_directory) { - _cleanup_free_ char *p; - - p = strjoin(params->runtime_prefix, "/", *rt, NULL); - if (!p) { - *exit_status = EXIT_RUNTIME_DIRECTORY; - return -ENOMEM; - } - - r = mkdir_p_label(p, context->runtime_directory_mode); - if (r < 0) { - *exit_status = EXIT_RUNTIME_DIRECTORY; - return r; - } - - r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid); - if (r < 0) { - *exit_status = EXIT_RUNTIME_DIRECTORY; - return r; - } + r = setup_runtime_directory(context, params, uid, gid); + if (r < 0) { + *exit_status = EXIT_RUNTIME_DIRECTORY; + return r; } } @@ -2168,41 +2403,15 @@ static int exec_child( } accum_env = strv_env_clean(accum_env); - umask(context->umask); + (void) umask(context->umask); if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { - r = enforce_groups(context, username, gid); + r = setup_smack(context, command); if (r < 0) { - *exit_status = EXIT_GROUP; + *exit_status = EXIT_SMACK_PROCESS_LABEL; return r; } -#ifdef HAVE_SMACK - if (context->smack_process_label) { - r = mac_smack_apply_pid(0, context->smack_process_label); - if (r < 0) { - *exit_status = EXIT_SMACK_PROCESS_LABEL; - return r; - } - } -#ifdef SMACK_DEFAULT_PROCESS_LABEL - else { - _cleanup_free_ char *exec_label = NULL; - r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label); - if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP) { - *exit_status = EXIT_SMACK_PROCESS_LABEL; - return r; - } - - r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL); - if (r < 0) { - *exit_status = EXIT_SMACK_PROCESS_LABEL; - return r; - } - } -#endif -#endif -#ifdef HAVE_PAM if (context->pam_name && username) { r = setup_pam(context->pam_name, username, uid, context->tty_path, &accum_env, fds, n_fds); if (r < 0) { @@ -2210,7 +2419,6 @@ static int exec_child( return r; } } -#endif } if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) { @@ -2222,8 +2430,8 @@ static int exec_child( } needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime); - if (needs_mount_namespace) { + _cleanup_free_ char **rw = NULL; char *tmp = NULL, *var = NULL; /* The runtime struct only contains the parent @@ -2239,14 +2447,22 @@ static int exec_child( var = strjoina(runtime->var_tmp_dir, "/tmp"); } + r = compile_read_write_paths(context, params, &rw); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return r; + } + r = setup_namespace( (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL, - context->read_write_paths, + rw, context->read_only_paths, context->inaccessible_paths, tmp, var, context->private_devices, + context->protect_kernel_tunables, + context->protect_control_groups, context->protect_home, context->protect_system, context->mount_flags); @@ -2264,6 +2480,14 @@ static int exec_child( } } + if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { + r = enforce_groups(context, username, gid); + if (r < 0) { + *exit_status = EXIT_GROUP; + return r; + } + } + if (context->working_directory_home) wd = home; else if (context->working_directory) @@ -2335,11 +2559,6 @@ static int exec_child( if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { - bool use_address_families = context->address_families_whitelist || - !set_isempty(context->address_families); - bool use_syscall_filter = context->syscall_whitelist || - !set_isempty(context->syscall_filter) || - !set_isempty(context->syscall_archs); int secure_bits = context->secure_bits; for (i = 0; i < _RLIMIT_MAX; i++) { @@ -2416,15 +2635,14 @@ static int exec_child( return -errno; } - if (context->no_new_privileges || - (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter))) + if (context_has_no_new_privileges(context)) if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { *exit_status = EXIT_NO_NEW_PRIVILEGES; return -errno; } #ifdef HAVE_SECCOMP - if (use_address_families) { + if (context_has_address_families(context)) { r = apply_address_families(unit, context); if (r < 0) { *exit_status = EXIT_ADDRESS_FAMILIES; @@ -2448,7 +2666,23 @@ static int exec_child( } } - if (use_syscall_filter) { + if (context->protect_kernel_tunables) { + r = apply_protect_sysctl(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + } + + if (context->private_devices) { + r = apply_private_devices(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + } + + if (context_has_syscall_filters(context)) { r = apply_seccomp(unit, context); if (r < 0) { *exit_status = EXIT_SECCOMP; @@ -2880,6 +3114,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { "%sNonBlocking: %s\n" "%sPrivateTmp: %s\n" "%sPrivateDevices: %s\n" + "%sProtectKernelTunables: %s\n" + "%sProtectControlGroups: %s\n" "%sPrivateNetwork: %s\n" "%sPrivateUsers: %s\n" "%sProtectHome: %s\n" @@ -2893,6 +3129,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, yes_no(c->non_blocking), prefix, yes_no(c->private_tmp), prefix, yes_no(c->private_devices), + prefix, yes_no(c->protect_kernel_tunables), + prefix, yes_no(c->protect_control_groups), prefix, yes_no(c->private_network), prefix, yes_no(c->private_users), prefix, protect_home_to_string(c->protect_home), diff --git a/src/core/execute.h b/src/core/execute.h index 6082c42aba..449180c903 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -174,6 +174,8 @@ struct ExecContext { bool private_users; ProtectSystem protect_system; ProtectHome protect_home; + bool protect_kernel_tunables; + bool protect_control_groups; bool no_new_privileges; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 2e6c965aec..c49c1d6732 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -89,6 +89,8 @@ $1.ReadOnlyPaths, config_parse_namespace_path_strv, 0, $1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths) $1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) $1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) +$1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) +$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) $1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) $1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context) diff --git a/src/core/main.c b/src/core/main.c index 803307c9d5..be0cb0b6d1 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -996,10 +996,8 @@ static int parse_argv(int argc, char *argv[]) { case ARG_MACHINE_ID: r = set_machine_id(optarg); - if (r < 0) { - log_error("MachineID '%s' is not valid.", optarg); - return r; - } + if (r < 0) + return log_error_errno(r, "MachineID '%s' is not valid.", optarg); break; case 'h': diff --git a/src/core/namespace.c b/src/core/namespace.c index 52a2505d94..43a2f4ba6e 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -29,6 +29,7 @@ #include "alloc-util.h" #include "dev-setup.h" #include "fd-util.h" +#include "fs-util.h" #include "loopback-setup.h" #include "missing.h" #include "mkdir.h" @@ -53,61 +54,230 @@ typedef enum MountMode { PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV, - READWRITE + READWRITE, } MountMode; typedef struct BindMount { - const char *path; + const char *path; /* stack memory, doesn't need to be freed explicitly */ + char *chased; /* malloc()ed memory, needs to be freed */ MountMode mode; - bool done; - bool ignore; + bool ignore; /* Ignore if path does not exist */ } BindMount; +typedef struct TargetMount { + const char *path; + MountMode mode; + bool ignore; /* Ignore if path does not exist */ +} TargetMount; + +/* + * The following Protect tables are to protect paths and mark some of them + * READONLY, in case a path is covered by an option from another table, then + * it is marked READWRITE in the current one, and the more restrictive mode is + * applied from that other table. This way all options can be combined in a + * safe and comprehensible way for users. + */ + +/* ProtectKernelTunables= option and the related filesystem APIs */ +static const TargetMount protect_kernel_tunables_table[] = { + { "/proc/sys", READONLY, false }, + { "/proc/sysrq-trigger", READONLY, true }, + { "/proc/latency_stats", READONLY, true }, + { "/proc/mtrr", READONLY, true }, + { "/proc/apm", READONLY, true }, + { "/proc/acpi", READONLY, true }, + { "/proc/timer_stats", READONLY, true }, + { "/proc/asound", READONLY, true }, + { "/proc/bus", READONLY, true }, + { "/proc/fs", READONLY, true }, + { "/proc/irq", READONLY, true }, + { "/sys", READONLY, false }, + { "/sys/kernel/debug", READONLY, true }, + { "/sys/kernel/tracing", READONLY, true }, + { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */ +}; + +/* + * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of + * system should be protected by ProtectSystem= + */ +static const TargetMount protect_home_read_only_table[] = { + { "/home", READONLY, true }, + { "/run/user", READONLY, true }, + { "/root", READONLY, true }, +}; + +/* ProtectHome=yes table */ +static const TargetMount protect_home_yes_table[] = { + { "/home", INACCESSIBLE, true }, + { "/run/user", INACCESSIBLE, true }, + { "/root", INACCESSIBLE, true }, +}; + +/* ProtectSystem=yes table */ +static const TargetMount protect_system_yes_table[] = { + { "/usr", READONLY, false }, + { "/boot", READONLY, true }, + { "/efi", READONLY, true }, +}; + +/* ProtectSystem=full includes ProtectSystem=yes */ +static const TargetMount protect_system_full_table[] = { + { "/usr", READONLY, false }, + { "/boot", READONLY, true }, + { "/efi", READONLY, true }, + { "/etc", READONLY, false }, +}; + +/* + * ProtectSystem=strict table. In this strict mode, we mount everything + * read-only, except for /proc, /dev, /sys which are the kernel API VFS, + * which are left writable, but PrivateDevices= + ProtectKernelTunables= + * protect those, and these options should be fully orthogonal. + * (And of course /home and friends are also left writable, as ProtectHome= + * shall manage those, orthogonally). + */ +static const TargetMount protect_system_strict_table[] = { + { "/", READONLY, false }, + { "/proc", READWRITE, false }, /* ProtectKernelTunables= */ + { "/sys", READWRITE, false }, /* ProtectKernelTunables= */ + { "/dev", READWRITE, false }, /* PrivateDevices= */ + { "/home", READWRITE, true }, /* ProtectHome= */ + { "/run/user", READWRITE, true }, /* ProtectHome= */ + { "/root", READWRITE, true }, /* ProtectHome= */ +}; + +static void set_bind_mount(BindMount **p, const char *path, MountMode mode, bool ignore) { + (*p)->path = path; + (*p)->mode = mode; + (*p)->ignore = ignore; +} + static int append_mounts(BindMount **p, char **strv, MountMode mode) { char **i; assert(p); STRV_FOREACH(i, strv) { + bool ignore = false; - (*p)->ignore = false; - (*p)->done = false; - - if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') { - (*p)->ignore = true; + if (IN_SET(mode, INACCESSIBLE, READONLY, READWRITE) && startswith(*i, "-")) { (*i)++; + ignore = true; } if (!path_is_absolute(*i)) return -EINVAL; - (*p)->path = *i; - (*p)->mode = mode; + set_bind_mount(p, *i, mode, ignore); (*p)++; } return 0; } -static int mount_path_compare(const void *a, const void *b) { - const BindMount *p = a, *q = b; - int d; +static int append_target_mounts(BindMount **p, const char *root_directory, const TargetMount *mounts, const size_t size) { + unsigned i; - d = path_compare(p->path, q->path); + assert(p); + assert(mounts); + + for (i = 0; i < size; i++) { + /* + * Here we assume that the ignore field is set during + * declaration we do not support "-" at the beginning. + */ + const TargetMount *m = &mounts[i]; + const char *path = prefix_roota(root_directory, m->path); + + if (!path_is_absolute(path)) + return -EINVAL; + + set_bind_mount(p, path, m->mode, m->ignore); + (*p)++; + } + + return 0; +} + +static int append_protect_kernel_tunables(BindMount **p, const char *root_directory) { + assert(p); + + return append_target_mounts(p, root_directory, protect_kernel_tunables_table, + ELEMENTSOF(protect_kernel_tunables_table)); +} - if (d == 0) { - /* If the paths are equal, check the mode */ - if (p->mode < q->mode) - return -1; +static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) { + int r = 0; - if (p->mode > q->mode) - return 1; + assert(p); + if (protect_home == PROTECT_HOME_NO) return 0; + + switch (protect_home) { + case PROTECT_HOME_READ_ONLY: + r = append_target_mounts(p, root_directory, protect_home_read_only_table, + ELEMENTSOF(protect_home_read_only_table)); + break; + case PROTECT_HOME_YES: + r = append_target_mounts(p, root_directory, protect_home_yes_table, + ELEMENTSOF(protect_home_yes_table)); + break; + default: + r = -EINVAL; + break; } + return r; +} + +static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) { + int r = 0; + + assert(p); + + if (protect_system == PROTECT_SYSTEM_NO) + return 0; + + switch (protect_system) { + case PROTECT_SYSTEM_STRICT: + r = append_target_mounts(p, root_directory, protect_system_strict_table, + ELEMENTSOF(protect_system_strict_table)); + break; + case PROTECT_SYSTEM_YES: + r = append_target_mounts(p, root_directory, protect_system_yes_table, + ELEMENTSOF(protect_system_yes_table)); + break; + case PROTECT_SYSTEM_FULL: + r = append_target_mounts(p, root_directory, protect_system_full_table, + ELEMENTSOF(protect_system_full_table)); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static int mount_path_compare(const void *a, const void *b) { + const BindMount *p = a, *q = b; + int d; + /* If the paths are not equal, then order prefixes first */ - return d; + d = path_compare(p->path, q->path); + if (d != 0) + return d; + + /* If the paths are equal, check the mode */ + if (p->mode < q->mode) + return -1; + + if (p->mode > q->mode) + return 1; + + return 0; } static void drop_duplicates(BindMount *m, unsigned *n) { @@ -116,16 +286,110 @@ static void drop_duplicates(BindMount *m, unsigned *n) { assert(m); assert(n); + /* Drops duplicate entries. Expects that the array is properly ordered already. */ + for (f = m, t = m, previous = NULL; f < m+*n; f++) { - /* The first one wins */ - if (previous && path_equal(f->path, previous->path)) + /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare() + * above. */ + if (previous && path_equal(f->path, previous->path)) { + log_debug("%s is duplicate.", f->path); continue; + } *t = *f; - previous = t; + t++; + } + + *n = t - m; +} + +static void drop_inaccessible(BindMount *m, unsigned *n) { + BindMount *f, *t; + const char *clear = NULL; + + assert(m); + assert(n); + + /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly + * ordered already. */ + for (f = m, t = m; f < m+*n; f++) { + + /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop + * it, as inaccessible paths really should drop the entire subtree. */ + if (clear && path_startswith(f->path, clear)) { + log_debug("%s is masked by %s.", f->path, clear); + continue; + } + + clear = f->mode == INACCESSIBLE ? f->path : NULL; + + *t = *f; + t++; + } + + *n = t - m; +} + +static void drop_nop(BindMount *m, unsigned *n) { + BindMount *f, *t; + + assert(m); + assert(n); + + /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the + * list is ordered by prefixes. */ + + for (f = m, t = m; f < m+*n; f++) { + + /* Only suppress such subtrees for READONLY and READWRITE entries */ + if (IN_SET(f->mode, READONLY, READWRITE)) { + BindMount *p; + bool found = false; + + /* Now let's find the first parent of the entry we are looking at. */ + for (p = t-1; p >= m; p--) { + if (path_startswith(f->path, p->path)) { + found = true; + break; + } + } + + /* We found it, let's see if it's the same mode, if so, we can drop this entry */ + if (found && p->mode == f->mode) { + log_debug("%s is redundant by %s", f->path, p->path); + continue; + } + } + + *t = *f; + t++; + } + + *n = t - m; +} + +static void drop_outside_root(const char *root_directory, BindMount *m, unsigned *n) { + BindMount *f, *t; + + assert(m); + assert(n); + + if (!root_directory) + return; + + /* Drops all mounts that are outside of the root directory. */ + + for (f = m, t = m; f < m+*n; f++) { + + if (!path_startswith(f->path, root_directory)) { + log_debug("%s is outside of root directory.", f->path); + continue; + } + + *t = *f; t++; } @@ -278,24 +542,23 @@ static int apply_mount( const char *what; int r; - struct stat target; assert(m); + log_debug("Applying namespace mount on %s", m->path); + switch (m->mode) { - case INACCESSIBLE: + case INACCESSIBLE: { + struct stat target; /* First, get rid of everything that is below if there * is anything... Then, overmount it with an * inaccessible path. */ - umount_recursive(m->path, 0); + (void) umount_recursive(m->path, 0); - if (lstat(m->path, &target) < 0) { - if (m->ignore && errno == ENOENT) - return 0; - return -errno; - } + if (lstat(m->path, &target) < 0) + return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path); what = mode_to_inaccessible_node(target.st_mode); if (!what) { @@ -303,11 +566,20 @@ static int apply_mount( return -ELOOP; } break; + } + case READONLY: case READWRITE: - /* Nothing to mount here, we just later toggle the - * MS_RDONLY bit for the mount point */ - return 0; + + r = path_is_mount_point(m->path, 0); + if (r < 0) + return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path); + if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */ + return 0; + + /* This isn't a mount point yet, let's make it one. */ + what = m->path; + break; case PRIVATE_TMP: what = tmp_dir; @@ -326,38 +598,104 @@ static int apply_mount( assert(what); - r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL); - if (r >= 0) { - log_debug("Successfully mounted %s to %s", what, m->path); - return r; - } else { - if (m->ignore && errno == ENOENT) - return 0; + if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0) return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path); - } + + log_debug("Successfully mounted %s to %s", what, m->path); + return 0; } -static int make_read_only(BindMount *m) { - int r; +static int make_read_only(BindMount *m, char **blacklist) { + int r = 0; assert(m); if (IN_SET(m->mode, INACCESSIBLE, READONLY)) - r = bind_remount_recursive(m->path, true); - else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV)) { - r = bind_remount_recursive(m->path, false); - if (r == 0 && m->mode == PRIVATE_DEV) /* can be readonly but the submounts can't*/ - if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0) - r = -errno; + r = bind_remount_recursive(m->path, true, blacklist); + else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/ + if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0) + r = -errno; } else - r = 0; - - if (m->ignore && r == -ENOENT) return 0; + /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only + * already stays this way. This improves compatibility with container managers, where we won't attempt to undo + * read-only mounts already applied. */ + return r; } +static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned *n) { + BindMount *f, *t; + int r; + + assert(m); + assert(n); + + /* Since mount() will always follow symlinks and we need to take the different root directory into account we + * chase the symlinks on our own first. This call wil do so for all entries and remove all entries where we + * can't resolve the path, and which have been marked for such removal. */ + + for (f = m, t = m; f < m+*n; f++) { + + r = chase_symlinks(f->path, root_directory, &f->chased); + if (r == -ENOENT && f->ignore) /* Doesn't exist? Then remove it! */ + continue; + if (r < 0) + return log_debug_errno(r, "Failed to chase symlinks for %s: %m", f->path); + + if (path_equal(f->path, f->chased)) + f->chased = mfree(f->chased); + else { + log_debug("Chased %s → %s", f->path, f->chased); + f->path = f->chased; + } + + *t = *f; + t++; + } + + *n = t - m; + return 0; +} + +static unsigned namespace_calculate_mounts( + char** read_write_paths, + char** read_only_paths, + char** inaccessible_paths, + const char* tmp_dir, + const char* var_tmp_dir, + bool private_dev, + bool protect_sysctl, + bool protect_cgroups, + ProtectHome protect_home, + ProtectSystem protect_system) { + + unsigned protect_home_cnt; + unsigned protect_system_cnt = + (protect_system == PROTECT_SYSTEM_STRICT ? + ELEMENTSOF(protect_system_strict_table) : + ((protect_system == PROTECT_SYSTEM_FULL) ? + ELEMENTSOF(protect_system_full_table) : + ((protect_system == PROTECT_SYSTEM_YES) ? + ELEMENTSOF(protect_system_yes_table) : 0))); + + protect_home_cnt = + (protect_home == PROTECT_HOME_YES ? + ELEMENTSOF(protect_home_yes_table) : + ((protect_home == PROTECT_HOME_READ_ONLY) ? + ELEMENTSOF(protect_home_read_only_table) : 0)); + + return !!tmp_dir + !!var_tmp_dir + + strv_length(read_write_paths) + + strv_length(read_only_paths) + + strv_length(inaccessible_paths) + + private_dev + + (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + + (protect_cgroups ? 1 : 0) + + protect_home_cnt + protect_system_cnt; +} + int setup_namespace( const char* root_directory, char** read_write_paths, @@ -366,28 +704,31 @@ int setup_namespace( const char* tmp_dir, const char* var_tmp_dir, bool private_dev, + bool protect_sysctl, + bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags) { BindMount *m, *mounts = NULL; + bool make_slave = false; unsigned n; int r = 0; if (mount_flags == 0) mount_flags = MS_SHARED; - if (unshare(CLONE_NEWNS) < 0) - return -errno; + n = namespace_calculate_mounts(read_write_paths, + read_only_paths, + inaccessible_paths, + tmp_dir, var_tmp_dir, + private_dev, protect_sysctl, + protect_cgroups, protect_home, + protect_system); - n = !!tmp_dir + !!var_tmp_dir + - strv_length(read_write_paths) + - strv_length(read_only_paths) + - strv_length(inaccessible_paths) + - private_dev + - (protect_home != PROTECT_HOME_NO ? 3 : 0) + - (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) + - (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0); + /* Set mount slave mode */ + if (root_directory || n > 0) + make_slave = true; if (n > 0) { m = mounts = (BindMount *) alloca0(n * sizeof(BindMount)); @@ -421,94 +762,112 @@ int setup_namespace( m++; } - if (protect_home != PROTECT_HOME_NO) { - const char *home_dir, *run_user_dir, *root_dir; + if (protect_sysctl) + append_protect_kernel_tunables(&m, root_directory); - home_dir = prefix_roota(root_directory, "/home"); - home_dir = strjoina("-", home_dir); - run_user_dir = prefix_roota(root_directory, "/run/user"); - run_user_dir = strjoina("-", run_user_dir); - root_dir = prefix_roota(root_directory, "/root"); - root_dir = strjoina("-", root_dir); - - r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir), - protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE); - if (r < 0) - return r; + if (protect_cgroups) { + m->path = prefix_roota(root_directory, "/sys/fs/cgroup"); + m->mode = READONLY; + m++; } - if (protect_system != PROTECT_SYSTEM_NO) { - const char *usr_dir, *boot_dir, *etc_dir; - - usr_dir = prefix_roota(root_directory, "/usr"); - boot_dir = prefix_roota(root_directory, "/boot"); - boot_dir = strjoina("-", boot_dir); - etc_dir = prefix_roota(root_directory, "/etc"); + r = append_protect_home(&m, root_directory, protect_home); + if (r < 0) + return r; - r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL - ? STRV_MAKE(usr_dir, boot_dir, etc_dir) - : STRV_MAKE(usr_dir, boot_dir), READONLY); - if (r < 0) - return r; - } + r = append_protect_system(&m, root_directory, protect_system); + if (r < 0) + return r; assert(mounts + n == m); + /* Resolve symlinks manually first, as mount() will always follow them relative to the host's + * root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit + * racy. */ + r = chase_all_symlinks(root_directory, mounts, &n); + if (r < 0) + goto finish; + qsort(mounts, n, sizeof(BindMount), mount_path_compare); + drop_duplicates(mounts, &n); + drop_outside_root(root_directory, mounts, &n); + drop_inaccessible(mounts, &n); + drop_nop(mounts, &n); + } + + if (unshare(CLONE_NEWNS) < 0) { + r = -errno; + goto finish; } - if (n > 0 || root_directory) { + if (make_slave) { /* Remount / as SLAVE so that nothing now mounted in the namespace shows up in the parent */ - if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) - return -errno; + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { + r = -errno; + goto finish; + } } if (root_directory) { - /* Turn directory into bind mount */ - if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) - return -errno; + /* Turn directory into bind mount, if it isn't one yet */ + r = path_is_mount_point(root_directory, AT_SYMLINK_FOLLOW); + if (r < 0) + goto finish; + if (r == 0) { + if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) { + r = -errno; + goto finish; + } + } } if (n > 0) { + char **blacklist; + unsigned j; + + /* First round, add in all special mounts we need */ for (m = mounts; m < mounts + n; ++m) { r = apply_mount(m, tmp_dir, var_tmp_dir); if (r < 0) - goto fail; + goto finish; } + /* Create a blacklist we can pass to bind_mount_recursive() */ + blacklist = newa(char*, n+1); + for (j = 0; j < n; j++) + blacklist[j] = (char*) mounts[j].path; + blacklist[j] = NULL; + + /* Second round, flip the ro bits if necessary. */ for (m = mounts; m < mounts + n; ++m) { - r = make_read_only(m); + r = make_read_only(m, blacklist); if (r < 0) - goto fail; + goto finish; } } if (root_directory) { /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */ r = mount_move_root(root_directory); - - /* at this point, we cannot rollback */ if (r < 0) - return r; + goto finish; } /* Remount / as the desired mode. Not that this will not * reestablish propagation from our side to the host, since * what's disconnected is disconnected. */ - if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) - /* at this point, we cannot rollback */ - return -errno; + if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) { + r = -errno; + goto finish; + } - return 0; + r = 0; -fail: - if (n > 0) { - for (m = mounts; m < mounts + n; ++m) - if (m->done) - (void) umount2(m->path, MNT_DETACH); - } +finish: + for (m = mounts; m < mounts + n; m++) + free(m->chased); return r; } @@ -658,6 +1017,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = { [PROTECT_SYSTEM_NO] = "no", [PROTECT_SYSTEM_YES] = "yes", [PROTECT_SYSTEM_FULL] = "full", + [PROTECT_SYSTEM_STRICT] = "strict", }; DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem); diff --git a/src/core/namespace.h b/src/core/namespace.h index 1aedf5f208..6505bcc499 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -35,6 +35,7 @@ typedef enum ProtectSystem { PROTECT_SYSTEM_NO, PROTECT_SYSTEM_YES, PROTECT_SYSTEM_FULL, + PROTECT_SYSTEM_STRICT, _PROTECT_SYSTEM_MAX, _PROTECT_SYSTEM_INVALID = -1 } ProtectSystem; @@ -46,6 +47,8 @@ int setup_namespace(const char *chroot, const char *tmp_dir, const char *var_tmp_dir, bool private_dev, + bool protect_sysctl, + bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags); diff --git a/src/core/unit.c b/src/core/unit.c index de22f657c6..5d284a359d 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3377,8 +3377,14 @@ int unit_patch_contexts(Unit *u) { return -ENOMEM; } + /* If the dynamic user option is on, let's make sure that the unit can't leave its UID/GID + * around in the file system or on IPC objects. Hence enforce a strict sandbox. */ + ec->private_tmp = true; ec->remove_ipc = true; + ec->protect_system = PROTECT_SYSTEM_STRICT; + if (ec->protect_home == PROTECT_HOME_NO) + ec->protect_home = PROTECT_HOME_READ_ONLY; } } diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 295b75341f..25d38aa742 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -314,19 +314,21 @@ int mount_all(const char *dest, } MountPoint; static const MountPoint mount_table[] = { - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false }, - { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/ - { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */ - { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */ - { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true }, - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false }, - { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, - { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, - { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false }, + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false }, + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/ + { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */ + { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */ + { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, false, true, false }, /* Bind mount first ...*/ + { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true, false }, /* ... then, make it r/o */ + { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true }, + { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false }, + { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false }, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, + { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, + { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false }, #ifdef HAVE_SELINUX - { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */ - { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */ + { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */ + { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */ #endif }; @@ -356,7 +358,7 @@ int mount_all(const char *dest, continue; r = mkdir_p(where, 0755); - if (r < 0) { + if (r < 0 && r != -EEXIST) { if (mount_table[k].fatal) return log_error_errno(r, "Failed to create directory %s: %m", where); @@ -476,7 +478,7 @@ static int mount_bind(const char *dest, CustomMount *m) { return log_error_errno(errno, "mount(%s) failed: %m", where); if (m->read_only) { - r = bind_remount_recursive(where, true); + r = bind_remount_recursive(where, true, NULL); if (r < 0) return log_error_errno(r, "Read-only bind mount failed: %m"); } @@ -990,7 +992,7 @@ int setup_volatile_state( /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */ - r = bind_remount_recursive(directory, true); + r = bind_remount_recursive(directory, true, NULL); if (r < 0) return log_error_errno(r, "Failed to remount %s read-only: %m", directory); @@ -1065,7 +1067,7 @@ int setup_volatile( bind_mounted = true; - r = bind_remount_recursive(t, true); + r = bind_remount_recursive(t, true, NULL); if (r < 0) { log_error_errno(r, "Failed to remount %s read-only: %m", t); goto fail; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 5e0207adf0..c2733a19f8 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -3019,7 +3019,7 @@ static int outer_child( return r; if (arg_read_only) { - r = bind_remount_recursive(directory, true); + r = bind_remount_recursive(directory, true, NULL); if (r < 0) return log_error_errno(r, "Failed to make tree read-only: %m"); } diff --git a/src/run/run.c b/src/run/run.c index 2dd229868c..81b53fdfab 100644 --- a/src/run/run.c +++ b/src/run/run.c @@ -1168,17 +1168,21 @@ static int start_transient_scope( uid_t uid; gid_t gid; - r = get_user_creds(&arg_exec_user, &uid, &gid, &home, &shell); + r = get_user_creds_clean(&arg_exec_user, &uid, &gid, &home, &shell); if (r < 0) return log_error_errno(r, "Failed to resolve user %s: %m", arg_exec_user); - r = strv_extendf(&user_env, "HOME=%s", home); - if (r < 0) - return log_oom(); + if (home) { + r = strv_extendf(&user_env, "HOME=%s", home); + if (r < 0) + return log_oom(); + } - r = strv_extendf(&user_env, "SHELL=%s", shell); - if (r < 0) - return log_oom(); + if (shell) { + r = strv_extendf(&user_env, "SHELL=%s", shell); + if (r < 0) + return log_oom(); + } r = strv_extendf(&user_env, "USER=%s", arg_exec_user); if (r < 0) diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index feb4a06737..c6bd2f145c 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -204,7 +204,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser", "RemoveIPC")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) { r = parse_boolean(eq); if (r < 0) diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 25489cefbc..8b4ff22495 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -133,6 +133,28 @@ static void test_exec_privatedevices(Manager *m) { test(m, "exec-privatedevices-no.service", 0, CLD_EXITED); } +static void test_exec_privatedevices_capabilities(Manager *m) { + if (detect_container() > 0) { + log_notice("testing in container, skipping private device tests"); + return; + } + test(m, "exec-privatedevices-yes-capability-mknod.service", 0, CLD_EXITED); + test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED); +} + +static void test_exec_readonlypaths(Manager *m) { + test(m, "exec-readonlypaths.service", 0, CLD_EXITED); + test(m, "exec-readonlypaths-mount-propagation.service", 0, CLD_EXITED); +} + +static void test_exec_readwritepaths(Manager *m) { + test(m, "exec-readwritepaths-mount-propagation.service", 0, CLD_EXITED); +} + +static void test_exec_inaccessiblepaths(Manager *m) { + test(m, "exec-inaccessiblepaths-mount-propagation.service", 0, CLD_EXITED); +} + static void test_exec_systemcallfilter(Manager *m) { #ifdef HAVE_SECCOMP if (!is_seccomp_available()) @@ -345,6 +367,10 @@ int main(int argc, char *argv[]) { test_exec_ignoresigpipe, test_exec_privatetmp, test_exec_privatedevices, + test_exec_privatedevices_capabilities, + test_exec_readonlypaths, + test_exec_readwritepaths, + test_exec_inaccessiblepaths, test_exec_privatenetwork, test_exec_systemcallfilter, test_exec_systemcallerrornumber, diff --git a/src/test/test-fs-util.c b/src/test/test-fs-util.c index b35a2ea2c8..53a3cdc663 100644 --- a/src/test/test-fs-util.c +++ b/src/test/test-fs-util.c @@ -20,16 +20,109 @@ #include <unistd.h> #include "alloc-util.h" -#include "fileio.h" #include "fd-util.h" +#include "fileio.h" #include "fs-util.h" #include "macro.h" #include "mkdir.h" +#include "path-util.h" #include "rm-rf.h" #include "string-util.h" #include "strv.h" #include "util.h" +static void test_chase_symlinks(void) { + _cleanup_free_ char *result = NULL; + char temp[] = "/tmp/test-chase.XXXXXX"; + const char *top, *p, *q; + int r; + + assert_se(mkdtemp(temp)); + + top = strjoina(temp, "/top"); + assert_se(mkdir(top, 0700) >= 0); + + p = strjoina(top, "/dot"); + assert_se(symlink(".", p) >= 0); + + p = strjoina(top, "/dotdot"); + assert_se(symlink("..", p) >= 0); + + p = strjoina(top, "/dotdota"); + assert_se(symlink("../a", p) >= 0); + + p = strjoina(temp, "/a"); + assert_se(symlink("b", p) >= 0); + + p = strjoina(temp, "/b"); + assert_se(symlink("/usr", p) >= 0); + + p = strjoina(temp, "/start"); + assert_se(symlink("top/dot/dotdota", p) >= 0); + + r = chase_symlinks(p, NULL, &result); + assert_se(r >= 0); + assert_se(path_equal(result, "/usr")); + + result = mfree(result); + r = chase_symlinks(p, temp, &result); + assert_se(r == -ENOENT); + + q = strjoina(temp, "/usr"); + assert_se(mkdir(q, 0700) >= 0); + + r = chase_symlinks(p, temp, &result); + assert_se(r >= 0); + assert_se(path_equal(result, q)); + + p = strjoina(temp, "/slash"); + assert_se(symlink("/", p) >= 0); + + result = mfree(result); + r = chase_symlinks(p, NULL, &result); + assert_se(r >= 0); + assert_se(path_equal(result, "/")); + + result = mfree(result); + r = chase_symlinks(p, temp, &result); + assert_se(r >= 0); + assert_se(path_equal(result, temp)); + + p = strjoina(temp, "/slashslash"); + assert_se(symlink("///usr///", p) >= 0); + + result = mfree(result); + r = chase_symlinks(p, NULL, &result); + assert_se(r >= 0); + assert_se(path_equal(result, "/usr")); + + result = mfree(result); + r = chase_symlinks(p, temp, &result); + assert_se(r >= 0); + assert_se(path_equal(result, q)); + + result = mfree(result); + r = chase_symlinks("/etc/./.././", NULL, &result); + assert_se(r >= 0); + assert_se(path_equal(result, "/")); + + result = mfree(result); + r = chase_symlinks("/etc/./.././", "/etc", &result); + assert_se(r == -EINVAL); + + result = mfree(result); + r = chase_symlinks("/etc/machine-id/foo", NULL, &result); + assert_se(r == -ENOTDIR); + + result = mfree(result); + p = strjoina(temp, "/recursive-symlink"); + assert_se(symlink("recursive-symlink", p) >= 0); + r = chase_symlinks(p, NULL, &result); + assert_se(r == -ELOOP); + + assert_se(rm_rf(temp, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); +} + static void test_unlink_noerrno(void) { char name[] = "/tmp/test-close_nointr.XXXXXX"; int fd; @@ -144,6 +237,7 @@ int main(int argc, char *argv[]) { test_readlink_and_make_absolute(); test_get_files_in_directory(); test_var_tmp(); + test_chase_symlinks(); return 0; } diff --git a/src/test/test-ns.c b/src/test/test-ns.c index 9248f2987c..c4d4da6d05 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -26,13 +26,18 @@ int main(int argc, char *argv[]) { const char * const writable[] = { "/home", + "-/home/lennart/projects/foobar", /* this should be masked automatically */ NULL }; const char * const readonly[] = { - "/", - "/usr", + /* "/", */ + /* "/usr", */ "/boot", + "/lib", + "/usr/lib", + "-/lib64", + "-/usr/lib64", NULL }; @@ -42,11 +47,12 @@ int main(int argc, char *argv[]) { }; char *root_directory; char *projects_directory; - int r; char tmp_dir[] = "/tmp/systemd-private-XXXXXX", var_tmp_dir[] = "/var/tmp/systemd-private-XXXXXX"; + log_set_max_level(LOG_DEBUG); + assert_se(mkdtemp(tmp_dir)); assert_se(mkdtemp(var_tmp_dir)); @@ -69,6 +75,8 @@ int main(int argc, char *argv[]) { tmp_dir, var_tmp_dir, true, + true, + true, PROTECT_HOME_NO, PROTECT_SYSTEM_NO, 0); |