summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--man/systemd.exec.xml20
-rw-r--r--src/core/dbus-execute.c9
-rw-r--r--src/core/execute.c100
-rw-r--r--src/core/execute.h2
-rw-r--r--src/core/load-fragment-gperf.gperf.m42
-rw-r--r--src/core/namespace.c36
-rw-r--r--src/core/namespace.h2
-rw-r--r--src/shared/bus-unit-util.c2
-rw-r--r--src/test/test-ns.c2
9 files changed, 159 insertions, 16 deletions
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index bcedebd5bb..07128b489e 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1060,6 +1060,26 @@
</varlistentry>
<varlistentry>
+ <term><varname>ProtectKernelTunables=</varname></term>
+
+ <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
+ <filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
+ unit. Usually, tunable kernel variables should only be written at boot-time, with the
+ <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
+ no services need to write to these at runtime; it is hence recommended to turn this on for most
+ services. Defaults to off.</para></listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><varname>ProtectControlGroups=</varname></term>
+
+ <listitem><para>Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible
+ through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the unit. Except for
+ container managers no services should require write access to the control groups hierarchies; it is hence
+ recommended to turn this on for most services. Defaults to off.</para></listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><varname>MountFlags=</varname></term>
<listitem><para>Takes a mount propagation flag:
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 7e33a2d201..eec4500c8c 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -707,6 +707,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1072,7 +1074,8 @@ int bus_exec_context_set_transient_property(
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
- "RestrictRealtime", "DynamicUser", "RemoveIPC")) {
+ "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
+ "ProtectControlGroups")) {
int b;
r = sd_bus_message_read(message, "b", &b);
@@ -1106,6 +1109,10 @@ int bus_exec_context_set_transient_property(
c->dynamic_user = b;
else if (streq(name, "RemoveIPC"))
c->remove_ipc = b;
+ else if (streq(name, "ProtectKernelTunables"))
+ c->protect_kernel_tunables = b;
+ else if (streq(name, "ProtectControlGroups"))
+ c->protect_control_groups = b;
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
}
diff --git a/src/core/execute.c b/src/core/execute.c
index ee734e8445..609b69a859 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1383,6 +1383,45 @@ finish:
return r;
}
+static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
+ scmp_filter_ctx *seccomp;
+ int r;
+
+ assert(c);
+
+ /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
+ * let's protect even those systems where this is left on in the kernel. */
+
+ if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+ return 0;
+
+ seccomp = seccomp_init(SCMP_ACT_ALLOW);
+ if (!seccomp)
+ return -ENOMEM;
+
+ r = seccomp_add_secondary_archs(seccomp);
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(_sysctl),
+ 0);
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_load(seccomp);
+
+finish:
+ seccomp_release(seccomp);
+ return r;
+}
+
#endif
static void do_idle_pipe_dance(int idle_pipe[4]) {
@@ -1589,7 +1628,9 @@ static bool exec_needs_mount_namespace(
if (context->private_devices ||
context->protect_system != PROTECT_SYSTEM_NO ||
- context->protect_home != PROTECT_HOME_NO)
+ context->protect_home != PROTECT_HOME_NO ||
+ context->protect_kernel_tunables ||
+ context->protect_control_groups)
return true;
return false;
@@ -1804,6 +1845,37 @@ static int close_remaining_fds(
return close_all_fds(dont_close, n_dont_close);
}
+static bool context_has_address_families(const ExecContext *c) {
+ assert(c);
+
+ return c->address_families_whitelist ||
+ !set_isempty(c->address_families);
+}
+
+static bool context_has_syscall_filters(const ExecContext *c) {
+ assert(c);
+
+ return c->syscall_whitelist ||
+ !set_isempty(c->syscall_filter) ||
+ !set_isempty(c->syscall_archs);
+}
+
+static bool context_has_no_new_privileges(const ExecContext *c) {
+ assert(c);
+
+ if (c->no_new_privileges)
+ return true;
+
+ if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+ return false;
+
+ return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
+ c->memory_deny_write_execute ||
+ c->restrict_realtime ||
+ c->protect_kernel_tunables ||
+ context_has_syscall_filters(c);
+}
+
static int send_user_lookup(
Unit *unit,
int user_lookup_fd,
@@ -2255,6 +2327,8 @@ static int exec_child(
tmp,
var,
context->private_devices,
+ context->protect_kernel_tunables,
+ context->protect_control_groups,
context->protect_home,
context->protect_system,
context->mount_flags);
@@ -2343,11 +2417,6 @@ static int exec_child(
if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
- bool use_address_families = context->address_families_whitelist ||
- !set_isempty(context->address_families);
- bool use_syscall_filter = context->syscall_whitelist ||
- !set_isempty(context->syscall_filter) ||
- !set_isempty(context->syscall_archs);
int secure_bits = context->secure_bits;
for (i = 0; i < _RLIMIT_MAX; i++) {
@@ -2424,15 +2493,14 @@ static int exec_child(
return -errno;
}
- if (context->no_new_privileges ||
- (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
+ if (context_has_no_new_privileges(context))
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
*exit_status = EXIT_NO_NEW_PRIVILEGES;
return -errno;
}
#ifdef HAVE_SECCOMP
- if (use_address_families) {
+ if (context_has_address_families(context)) {
r = apply_address_families(unit, context);
if (r < 0) {
*exit_status = EXIT_ADDRESS_FAMILIES;
@@ -2456,7 +2524,15 @@ static int exec_child(
}
}
- if (use_syscall_filter) {
+ if (context->protect_kernel_tunables) {
+ r = apply_protect_sysctl(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
+ }
+ }
+
+ if (context_has_syscall_filters(context)) {
r = apply_seccomp(unit, context);
if (r < 0) {
*exit_status = EXIT_SECCOMP;
@@ -2888,6 +2964,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sNonBlocking: %s\n"
"%sPrivateTmp: %s\n"
"%sPrivateDevices: %s\n"
+ "%sProtectKernelTunables: %s\n"
+ "%sProtectControlGroups: %s\n"
"%sPrivateNetwork: %s\n"
"%sPrivateUsers: %s\n"
"%sProtectHome: %s\n"
@@ -2901,6 +2979,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, yes_no(c->non_blocking),
prefix, yes_no(c->private_tmp),
prefix, yes_no(c->private_devices),
+ prefix, yes_no(c->protect_kernel_tunables),
+ prefix, yes_no(c->protect_control_groups),
prefix, yes_no(c->private_network),
prefix, yes_no(c->private_users),
prefix, protect_home_to_string(c->protect_home),
diff --git a/src/core/execute.h b/src/core/execute.h
index 6082c42aba..449180c903 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -174,6 +174,8 @@ struct ExecContext {
bool private_users;
ProtectSystem protect_system;
ProtectHome protect_home;
+ bool protect_kernel_tunables;
+ bool protect_control_groups;
bool no_new_privileges;
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index 2e6c965aec..c49c1d6732 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -89,6 +89,8 @@ $1.ReadOnlyPaths, config_parse_namespace_path_strv, 0,
$1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths)
$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp)
$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices)
+$1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables)
+$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups)
$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users)
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context)
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 52a2505d94..f2768aeb28 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -53,7 +53,7 @@ typedef enum MountMode {
PRIVATE_TMP,
PRIVATE_VAR_TMP,
PRIVATE_DEV,
- READWRITE
+ READWRITE,
} MountMode;
typedef struct BindMount {
@@ -366,6 +366,8 @@ int setup_namespace(
const char* tmp_dir,
const char* var_tmp_dir,
bool private_dev,
+ bool protect_sysctl,
+ bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags) {
@@ -385,6 +387,8 @@ int setup_namespace(
strv_length(read_only_paths) +
strv_length(inaccessible_paths) +
private_dev +
+ (protect_sysctl ? 3 : 0) +
+ (protect_cgroups != protect_sysctl) +
(protect_home != PROTECT_HOME_NO ? 3 : 0) +
(protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
(protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
@@ -421,6 +425,27 @@ int setup_namespace(
m++;
}
+ if (protect_sysctl) {
+ m->path = prefix_roota(root_directory, "/proc/sys");
+ m->mode = READONLY;
+ m++;
+
+ m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
+ m->mode = READONLY;
+ m->ignore = true; /* Not always compiled into the kernel */
+ m++;
+
+ m->path = prefix_roota(root_directory, "/sys");
+ m->mode = READONLY;
+ m++;
+ }
+
+ if (protect_cgroups != protect_sysctl) {
+ m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
+ m->mode = protect_cgroups ? READONLY : READWRITE;
+ m++;
+ }
+
if (protect_home != PROTECT_HOME_NO) {
const char *home_dir, *run_user_dir, *root_dir;
@@ -505,9 +530,12 @@ int setup_namespace(
fail:
if (n > 0) {
- for (m = mounts; m < mounts + n; ++m)
- if (m->done)
- (void) umount2(m->path, MNT_DETACH);
+ for (m = mounts; m < mounts + n; ++m) {
+ if (!m->done)
+ continue;
+
+ (void) umount2(m->path, MNT_DETACH);
+ }
}
return r;
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 1aedf5f208..3845336287 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -46,6 +46,8 @@ int setup_namespace(const char *chroot,
const char *tmp_dir,
const char *var_tmp_dir,
bool private_dev,
+ bool protect_sysctl,
+ bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags);
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index feb4a06737..c6bd2f145c 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -204,7 +204,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
- "RestrictRealtime", "DynamicUser", "RemoveIPC")) {
+ "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) {
r = parse_boolean(eq);
if (r < 0)
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 9248f2987c..05f243c75c 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -69,6 +69,8 @@ int main(int argc, char *argv[]) {
tmp_dir,
var_tmp_dir,
true,
+ true,
+ true,
PROTECT_HOME_NO,
PROTECT_SYSTEM_NO,
0);