summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--man/systemd.exec.xml13
-rw-r--r--src/core/dbus-execute.c5
-rw-r--r--src/core/execute.c103
-rw-r--r--src/core/execute.h4
-rw-r--r--src/core/load-fragment-gperf.gperf.m42
-rw-r--r--src/core/unit.c1
6 files changed, 120 insertions, 8 deletions
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index dbfc7692f7..ed02666daf 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1413,6 +1413,19 @@
</para></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>RestrictRealtime=</varname></term>
+
+ <listitem><para>Takes a boolean argument. If set, any attempts to enable realtime scheduling in a process of
+ the unit are refused. This restricts access to realtime task scheduling policies such as
+ <constant>SCHED_FIFO</constant>, <constant>SCHED_RR</constant> or <constant>SCHED_DEADLINE</constant>. See
+ <citerefentry><refentrytitle>sched</refentrytitle><manvolnum>7</manvolnum></citerefentry> for details about
+ these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods
+ of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It
+ is hence recommended to restrict access to realtime scheduling to the few programs that actually require
+ them. Defaults to off.</para></listitem>
+ </varlistentry>
+
</variablelist>
</refsect1>
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 4c88c41127..644b9561b5 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -720,6 +720,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, runtime_directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_VTABLE_END
};
@@ -1057,7 +1058,7 @@ int bus_exec_context_set_transient_property(
} else if (STR_IN_SET(name,
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
"PrivateTmp", "PrivateDevices", "PrivateNetwork",
- "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute")) {
+ "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime")) {
int b;
r = sd_bus_message_read(message, "b", &b);
@@ -1083,6 +1084,8 @@ int bus_exec_context_set_transient_property(
c->syslog_level_prefix = b;
else if (streq(name, "MemoryDenyWriteExecute"))
c->memory_deny_write_execute = b;
+ else if (streq(name, "RestrictRealtime"))
+ c->restrict_realtime = b;
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
}
diff --git a/src/core/execute.c b/src/core/execute.c
index 3c3369373f..8cb18dbd5b 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1237,7 +1237,7 @@ static int apply_memory_deny_write_execute(const ExecContext *c) {
r = seccomp_rule_add(
seccomp,
- SCMP_ACT_KILL,
+ SCMP_ACT_ERRNO(EPERM),
SCMP_SYS(mmap),
1,
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
@@ -1246,7 +1246,7 @@ static int apply_memory_deny_write_execute(const ExecContext *c) {
r = seccomp_rule_add(
seccomp,
- SCMP_ACT_KILL,
+ SCMP_ACT_ERRNO(EPERM),
SCMP_SYS(mprotect),
1,
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
@@ -1264,6 +1264,76 @@ finish:
return r;
}
+static int apply_restrict_realtime(const ExecContext *c) {
+ static const int permitted_policies[] = {
+ SCHED_OTHER,
+ SCHED_BATCH,
+ SCHED_IDLE,
+ };
+
+ scmp_filter_ctx *seccomp;
+ unsigned i;
+ int r, p, max_policy = 0;
+
+ assert(c);
+
+ seccomp = seccomp_init(SCMP_ACT_ALLOW);
+ if (!seccomp)
+ return -ENOMEM;
+
+ /* Determine the highest policy constant we want to allow */
+ for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+ if (permitted_policies[i] > max_policy)
+ max_policy = permitted_policies[i];
+
+ /* Go through all policies with lower values than that, and block them -- unless they appear in the
+ * whitelist. */
+ for (p = 0; p < max_policy; p++) {
+ bool good = false;
+
+ /* Check if this is in the whitelist. */
+ for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+ if (permitted_policies[i] == p) {
+ good = true;
+ break;
+ }
+
+ if (good)
+ continue;
+
+ /* Deny this policy */
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(sched_setscheduler),
+ 1,
+ SCMP_A1(SCMP_CMP_EQ, p));
+ if (r < 0)
+ goto finish;
+ }
+
+ /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here,
+ * hence no need no check for < 0 values. */
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(sched_setscheduler),
+ 1,
+ SCMP_A1(SCMP_CMP_GT, max_policy));
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_load(seccomp);
+
+finish:
+ seccomp_release(seccomp);
+ return r;
+}
+
#endif
static void do_idle_pipe_dance(int idle_pipe[4]) {
@@ -1951,10 +2021,20 @@ static int exec_child(
int secure_bits = context->secure_bits;
for (i = 0; i < _RLIMIT_MAX; i++) {
+
if (!context->rlimit[i])
continue;
- if (setrlimit_closest(i, context->rlimit[i]) < 0) {
+ r = setrlimit_closest(i, context->rlimit[i]);
+ if (r < 0) {
+ *exit_status = EXIT_LIMITS;
+ return r;
+ }
+ }
+
+ /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
+ if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
+ if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
*exit_status = EXIT_LIMITS;
return -errno;
}
@@ -2015,7 +2095,7 @@ static int exec_child(
}
if (context->no_new_privileges ||
- (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || use_syscall_filter)))
+ (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
*exit_status = EXIT_NO_NEW_PRIVILEGES;
return -errno;
@@ -2037,6 +2117,15 @@ static int exec_child(
return r;
}
}
+
+ if (context->restrict_realtime) {
+ r = apply_restrict_realtime(context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
+ }
+ }
+
if (use_syscall_filter) {
r = apply_seccomp(context);
if (r < 0) {
@@ -2472,7 +2561,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sProtectHome: %s\n"
"%sProtectSystem: %s\n"
"%sIgnoreSIGPIPE: %s\n"
- "%sMemoryDenyWriteExecute: %s\n",
+ "%sMemoryDenyWriteExecute: %s\n"
+ "%sRestrictRealtime: %s\n",
prefix, c->umask,
prefix, c->working_directory ? c->working_directory : "/",
prefix, c->root_directory ? c->root_directory : "/",
@@ -2483,7 +2573,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, protect_home_to_string(c->protect_home),
prefix, protect_system_to_string(c->protect_system),
prefix, yes_no(c->ignore_sigpipe),
- prefix, yes_no(c->memory_deny_write_execute));
+ prefix, yes_no(c->memory_deny_write_execute),
+ prefix, yes_no(c->restrict_realtime));
STRV_FOREACH(e, c->environment)
fprintf(f, "%sEnvironment: %s\n", prefix, *e);
diff --git a/src/core/execute.h b/src/core/execute.h
index cd1f7b36f6..210eea0e82 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -193,12 +193,14 @@ struct ExecContext {
char **runtime_directory;
mode_t runtime_directory_mode;
+ bool memory_deny_write_execute;
+ bool restrict_realtime;
+
bool oom_score_adjust_set:1;
bool nice_set:1;
bool ioprio_set:1;
bool cpu_sched_set:1;
bool no_new_privileges_set:1;
- bool memory_deny_write_execute;
};
#include "cgroup-util.h"
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index eb58586523..fe1006830b 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -56,11 +56,13 @@ m4_ifdef(`HAVE_SECCOMP',
$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs)
$1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context)
$1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute)
+$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime)
$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)',
`$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+$1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0')
$1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit)
$1.LimitFSIZE, config_parse_limit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit)
diff --git a/src/core/unit.c b/src/core/unit.c
index 581962eba6..0a1a5321df 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -3364,6 +3364,7 @@ int unit_write_drop_in(Unit *u, UnitSetPropertiesMode mode, const char *name, co
/* When this is a transient unit file in creation, then let's not create a new drop-in but instead
* write to the transient unit file. */
fputs(data, u->transient_file);
+ fputc('\n', u->transient_file);
return 0;
}