diff options
| -rw-r--r-- | man/systemd.exec.xml | 13 | ||||
| -rw-r--r-- | src/core/dbus-execute.c | 5 | ||||
| -rw-r--r-- | src/core/execute.c | 95 | ||||
| -rw-r--r-- | src/core/execute.h | 4 | ||||
| -rw-r--r-- | src/core/load-fragment-gperf.gperf.m4 | 2 | 
5 files changed, 114 insertions, 5 deletions
| diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index dbfc7692f7..ed02666daf 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1413,6 +1413,19 @@          </para></listitem>        </varlistentry> +      <varlistentry> +        <term><varname>RestrictRealtime=</varname></term> + +        <listitem><para>Takes a boolean argument. If set, any attempts to enable realtime scheduling in a process of +        the unit are refused. This restricts access to realtime task scheduling policies such as +        <constant>SCHED_FIFO</constant>, <constant>SCHED_RR</constant> or <constant>SCHED_DEADLINE</constant>. See +        <citerefentry><refentrytitle>sched</refentrytitle><manvolnum>7</manvolnum></citerefentry> for details about +        these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods +        of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It +        is hence recommended to restrict access to realtime scheduling to the few programs that actually require +        it. 
Defaults to off.</para></listitem> +      </varlistentry> +      </variablelist>    </refsect1> diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 4c88c41127..644b9561b5 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -720,6 +720,7 @@ const sd_bus_vtable bus_exec_vtable[] = {          SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, runtime_directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST), +        SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_VTABLE_END  }; @@ -1057,7 +1058,7 @@ int bus_exec_context_set_transient_property(          } else if (STR_IN_SET(name,                                "IgnoreSIGPIPE", "TTYVHangup", "TTYReset",                                "PrivateTmp", "PrivateDevices", "PrivateNetwork", -                              "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute")) { +                              "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime")) {                  int b;                  r = sd_bus_message_read(message, "b", &b); @@ -1083,6 +1084,8 @@ int bus_exec_context_set_transient_property(                                  c->syslog_level_prefix = b;                          else if (streq(name, "MemoryDenyWriteExecute"))                                  c->memory_deny_write_execute = b; +                        else if (streq(name, "RestrictRealtime")) +                                c->restrict_realtime = b;                          unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));    
              } diff --git a/src/core/execute.c b/src/core/execute.c index cf52355fc4..8cb18dbd5b 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1264,6 +1264,76 @@ finish:          return r;  } +static int apply_restrict_realtime(const ExecContext *c) { +        static const int permitted_policies[] = { +                SCHED_OTHER, +                SCHED_BATCH, +                SCHED_IDLE, +        }; + +        scmp_filter_ctx *seccomp; +        unsigned i; +        int r, p, max_policy = 0; + +        assert(c); + +        seccomp = seccomp_init(SCMP_ACT_ALLOW); +        if (!seccomp) +                return -ENOMEM; + +        /* Determine the highest policy constant we want to allow */ +        for (i = 0; i < ELEMENTSOF(permitted_policies); i++) +                if (permitted_policies[i] > max_policy) +                        max_policy = permitted_policies[i]; + +        /* Go through all policies with lower values than that, and block them -- unless they appear in the +         * whitelist. */ +        for (p = 0; p < max_policy; p++) { +                bool good = false; + +                /* Check if this is in the whitelist. */ +                for (i = 0; i < ELEMENTSOF(permitted_policies); i++) +                        if (permitted_policies[i] == p) { +                                good = true; +                                break; +                        } + +                if (good) +                        continue; + +                /* Deny this policy */ +                r = seccomp_rule_add( +                                seccomp, +                                SCMP_ACT_ERRNO(EPERM), +                                SCMP_SYS(sched_setscheduler), +                                1, +                                SCMP_A1(SCMP_CMP_EQ, p)); +                if (r < 0) +                        goto finish; +        } + +        /* Blacklist all other policies, i.e. the ones with higher values. 
Note that all comparisons are unsigned here, +         * hence no need to check for < 0 values. */ +        r = seccomp_rule_add( +                        seccomp, +                        SCMP_ACT_ERRNO(EPERM), +                        SCMP_SYS(sched_setscheduler), +                        1, +                        SCMP_A1(SCMP_CMP_GT, max_policy)); +        if (r < 0) +                goto finish; + +        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); +        if (r < 0) +                goto finish; + +        r = seccomp_load(seccomp); + +finish: +        seccomp_release(seccomp); +        return r; +} +  #endif  static void do_idle_pipe_dance(int idle_pipe[4]) { @@ -1962,6 +2032,14 @@ static int exec_child(                          }                  } +                /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */ +                if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) { +                        if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) { +                                *exit_status = EXIT_LIMITS; +                                return -errno; +                        } +                } +                  if (!cap_test_all(context->capability_bounding_set)) {                          r = capability_bounding_set_drop(context->capability_bounding_set, false);                          if (r < 0) { @@ -2017,7 +2095,7 @@ static int exec_child(                          }                  if (context->no_new_privileges || -                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || use_syscall_filter))) +                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))                          if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {                                  *exit_status = 
EXIT_NO_NEW_PRIVILEGES;                                  return -errno; @@ -2039,6 +2117,15 @@ static int exec_child(                                  return r;                          }                  } + +                if (context->restrict_realtime) { +                        r = apply_restrict_realtime(context); +                        if (r < 0) { +                                *exit_status = EXIT_SECCOMP; +                                return r; +                        } +                } +                  if (use_syscall_filter) {                          r = apply_seccomp(context);                          if (r < 0) { @@ -2474,7 +2561,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {                  "%sProtectHome: %s\n"                  "%sProtectSystem: %s\n"                  "%sIgnoreSIGPIPE: %s\n" -                "%sMemoryDenyWriteExecute: %s\n", +                "%sMemoryDenyWriteExecute: %s\n" +                "%sRestrictRealtime: %s\n",                  prefix, c->umask,                  prefix, c->working_directory ? c->working_directory : "/",                  prefix, c->root_directory ? 
c->root_directory : "/", @@ -2485,7 +2573,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {                  prefix, protect_home_to_string(c->protect_home),                  prefix, protect_system_to_string(c->protect_system),                  prefix, yes_no(c->ignore_sigpipe), -                prefix, yes_no(c->memory_deny_write_execute)); +                prefix, yes_no(c->memory_deny_write_execute), +                prefix, yes_no(c->restrict_realtime));          STRV_FOREACH(e, c->environment)                  fprintf(f, "%sEnvironment: %s\n", prefix, *e); diff --git a/src/core/execute.h b/src/core/execute.h index cd1f7b36f6..210eea0e82 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -193,12 +193,14 @@ struct ExecContext {          char **runtime_directory;          mode_t runtime_directory_mode; +        bool memory_deny_write_execute; +        bool restrict_realtime; +          bool oom_score_adjust_set:1;          bool nice_set:1;          bool ioprio_set:1;          bool cpu_sched_set:1;          bool no_new_privileges_set:1; -        bool memory_deny_write_execute;  };  #include "cgroup-util.h" diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index eb58586523..fe1006830b 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -56,11 +56,13 @@ m4_ifdef(`HAVE_SECCOMP',  $1.SystemCallArchitectures,      config_parse_syscall_archs,         0,                             offsetof($1, exec_context.syscall_archs)  $1.SystemCallErrorNumber,        config_parse_syscall_errno,         0,                             offsetof($1, exec_context)  $1.MemoryDenyWriteExecute,       config_parse_bool,                  0,                             offsetof($1, exec_context.memory_deny_write_execute) +$1.RestrictRealtime,             config_parse_bool,                  0,                             offsetof($1, exec_context.restrict_realtime)  
$1.RestrictAddressFamilies,      config_parse_address_families,      0,                             offsetof($1, exec_context)',  `$1.SystemCallFilter,            config_parse_warn_compat,           DISABLED_CONFIGURATION,        0  $1.SystemCallArchitectures,      config_parse_warn_compat,           DISABLED_CONFIGURATION,        0  $1.SystemCallErrorNumber,        config_parse_warn_compat,           DISABLED_CONFIGURATION,        0  $1.MemoryDenyWriteExecute,       config_parse_warn_compat,           DISABLED_CONFIGURATION,        0 +$1.RestrictRealtime,             config_parse_warn_compat,           DISABLED_CONFIGURATION,        0  $1.RestrictAddressFamilies,      config_parse_warn_compat,           DISABLED_CONFIGURATION,        0')  $1.LimitCPU,                     config_parse_limit,                 RLIMIT_CPU,                    offsetof($1, exec_context.rlimit)  $1.LimitFSIZE,                   config_parse_limit,                 RLIMIT_FSIZE,                  offsetof($1, exec_context.rlimit) | 
