diff options
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | man/systemd.resource-control.xml | 71 | ||||
-rw-r--r-- | src/basic/missing.h | 12 | ||||
-rw-r--r-- | src/core/cgroup.c | 63 | ||||
-rw-r--r-- | src/core/cgroup.h | 4 | ||||
-rw-r--r-- | src/core/dbus-cgroup.c | 82 | ||||
-rw-r--r-- | src/core/dbus-execute.c | 2 | ||||
-rw-r--r-- | src/core/load-fragment-gperf.gperf.m4 | 3 | ||||
-rw-r--r-- | src/core/load-fragment.c | 25 | ||||
-rw-r--r-- | src/login/org.freedesktop.login1.policy.in | 2 | ||||
-rw-r--r-- | src/nspawn/nspawn-patch-uid.c | 15 | ||||
-rw-r--r-- | src/nspawn/nspawn-seccomp.c | 143 | ||||
-rw-r--r-- | src/nspawn/nspawn-seccomp.h | 24 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 112 | ||||
-rw-r--r-- | src/shared/bus-unit-util.c | 6 | ||||
-rw-r--r-- | src/systemctl/systemctl.c | 39 |
16 files changed, 434 insertions, 171 deletions
diff --git a/Makefile.am b/Makefile.am index 305099ab66..f8e1fac967 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3016,6 +3016,8 @@ systemd_nspawn_SOURCES = \ src/nspawn/nspawn-expose-ports.h \ src/nspawn/nspawn-cgroup.c \ src/nspawn/nspawn-cgroup.h \ + src/nspawn/nspawn-seccomp.c \ + src/nspawn/nspawn-seccomp.h \ src/nspawn/nspawn-register.c \ src/nspawn/nspawn-register.h \ src/nspawn/nspawn-setuid.c \ diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 066f2cc19b..570619a743 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -114,6 +114,13 @@ prefixed ones. On unified hierarchy, IO resource control also applies to buffered writes.</para> </listitem> </varlistentry> + <varlistentry> + <term><option>Memory</option></term> + <listitem> + <para><varname>MemoryMax</varname> replaces <varname>MemoryLimit</varname>. <varname>MemoryLow</varname> + and <varname>MemoryHigh</varname> are effective only on unified hierarchy.</para> + </listitem> + </varlistentry> </variablelist> </para> @@ -213,6 +220,67 @@ </varlistentry> <varlistentry> + <term><varname>MemoryLow=<replaceable>bytes</replaceable></varname></term> + + <listitem> + <para>Specify the best-effort memory usage protection of the executed processes in this unit. If the memory + usages of this unit and all its ancestors are below their low boundaries, this unit's memory won't be + reclaimed as long as memory can be reclaimed from unprotected units.</para> + + <para>Takes a memory size in bytes. If the value is suffixed with K, M, G or T, the specified memory size is + parsed as Kilobytes, Megabytes, Gigabytes, or Terabytes (with the base 1024), respectively. This controls the + <literal>memory.low</literal> control group attribute. For details about this control group attribute, see + <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v2.txt">cgroup-v2.txt</ulink>.</para> + + <para>Implies <literal>MemoryAccounting=true</literal>.</para> + + <para>This setting is supported only if the unified control group hierarchy is used.</para> + </listitem> + </varlistentry> + + <varlistentry> + <term><varname>MemoryHigh=<replaceable>bytes</replaceable></varname></term> + + <listitem> + <para>Specify the high limit on memory usage of the executed processes in this unit. Memory usage may go + above the limit if unavoidable, but the processes are heavily slowed down and memory is taken away + aggressively in such cases. This is the main mechanism to control memory usage of a unit.</para> + + <para>Takes a memory size in bytes. If the value is suffixed with K, M, G or T, the specified memory size is + parsed as Kilobytes, Megabytes, Gigabytes, or Terabytes (with the base 1024), respectively. If assigned the + special value <literal>max</literal>, no memory limit is applied. This controls the + <literal>memory.high</literal> control group attribute. For details about this control group attribute, see + <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v2.txt">cgroup-v2.txt</ulink>.</para> + + <para>Implies <literal>MemoryAccounting=true</literal>.</para> + + <para>This setting is supported only if the unified control group hierarchy is used.</para> + </listitem> + </varlistentry> + + <varlistentry> + <term><varname>MemoryMax=<replaceable>bytes</replaceable></varname></term> + + <listitem> + <para>Specify the absolute limit on memory usage of the executed processes in this unit. If memory usage + cannot be contained under the limit, out-of-memory killer is invoked inside the unit. It is recommended to + use <varname>MemoryHigh=</varname> as the main control mechanism and use <varname>MemoryMax=</varname> as the + last line of defense.</para> + + <para>Takes a memory size in bytes. If the value is suffixed with K, M, G or T, the specified memory size is + parsed as Kilobytes, Megabytes, Gigabytes, or Terabytes (with the base 1024), respectively. If assigned the + special value <literal>max</literal>, no memory limit is applied. This controls the + <literal>memory.max</literal> control group attribute. For details about this control group attribute, see + <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v2.txt">cgroup-v2.txt</ulink>.</para> + + <para>Implies <literal>MemoryAccounting=true</literal>.</para> + + <para>This setting is supported only if the unified control group hierarchy is used. Use + <varname>MemoryLimit=</varname> on systems using the legacy control group hierarchy.</para> + </listitem> + </varlistentry> + + <varlistentry> <term><varname>MemoryLimit=<replaceable>bytes</replaceable></varname></term> <listitem> @@ -230,6 +298,9 @@ url="https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt">memory.txt</ulink>.</para> <para>Implies <literal>MemoryAccounting=true</literal>.</para> + + <para>This setting is supported only if the legacy control group hierarchy is used. Use + <varname>MemoryMax=</varname> on systems using the unified control group hierarchy.</para> </listitem> </varlistentry> diff --git a/src/basic/missing.h b/src/basic/missing.h index 651e414395..2077ada72d 100644 --- a/src/basic/missing.h +++ b/src/basic/missing.h @@ -453,6 +453,18 @@ struct btrfs_ioctl_quota_ctl_args { #define MQUEUE_MAGIC 0x19800202 #endif +#ifndef SECURITYFS_MAGIC +#define SECURITYFS_MAGIC 0x73636673 +#endif + +#ifndef TRACEFS_MAGIC +#define TRACEFS_MAGIC 0x74726163 +#endif + +#ifndef BPF_FS_MAGIC +#define BPF_FS_MAGIC 0xcafe4a11 +#endif + #ifndef MS_MOVE #define MS_MOVE 8192 #endif diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 0fb63b1bd1..fbe69df4e9 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -46,7 +46,10 @@ void cgroup_context_init(CGroupContext *c) { c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID; c->cpu_quota_per_sec_usec = USEC_INFINITY; - c->memory_limit = (uint64_t) -1; + c->memory_high = CGROUP_LIMIT_MAX; + c->memory_max = CGROUP_LIMIT_MAX; + + c->memory_limit = CGROUP_LIMIT_MAX; c->io_weight = CGROUP_WEIGHT_INVALID; c->startup_io_weight = CGROUP_WEIGHT_INVALID; @@ -147,6 +150,9 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { "%sStartupIOWeight=%" PRIu64 "\n" "%sBlockIOWeight=%" PRIu64 "\n" "%sStartupBlockIOWeight=%" PRIu64 "\n" + "%sMemoryLow=%" PRIu64 "\n" + "%sMemoryHigh=%" PRIu64 "\n" + "%sMemoryMax=%" PRIu64 "\n" "%sMemoryLimit=%" PRIu64 "\n" "%sTasksMax=%" PRIu64 "\n" "%sDevicePolicy=%s\n" @@ -163,6 +169,9 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { prefix, c->startup_io_weight, prefix, c->blockio_weight, prefix, c->startup_blockio_weight, + prefix, c->memory_low, + prefix, c->memory_high, + prefix, c->memory_max, prefix, c->memory_limit, prefix, c->tasks_max, prefix, cgroup_device_policy_to_string(c->device_policy), @@ -496,6 +505,23 @@ static unsigned cgroup_apply_blkio_device_limit(const char *path, const char *de return n; } +static bool cgroup_context_has_unified_memory_config(CGroupContext *c) { + return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX; +} + +static void cgroup_apply_unified_memory_limit(const char *path, const char *file, uint64_t v) { + char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max"; + int r; + + if (v != CGROUP_LIMIT_MAX) + xsprintf(buf, "%" PRIu64 "\n", v); + + r = cg_set_attribute("memory", path, file, buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to set %s on %s: %m", file, path); +} + void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) { bool is_root; int r; @@ -662,26 +688,30 @@ void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, M } if ((mask & CGROUP_MASK_MEMORY) && !is_root) { - if (c->memory_limit != (uint64_t) -1) { - char buf[DECIMAL_STR_MAX(uint64_t) + 1]; - - sprintf(buf, "%" PRIu64 "\n", c->memory_limit); + if (cg_unified() > 0) { + uint64_t max = c->memory_max; - if (cg_unified() <= 0) - r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf); + if (cgroup_context_has_unified_memory_config(c)) + max = c->memory_max; else - r = cg_set_attribute("memory", path, "memory.max", buf); + max = c->memory_limit; + cgroup_apply_unified_memory_limit(path, "memory.low", c->memory_low); + cgroup_apply_unified_memory_limit(path, "memory.high", c->memory_high); + cgroup_apply_unified_memory_limit(path, "memory.max", max); } else { - if (cg_unified() <= 0) - r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1"); + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + + if (c->memory_limit != CGROUP_LIMIT_MAX) + xsprintf(buf, "%" PRIu64 "\n", c->memory_limit); else - r = cg_set_attribute("memory", path, "memory.max", "max"); - } + xsprintf(buf, "%" PRIu64 "\n", c->memory_max); - if (r < 0) - log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path); + r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to set memory.limit_in_bytes on %s: %m", path); + } } if ((mask & CGROUP_MASK_DEVICES) && !is_root) { @@ -778,7 +808,8 @@ CGroupMask cgroup_context_get_mask(CGroupContext *c) { mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO; if (c->memory_accounting || - c->memory_limit != (uint64_t) -1) + c->memory_limit != CGROUP_LIMIT_MAX || + cgroup_context_has_unified_memory_config(c)) mask |= CGROUP_MASK_MEMORY; if (c->device_allow || diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 2b1edbafc4..ff87adfba1 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -94,6 +94,10 @@ struct CGroupContext { LIST_HEAD(CGroupIODeviceWeight, io_device_weights); LIST_HEAD(CGroupIODeviceLimit, io_device_limits); + uint64_t memory_low; + uint64_t memory_high; + uint64_t memory_max; + /* For legacy hierarchies */ uint64_t cpu_shares; uint64_t startup_cpu_shares; diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index eef1c47c14..27050b4507 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -228,6 +228,9 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("BlockIOReadBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0), SD_BUS_PROPERTY("BlockIOWriteBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0), SD_BUS_PROPERTY("MemoryAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, memory_accounting), 0), + SD_BUS_PROPERTY("MemoryLow", "t", NULL, offsetof(CGroupContext, memory_low), 0), + SD_BUS_PROPERTY("MemoryHigh", "t", NULL, offsetof(CGroupContext, memory_high), 0), + SD_BUS_PROPERTY("MemoryMax", "t", NULL, offsetof(CGroupContext, memory_max), 0), SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0), SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0), SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0), @@ -260,7 +263,7 @@ static int bus_cgroup_set_transient_property( if (mode != UNIT_CHECK) { c->delegate = b; - unit_write_drop_in_private(u, mode, name, b ? "Delegate=yes" : "Delegate=no"); + unit_write_drop_in_private(u, mode, name, b ? "Delegate=yes\n" : "Delegate=no\n"); } return 1; @@ -295,7 +298,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->cpu_accounting = b; unit_invalidate_cgroup(u, CGROUP_MASK_CPUACCT|CGROUP_MASK_CPU); - unit_write_drop_in_private(u, mode, name, b ? "CPUAccounting=yes" : "CPUAccounting=no"); + unit_write_drop_in_private(u, mode, name, b ? "CPUAccounting=yes\n" : "CPUAccounting=no\n"); } return 1; @@ -315,9 +318,9 @@ int bus_cgroup_set_property( unit_invalidate_cgroup(u, CGROUP_MASK_CPU); if (shares == CGROUP_CPU_SHARES_INVALID) - unit_write_drop_in_private(u, mode, name, "CPUShares="); + unit_write_drop_in_private(u, mode, name, "CPUShares=\n"); else - unit_write_drop_in_private_format(u, mode, name, "CPUShares=%" PRIu64, shares); + unit_write_drop_in_private_format(u, mode, name, "CPUShares=%" PRIu64 "\n", shares); } return 1; @@ -337,9 +340,9 @@ int bus_cgroup_set_property( unit_invalidate_cgroup(u, CGROUP_MASK_CPU); if (shares == CGROUP_CPU_SHARES_INVALID) - unit_write_drop_in_private(u, mode, name, "StartupCPUShares="); + unit_write_drop_in_private(u, mode, name, "StartupCPUShares=\n"); else - unit_write_drop_in_private_format(u, mode, name, "StartupCPUShares=%" PRIu64, shares); + unit_write_drop_in_private_format(u, mode, name, "StartupCPUShares=%" PRIu64 "\n", shares); } return 1; @@ -357,7 +360,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->cpu_quota_per_sec_usec = u64; unit_invalidate_cgroup(u, CGROUP_MASK_CPU); - unit_write_drop_in_private_format(u, mode, "CPUQuota", "CPUQuota=%0.f%%", (double) (c->cpu_quota_per_sec_usec / 10000)); + unit_write_drop_in_private_format(u, mode, "CPUQuota", "CPUQuota=%0.f%%\n", (double) (c->cpu_quota_per_sec_usec / 10000)); } return 1; @@ -372,7 +375,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->io_accounting = b; unit_invalidate_cgroup(u, CGROUP_MASK_IO); - unit_write_drop_in_private(u, mode, name, b ? "IOAccounting=yes" : "IOAccounting=no"); + unit_write_drop_in_private(u, mode, name, b ? "IOAccounting=yes\n" : "IOAccounting=no\n"); } return 1; @@ -392,9 +395,9 @@ int bus_cgroup_set_property( unit_invalidate_cgroup(u, CGROUP_MASK_IO); if (weight == CGROUP_WEIGHT_INVALID) - unit_write_drop_in_private(u, mode, name, "IOWeight="); + unit_write_drop_in_private(u, mode, name, "IOWeight=\n"); else - unit_write_drop_in_private_format(u, mode, name, "IOWeight=%" PRIu64, weight); + unit_write_drop_in_private_format(u, mode, name, "IOWeight=%" PRIu64 "\n", weight); } return 1; @@ -414,9 +417,9 @@ int bus_cgroup_set_property( unit_invalidate_cgroup(u, CGROUP_MASK_IO); if (weight == CGROUP_WEIGHT_INVALID) - unit_write_drop_in_private(u, mode, name, "StartupIOWeight="); + unit_write_drop_in_private(u, mode, name, "StartupIOWeight=\n"); else - unit_write_drop_in_private_format(u, mode, name, "StartupIOWeight=%" PRIu64, weight); + unit_write_drop_in_private_format(u, mode, name, "StartupIOWeight=%" PRIu64 "\n", weight); } return 1; @@ -589,7 +592,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->blockio_accounting = b; unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO); - unit_write_drop_in_private(u, mode, name, b ? "BlockIOAccounting=yes" : "BlockIOAccounting=no"); + unit_write_drop_in_private(u, mode, name, b ? "BlockIOAccounting=yes\n" : "BlockIOAccounting=no\n"); } return 1; @@ -609,9 +612,9 @@ int bus_cgroup_set_property( unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO); if (weight == CGROUP_BLKIO_WEIGHT_INVALID) - unit_write_drop_in_private(u, mode, name, "BlockIOWeight="); + unit_write_drop_in_private(u, mode, name, "BlockIOWeight=\n"); else - unit_write_drop_in_private_format(u, mode, name, "BlockIOWeight=%" PRIu64, weight); + unit_write_drop_in_private_format(u, mode, name, "BlockIOWeight=%" PRIu64 "\n", weight); } return 1; @@ -623,7 +626,7 @@ int bus_cgroup_set_property( if (r < 0) return r; - if (CGROUP_BLKIO_WEIGHT_IS_OK(weight)) + if (!CGROUP_BLKIO_WEIGHT_IS_OK(weight)) return sd_bus_error_set_errnof(error, EINVAL, "StartupBlockIOWeight value out of range"); if (mode != UNIT_CHECK) { @@ -631,9 +634,9 @@ int bus_cgroup_set_property( unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO); if (weight == CGROUP_BLKIO_WEIGHT_INVALID) - unit_write_drop_in_private(u, mode, name, "StartupBlockIOWeight="); + unit_write_drop_in_private(u, mode, name, "StartupBlockIOWeight=\n"); else - unit_write_drop_in_private_format(u, mode, name, "StartupBlockIOWeight=%" PRIu64, weight); + unit_write_drop_in_private_format(u, mode, name, "StartupBlockIOWeight=%" PRIu64 "\n", weight); } return 1; @@ -821,7 +824,32 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->memory_accounting = b; unit_invalidate_cgroup(u, CGROUP_MASK_MEMORY); - unit_write_drop_in_private(u, mode, name, b ? "MemoryAccounting=yes" : "MemoryAccounting=no"); + unit_write_drop_in_private(u, mode, name, b ? "MemoryAccounting=yes\n" : "MemoryAccounting=no\n"); + } + + return 1; + + } else if (STR_IN_SET(name, "MemoryLow", "MemoryHigh", "MemoryMax")) { + uint64_t v; + + r = sd_bus_message_read(message, "t", &v); + if (r < 0) + return r; + + if (mode != UNIT_CHECK) { + if (streq(name, "MemoryLow")) + c->memory_low = v; + else if (streq(name, "MemoryHigh")) + c->memory_high = v; + else + c->memory_max = v; + + unit_invalidate_cgroup(u, CGROUP_MASK_MEMORY); + + if (v == CGROUP_LIMIT_MAX) + unit_write_drop_in_private_format(u, mode, name, "%s=max\n", name); + else + unit_write_drop_in_private_format(u, mode, name, "%s=%" PRIu64 "\n", name, v); } return 1; @@ -838,9 +866,9 @@ int bus_cgroup_set_property( unit_invalidate_cgroup(u, CGROUP_MASK_MEMORY); if (limit == (uint64_t) -1) - unit_write_drop_in_private(u, mode, name, "MemoryLimit=infinity"); + unit_write_drop_in_private(u, mode, name, "MemoryLimit=infinity\n"); else - unit_write_drop_in_private_format(u, mode, name, "MemoryLimit=%" PRIu64, limit); + unit_write_drop_in_private_format(u, mode, name, "MemoryLimit=%" PRIu64 "\n", limit); } return 1; @@ -858,13 +886,9 @@ int bus_cgroup_set_property( return -EINVAL; if (mode != UNIT_CHECK) { - char *buf; - c->device_policy = p; unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES); - - buf = strjoina("DevicePolicy=", policy); - unit_write_drop_in_private(u, mode, name, buf); + unit_write_drop_in_private_format(u, mode, name, "DevicePolicy=%s\n", policy); } return 1; @@ -968,7 +992,7 @@ int bus_cgroup_set_property( if (mode != UNIT_CHECK) { c->tasks_accounting = b; unit_invalidate_cgroup(u, CGROUP_MASK_PIDS); - unit_write_drop_in_private(u, mode, name, b ? "TasksAccounting=yes" : "TasksAccounting=no"); + unit_write_drop_in_private(u, mode, name, b ? "TasksAccounting=yes\n" : "TasksAccounting=no\n"); } return 1; @@ -985,9 +1009,9 @@ int bus_cgroup_set_property( unit_invalidate_cgroup(u, CGROUP_MASK_PIDS); if (limit == (uint64_t) -1) - unit_write_drop_in_private(u, mode, name, "TasksMax=infinity"); + unit_write_drop_in_private(u, mode, name, "TasksMax=infinity\n"); else - unit_write_drop_in_private_format(u, mode, name, "TasksMax=%" PRIu64, limit); + unit_write_drop_in_private_format(u, mode, name, "TasksMax=%" PRIu64 "\n", limit); } return 1; diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 06943c6365..e21956def1 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -987,7 +987,7 @@ int bus_exec_context_set_transient_property( } c->working_directory_missing_ok = missing_ok; - unit_write_drop_in_private_format(u, mode, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s); + unit_write_drop_in_private_format(u, mode, name, "WorkingDirectory=%s%s\n", missing_ok ? "-" : "", s); } return 1; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 8193418980..00bdc238ce 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -117,6 +117,9 @@ $1.CPUShares, config_parse_cpu_shares, 0, $1.StartupCPUShares, config_parse_cpu_shares, 0, offsetof($1, cgroup_context.startup_cpu_shares) $1.CPUQuota, config_parse_cpu_quota, 0, offsetof($1, cgroup_context) $1.MemoryAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.memory_accounting) +$1.MemoryLow, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryHigh, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryMax, config_parse_memory_limit, 0, offsetof($1, cgroup_context) $1.MemoryLimit, config_parse_memory_limit, 0, offsetof($1, cgroup_context) $1.DeviceAllow, config_parse_device_allow, 0, offsetof($1, cgroup_context) $1.DevicePolicy, config_parse_device_policy, 0, offsetof($1, cgroup_context.device_policy) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 86b4fb071b..09d3f65c77 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -2793,21 +2793,26 @@ int config_parse_memory_limit( void *userdata) { CGroupContext *c = data; - uint64_t bytes; + uint64_t bytes = CGROUP_LIMIT_MAX; int r; - if (isempty(rvalue) || streq(rvalue, "infinity")) { - c->memory_limit = (uint64_t) -1; - return 0; + if (!isempty(rvalue) && !streq(rvalue, "infinity") && !streq(rvalue, "max")) { + r = parse_size(rvalue, 1024, &bytes); + if (r < 0 || bytes < 1) { + log_syntax(unit, LOG_ERR, filename, line, r, "Memory limit '%s' invalid. Ignoring.", rvalue); + return 0; + } } - r = parse_size(rvalue, 1024, &bytes); - if (r < 0 || bytes < 1) { - log_syntax(unit, LOG_ERR, filename, line, r, "Memory limit '%s' invalid. Ignoring.", rvalue); - return 0; - } + if (streq(lvalue, "MemoryLow")) + c->memory_low = bytes; + else if (streq(lvalue, "MemoryHigh")) + c->memory_high = bytes; + else if (streq(lvalue, "MemoryMax")) + c->memory_max = bytes; + else + c->memory_limit = bytes; - c->memory_limit = bytes; return 0; } diff --git a/src/login/org.freedesktop.login1.policy.in b/src/login/org.freedesktop.login1.policy.in index 1fa6441629..66cbce393c 100644 --- a/src/login/org.freedesktop.login1.policy.in +++ b/src/login/org.freedesktop.login1.policy.in @@ -116,6 +116,8 @@ <_message>Explicit request is required to run programs as a non-logged-in user.</_message> <defaults> <allow_any>yes</allow_any> + <allow_inactive>yes</allow_inactive> + <allow_active>yes</allow_active> </defaults> </action> diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c index c7382d412d..cc79597c95 100644 --- a/src/nspawn/nspawn-patch-uid.c +++ b/src/nspawn/nspawn-patch-uid.c @@ -280,7 +280,13 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift return r > 0 || changed; } -static int is_procfs_sysfs_or_suchlike(int fd) { +/* + * Check if the filesystem is fully compatible with user namespaces or + * UID/GID patching. Some filesystems in this list can be fully mounted inside + * user namespaces, however their inodes may relate to host resources or only + * valid in the global user namespace, therefore no patching should be applied. + */ +static int is_fs_fully_userns_compatible(int fd) { struct statfs sfs; assert(fd >= 0); @@ -300,6 +306,9 @@ static int is_procfs_sysfs_or_suchlike(int fd) { F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) || F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) || F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SECURITYFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, BPF_FS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, TRACEFS_MAGIC) || F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC); } @@ -311,8 +320,8 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we * probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's - * stop the recursion when we hit a procfs or sysfs file system. */ - r = is_procfs_sysfs_or_suchlike(fd); + * stop the recursion when we hit procfs, sysfs or some other special file systems. */ + r = is_fs_fully_userns_compatible(fd); if (r < 0) goto finish; if (r > 0) { diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c new file mode 100644 index 0000000000..2d145b68a7 --- /dev/null +++ b/src/nspawn/nspawn-seccomp.c @@ -0,0 +1,143 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <errno.h> +#include <linux/netlink.h> +#include <sys/capability.h> +#include <sys/types.h> + +#ifdef HAVE_SECCOMP +#include <seccomp.h> +#endif + +#include "log.h" + +#ifdef HAVE_SECCOMP +#include "seccomp-util.h" +#endif + +#include "nspawn-seccomp.h" + +#ifdef HAVE_SECCOMP + +static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx, + uint64_t cap_list_retain) { + unsigned i; + int r; + static const struct { + uint64_t capability; + int syscall_num; + } blacklist[] = { + { CAP_SYS_RAWIO, SCMP_SYS(iopl) }, + { CAP_SYS_RAWIO, SCMP_SYS(ioperm) }, + { CAP_SYS_BOOT, SCMP_SYS(kexec_load) }, + { CAP_SYS_ADMIN, SCMP_SYS(swapon) }, + { CAP_SYS_ADMIN, SCMP_SYS(swapoff) }, + { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) }, + { CAP_SYS_MODULE, SCMP_SYS(init_module) }, + { CAP_SYS_MODULE, SCMP_SYS(finit_module) }, + { CAP_SYS_MODULE, SCMP_SYS(delete_module) }, + { CAP_SYSLOG, SCMP_SYS(syslog) }, + }; + + for (i = 0; i < ELEMENTSOF(blacklist); i++) { + if (cap_list_retain & (1ULL << blacklist[i].capability)) + continue; + + r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); + if (r == -EFAULT) + continue; /* unknown syscall */ + if (r < 0) { + log_error_errno(r, "Failed to block syscall: %m"); + return r; + } + } + + return 0; +} + +int setup_seccomp(uint64_t cap_list_retain) { + scmp_filter_ctx seccomp; + int r; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return log_oom(); + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) { + log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); + goto finish; + } + + r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain); + if (r < 0) + goto finish; + + /* + Audit is broken in containers, much of the userspace audit + hookup will fail if running inside a container. We don't + care and just turn off creation of audit sockets. + + This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail + with EAFNOSUPPORT which audit userspace uses as indication + that audit is disabled in the kernel. + */ + + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 2, + SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), + SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); + if (r < 0) { + log_error_errno(r, "Failed to add audit seccomp rule: %m"); + goto finish; + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) { + log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); + goto finish; + } + + r = seccomp_load(seccomp); + if (r == -EINVAL) { + log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m"); + r = 0; + goto finish; + } + if (r < 0) { + log_error_errno(r, "Failed to install seccomp audit filter: %m"); + goto finish; + } + +finish: + seccomp_release(seccomp); + return r; +} + +#else + +int setup_seccomp(uint64_t cap_list_retain) { + return 0; +} + +#endif diff --git a/src/nspawn/nspawn-seccomp.h b/src/nspawn/nspawn-seccomp.h new file mode 100644 index 0000000000..5bde16faf9 --- /dev/null +++ b/src/nspawn/nspawn-seccomp.h @@ -0,0 +1,24 @@ +#pragma once + +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <sys/types.h> + +int setup_seccomp(uint64_t cap_list_retain); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index ac11bcea5a..b421c182ce 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -26,9 +26,6 @@ #include <linux/loop.h> #include <pwd.h> #include <sched.h> -#ifdef HAVE_SECCOMP -#include <seccomp.h> -#endif #ifdef HAVE_SELINUX #include <selinux/selinux.h> #endif @@ -82,15 +79,13 @@ #include "nspawn-settings.h" #include "nspawn-setuid.h" #include "nspawn-stub-pid1.h" +#include "nspawn-seccomp.h" #include "parse-util.h" #include "path-util.h" #include "process-util.h" #include "ptyfwd.h" #include "random-util.h" #include "rm-rf.h" -#ifdef HAVE_SECCOMP -#include "seccomp-util.h" -#endif #include "selinux-util.h" #include "signal-util.h" #include "socket-util.h" @@ -136,7 +131,7 @@ static StartMode arg_start_mode = START_PID1; static bool arg_ephemeral = false; static LinkJournal arg_link_journal = LINK_AUTO; static bool arg_link_journal_try = false; -static uint64_t arg_retain = +static uint64_t arg_caps_retain = (1ULL << CAP_CHOWN) | (1ULL << CAP_DAC_OVERRIDE) | (1ULL << CAP_DAC_READ_SEARCH) | @@ -1075,7 +1070,7 @@ static int parse_argv(int argc, char *argv[]) { if (mask_all_settings) arg_settings_mask = _SETTINGS_MASK_ALL; - arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; + arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; r = detect_unified_cgroup_hierarchy(); if (r < 0) @@ -1632,7 +1627,7 @@ static int setup_journal(const char *directory) { } static int drop_capabilities(void) { - return capability_bounding_set_drop(arg_retain, false); + return capability_bounding_set_drop(arg_caps_retain, false); } static int reset_audit_loginuid(void) { @@ -1667,99 +1662,6 @@ static int reset_audit_loginuid(void) { return 0; } -static int setup_seccomp(void) { - -#ifdef HAVE_SECCOMP - static const struct { - uint64_t capability; - int syscall_num; - } blacklist[] = { - { CAP_SYS_RAWIO, SCMP_SYS(iopl) }, - { CAP_SYS_RAWIO, SCMP_SYS(ioperm) }, - { CAP_SYS_BOOT, SCMP_SYS(kexec_load) }, - { CAP_SYS_ADMIN, SCMP_SYS(swapon) }, - { CAP_SYS_ADMIN, SCMP_SYS(swapoff) }, - { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) }, - { CAP_SYS_MODULE, SCMP_SYS(init_module) }, - { CAP_SYS_MODULE, SCMP_SYS(finit_module) }, - { CAP_SYS_MODULE, SCMP_SYS(delete_module) }, - { CAP_SYSLOG, SCMP_SYS(syslog) }, - }; - - scmp_filter_ctx seccomp; - unsigned i; - int r; - - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return log_oom(); - - r = seccomp_add_secondary_archs(seccomp); - if (r < 0) { - log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); - goto finish; - } - - for (i = 0; i < ELEMENTSOF(blacklist); i++) { - if (arg_retain & (1ULL << blacklist[i].capability)) - continue; - - r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); - if (r == -EFAULT) - continue; /* unknown syscall */ - if (r < 0) { - log_error_errno(r, "Failed to block syscall: %m"); - goto finish; - } - } - - /* - Audit is broken in containers, much of the userspace audit - hookup will fail if running inside a container. We don't - care and just turn off creation of audit sockets. - - This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail - with EAFNOSUPPORT which audit userspace uses as indication - that audit is disabled in the kernel. - */ - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EAFNOSUPPORT), - SCMP_SYS(socket), - 2, - SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), - SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); - if (r < 0) { - log_error_errno(r, "Failed to add audit seccomp rule: %m"); - goto finish; - } - - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) { - log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); - goto finish; - } - - r = seccomp_load(seccomp); - if (r == -EINVAL) { - log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m"); - r = 0; - goto finish; - } - if (r < 0) { - log_error_errno(r, "Failed to install seccomp audit filter: %m"); - goto finish; - } - -finish: - seccomp_release(seccomp); - return r; -#else - return 0; -#endif - -} static int setup_propagate(const char *root) { const char *p, *q; @@ -2988,7 +2890,7 @@ static int outer_child( if (r < 0) return r; - r = setup_seccomp(); + r = setup_seccomp(arg_caps_retain); if (r < 0) return r; @@ -3272,9 +3174,9 @@ static int load_settings(void) { if (settings->capability != 0) log_warning("Ignoring Capability= setting, file %s is not trusted.", p); } else - arg_retain |= plus; + arg_caps_retain |= plus; - arg_retain &= ~settings->drop_capability; + arg_caps_retain &= ~settings->drop_capability; } if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 && diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index f68c4a41ac..502e98d9dc 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -166,11 +166,11 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen r = sd_bus_message_append(m, "v", "b", r); - } else if (streq(field, "MemoryLimit")) { + } else if (STR_IN_SET(field, "MemoryLow", "MemoryHigh", "MemoryMax", "MemoryLimit")) { uint64_t bytes; - if (isempty(eq) || streq(eq, "infinity")) - bytes = (uint64_t) -1; + if (isempty(eq) || streq(eq, "max") || streq(eq, "infinity")) + bytes = CGROUP_LIMIT_MAX; else { r = parse_size(eq, 1024, &bytes); if (r < 0) { diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c index 86feefcb65..c301c6a64f 100644 --- a/src/systemctl/systemctl.c +++ b/src/systemctl/systemctl.c @@ -3499,6 +3499,9 @@ typedef struct UnitStatusInfo { /* CGroup */ uint64_t memory_current; + uint64_t memory_low; + uint64_t memory_high; + uint64_t memory_max; uint64_t memory_limit; uint64_t cpu_usage_nsec; uint64_t tasks_current; @@ -3781,10 +3784,30 @@ static void print_status_info( printf(" Memory: %s", format_bytes(buf, sizeof(buf), i->memory_current)); - if (i->memory_limit != (uint64_t) -1) - printf(" (limit: %s)\n", format_bytes(buf, sizeof(buf), i->memory_limit)); - else - printf("\n"); + if (i->memory_low > 0 || i->memory_high != CGROUP_LIMIT_MAX || i->memory_max != CGROUP_LIMIT_MAX || + i->memory_limit != CGROUP_LIMIT_MAX) { + const char *prefix = ""; + + printf(" ("); + if (i->memory_low > 0) { + printf("%slow: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_low)); + prefix = " "; + } + if (i->memory_high != CGROUP_LIMIT_MAX) { + printf("%shigh: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_high)); + prefix = " "; + } + if (i->memory_max != CGROUP_LIMIT_MAX) { + printf("%smax: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_max)); + prefix = " "; + } + if (i->memory_limit != CGROUP_LIMIT_MAX) { + printf("%slimit: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_limit)); + prefix = " "; + } + printf(")"); + } + printf("\n"); } if (i->cpu_usage_nsec != (uint64_t) -1) { @@ -4013,6 +4036,12 @@ static int status_property(const char *name, sd_bus_message *m, UnitStatusInfo * i->assert_timestamp = (usec_t) u; else if (streq(name, "MemoryCurrent")) i->memory_current = u; + else if (streq(name, "MemoryLow")) + i->memory_low = u; + else if (streq(name, "MemoryHigh")) + i->memory_high = u; + else if (streq(name, "MemoryMax")) + i->memory_max = u; else if (streq(name, "MemoryLimit")) i->memory_limit = u; else if (streq(name, "TasksCurrent")) @@ -4506,6 +4535,8 @@ static int show_one( _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; UnitStatusInfo info = { .memory_current = (uint64_t) -1, + .memory_high = CGROUP_LIMIT_MAX, + .memory_max = CGROUP_LIMIT_MAX, .memory_limit = (uint64_t) -1, .cpu_usage_nsec = (uint64_t) -1, .tasks_current = (uint64_t) -1, |