summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.am2
-rw-r--r--man/systemd.resource-control.xml71
-rw-r--r--src/basic/missing.h12
-rw-r--r--src/core/cgroup.c63
-rw-r--r--src/core/cgroup.h4
-rw-r--r--src/core/dbus-cgroup.c82
-rw-r--r--src/core/dbus-execute.c2
-rw-r--r--src/core/load-fragment-gperf.gperf.m43
-rw-r--r--src/core/load-fragment.c25
-rw-r--r--src/login/org.freedesktop.login1.policy.in2
-rw-r--r--src/nspawn/nspawn-patch-uid.c15
-rw-r--r--src/nspawn/nspawn-seccomp.c143
-rw-r--r--src/nspawn/nspawn-seccomp.h24
-rw-r--r--src/nspawn/nspawn.c112
-rw-r--r--src/shared/bus-unit-util.c6
-rw-r--r--src/systemctl/systemctl.c39
16 files changed, 434 insertions, 171 deletions
diff --git a/Makefile.am b/Makefile.am
index 305099ab66..f8e1fac967 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3016,6 +3016,8 @@ systemd_nspawn_SOURCES = \
src/nspawn/nspawn-expose-ports.h \
src/nspawn/nspawn-cgroup.c \
src/nspawn/nspawn-cgroup.h \
+ src/nspawn/nspawn-seccomp.c \
+ src/nspawn/nspawn-seccomp.h \
src/nspawn/nspawn-register.c \
src/nspawn/nspawn-register.h \
src/nspawn/nspawn-setuid.c \
diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml
index 066f2cc19b..570619a743 100644
--- a/man/systemd.resource-control.xml
+++ b/man/systemd.resource-control.xml
@@ -114,6 +114,13 @@
prefixed ones. On unified hierarchy, IO resource control also applies to buffered writes.</para>
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>Memory</option></term>
+ <listitem>
+ <para><varname>MemoryMax</varname> replaces <varname>MemoryLimit</varname>. <varname>MemoryLow</varname>
+ and <varname>MemoryHigh</varname> are effective only on unified hierarchy.</para>
+ </listitem>
+ </varlistentry>
</variablelist>
</para>
@@ -213,6 +220,67 @@
</varlistentry>
<varlistentry>
+ <term><varname>MemoryLow=<replaceable>bytes</replaceable></varname></term>
+
+ <listitem>
+ <para>Specify the best-effort memory usage protection of the executed processes in this unit. If the memory
+ usages of this unit and all its ancestors are below their low boundaries, this unit's memory won't be
+ reclaimed as long as memory can be reclaimed from unprotected units.</para>
+
+ <para>Takes a memory size in bytes. If the value is suffixed with K, M, G or T, the specified memory size is
+ parsed as Kilobytes, Megabytes, Gigabytes, or Terabytes (with the base 1024), respectively. This controls the
+ <literal>memory.low</literal> control group attribute. For details about this control group attribute, see
+ <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v2.txt">cgroup-v2.txt</ulink>.</para>
+
+ <para>Implies <literal>MemoryAccounting=true</literal>.</para>
+
+ <para>This setting is supported only if the unified control group hierarchy is used.</para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><varname>MemoryHigh=<replaceable>bytes</replaceable></varname></term>
+
+ <listitem>
+ <para>Specify the high limit on memory usage of the executed processes in this unit. Memory usage may go
+ above the limit if unavoidable, but the processes are heavily slowed down and memory is taken away
+ aggressively in such cases. This is the main mechanism to control memory usage of a unit.</para>
+
+ <para>Takes a memory size in bytes. If the value is suffixed with K, M, G or T, the specified memory size is
+ parsed as Kilobytes, Megabytes, Gigabytes, or Terabytes (with the base 1024), respectively. If assigned the
+ special value <literal>max</literal>, no memory limit is applied. This controls the
+ <literal>memory.high</literal> control group attribute. For details about this control group attribute, see
+ <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v2.txt">cgroup-v2.txt</ulink>.</para>
+
+ <para>Implies <literal>MemoryAccounting=true</literal>.</para>
+
+ <para>This setting is supported only if the unified control group hierarchy is used.</para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><varname>MemoryMax=<replaceable>bytes</replaceable></varname></term>
+
+ <listitem>
+ <para>Specify the absolute limit on memory usage of the executed processes in this unit. If memory usage
+ cannot be contained under the limit, out-of-memory killer is invoked inside the unit. It is recommended to
+ use <varname>MemoryHigh=</varname> as the main control mechanism and use <varname>MemoryMax=</varname> as the
+ last line of defense.</para>
+
+ <para>Takes a memory size in bytes. If the value is suffixed with K, M, G or T, the specified memory size is
+ parsed as Kilobytes, Megabytes, Gigabytes, or Terabytes (with the base 1024), respectively. If assigned the
+ special value <literal>max</literal>, no memory limit is applied. This controls the
+ <literal>memory.max</literal> control group attribute. For details about this control group attribute, see
+ <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v2.txt">cgroup-v2.txt</ulink>.</para>
+
+ <para>Implies <literal>MemoryAccounting=true</literal>.</para>
+
+ <para>This setting is supported only if the unified control group hierarchy is used. Use
+ <varname>MemoryLimit=</varname> on systems using the legacy control group hierarchy.</para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><varname>MemoryLimit=<replaceable>bytes</replaceable></varname></term>
<listitem>
@@ -230,6 +298,9 @@
url="https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt">memory.txt</ulink>.</para>
<para>Implies <literal>MemoryAccounting=true</literal>.</para>
+
+ <para>This setting is supported only if the legacy control group hierarchy is used. Use
+ <varname>MemoryMax=</varname> on systems using the unified control group hierarchy.</para>
</listitem>
</varlistentry>
diff --git a/src/basic/missing.h b/src/basic/missing.h
index 651e414395..2077ada72d 100644
--- a/src/basic/missing.h
+++ b/src/basic/missing.h
@@ -453,6 +453,18 @@ struct btrfs_ioctl_quota_ctl_args {
#define MQUEUE_MAGIC 0x19800202
#endif
+#ifndef SECURITYFS_MAGIC
+#define SECURITYFS_MAGIC 0x73636673
+#endif
+
+#ifndef TRACEFS_MAGIC
+#define TRACEFS_MAGIC 0x74726163
+#endif
+
+#ifndef BPF_FS_MAGIC
+#define BPF_FS_MAGIC 0xcafe4a11
+#endif
+
#ifndef MS_MOVE
#define MS_MOVE 8192
#endif
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 0fb63b1bd1..fbe69df4e9 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -46,7 +46,10 @@ void cgroup_context_init(CGroupContext *c) {
c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
c->cpu_quota_per_sec_usec = USEC_INFINITY;
- c->memory_limit = (uint64_t) -1;
+ c->memory_high = CGROUP_LIMIT_MAX;
+ c->memory_max = CGROUP_LIMIT_MAX;
+
+ c->memory_limit = CGROUP_LIMIT_MAX;
c->io_weight = CGROUP_WEIGHT_INVALID;
c->startup_io_weight = CGROUP_WEIGHT_INVALID;
@@ -147,6 +150,9 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
"%sStartupIOWeight=%" PRIu64 "\n"
"%sBlockIOWeight=%" PRIu64 "\n"
"%sStartupBlockIOWeight=%" PRIu64 "\n"
+ "%sMemoryLow=%" PRIu64 "\n"
+ "%sMemoryHigh=%" PRIu64 "\n"
+ "%sMemoryMax=%" PRIu64 "\n"
"%sMemoryLimit=%" PRIu64 "\n"
"%sTasksMax=%" PRIu64 "\n"
"%sDevicePolicy=%s\n"
@@ -163,6 +169,9 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
prefix, c->startup_io_weight,
prefix, c->blockio_weight,
prefix, c->startup_blockio_weight,
+ prefix, c->memory_low,
+ prefix, c->memory_high,
+ prefix, c->memory_max,
prefix, c->memory_limit,
prefix, c->tasks_max,
prefix, cgroup_device_policy_to_string(c->device_policy),
@@ -496,6 +505,23 @@ static unsigned cgroup_apply_blkio_device_limit(const char *path, const char *de
return n;
}
+static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
+ return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX;
+}
+
+static void cgroup_apply_unified_memory_limit(const char *path, const char *file, uint64_t v) {
+ char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
+ int r;
+
+ if (v != CGROUP_LIMIT_MAX)
+ xsprintf(buf, "%" PRIu64 "\n", v);
+
+ r = cg_set_attribute("memory", path, file, buf);
+ if (r < 0)
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set %s on %s: %m", file, path);
+}
+
void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
bool is_root;
int r;
@@ -662,26 +688,30 @@ void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, M
}
if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
- if (c->memory_limit != (uint64_t) -1) {
- char buf[DECIMAL_STR_MAX(uint64_t) + 1];
-
- sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
+ if (cg_unified() > 0) {
+ uint64_t max = c->memory_max;
- if (cg_unified() <= 0)
- r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
+ if (cgroup_context_has_unified_memory_config(c))
+ max = c->memory_max;
else
- r = cg_set_attribute("memory", path, "memory.max", buf);
+ max = c->memory_limit;
+ cgroup_apply_unified_memory_limit(path, "memory.low", c->memory_low);
+ cgroup_apply_unified_memory_limit(path, "memory.high", c->memory_high);
+ cgroup_apply_unified_memory_limit(path, "memory.max", max);
} else {
- if (cg_unified() <= 0)
- r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
+ char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+
+ if (c->memory_limit != CGROUP_LIMIT_MAX)
+ xsprintf(buf, "%" PRIu64 "\n", c->memory_limit);
else
- r = cg_set_attribute("memory", path, "memory.max", "max");
- }
+ xsprintf(buf, "%" PRIu64 "\n", c->memory_max);
- if (r < 0)
- log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
- "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
+ r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
+ if (r < 0)
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set memory.limit_in_bytes on %s: %m", path);
+ }
}
if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
@@ -778,7 +808,8 @@ CGroupMask cgroup_context_get_mask(CGroupContext *c) {
mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
if (c->memory_accounting ||
- c->memory_limit != (uint64_t) -1)
+ c->memory_limit != CGROUP_LIMIT_MAX ||
+ cgroup_context_has_unified_memory_config(c))
mask |= CGROUP_MASK_MEMORY;
if (c->device_allow ||
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
index 2b1edbafc4..ff87adfba1 100644
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@@ -94,6 +94,10 @@ struct CGroupContext {
LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
+ uint64_t memory_low;
+ uint64_t memory_high;
+ uint64_t memory_max;
+
/* For legacy hierarchies */
uint64_t cpu_shares;
uint64_t startup_cpu_shares;
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
index eef1c47c14..27050b4507 100644
--- a/src/core/dbus-cgroup.c
+++ b/src/core/dbus-cgroup.c
@@ -228,6 +228,9 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_PROPERTY("BlockIOReadBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
SD_BUS_PROPERTY("BlockIOWriteBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
SD_BUS_PROPERTY("MemoryAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, memory_accounting), 0),
+ SD_BUS_PROPERTY("MemoryLow", "t", NULL, offsetof(CGroupContext, memory_low), 0),
+ SD_BUS_PROPERTY("MemoryHigh", "t", NULL, offsetof(CGroupContext, memory_high), 0),
+ SD_BUS_PROPERTY("MemoryMax", "t", NULL, offsetof(CGroupContext, memory_max), 0),
SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0),
SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0),
SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0),
@@ -260,7 +263,7 @@ static int bus_cgroup_set_transient_property(
if (mode != UNIT_CHECK) {
c->delegate = b;
- unit_write_drop_in_private(u, mode, name, b ? "Delegate=yes" : "Delegate=no");
+ unit_write_drop_in_private(u, mode, name, b ? "Delegate=yes\n" : "Delegate=no\n");
}
return 1;
@@ -295,7 +298,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->cpu_accounting = b;
unit_invalidate_cgroup(u, CGROUP_MASK_CPUACCT|CGROUP_MASK_CPU);
- unit_write_drop_in_private(u, mode, name, b ? "CPUAccounting=yes" : "CPUAccounting=no");
+ unit_write_drop_in_private(u, mode, name, b ? "CPUAccounting=yes\n" : "CPUAccounting=no\n");
}
return 1;
@@ -315,9 +318,9 @@ int bus_cgroup_set_property(
unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
if (shares == CGROUP_CPU_SHARES_INVALID)
- unit_write_drop_in_private(u, mode, name, "CPUShares=");
+ unit_write_drop_in_private(u, mode, name, "CPUShares=\n");
else
- unit_write_drop_in_private_format(u, mode, name, "CPUShares=%" PRIu64, shares);
+ unit_write_drop_in_private_format(u, mode, name, "CPUShares=%" PRIu64 "\n", shares);
}
return 1;
@@ -337,9 +340,9 @@ int bus_cgroup_set_property(
unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
if (shares == CGROUP_CPU_SHARES_INVALID)
- unit_write_drop_in_private(u, mode, name, "StartupCPUShares=");
+ unit_write_drop_in_private(u, mode, name, "StartupCPUShares=\n");
else
- unit_write_drop_in_private_format(u, mode, name, "StartupCPUShares=%" PRIu64, shares);
+ unit_write_drop_in_private_format(u, mode, name, "StartupCPUShares=%" PRIu64 "\n", shares);
}
return 1;
@@ -357,7 +360,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->cpu_quota_per_sec_usec = u64;
unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
- unit_write_drop_in_private_format(u, mode, "CPUQuota", "CPUQuota=%0.f%%", (double) (c->cpu_quota_per_sec_usec / 10000));
+ unit_write_drop_in_private_format(u, mode, "CPUQuota", "CPUQuota=%0.f%%\n", (double) (c->cpu_quota_per_sec_usec / 10000));
}
return 1;
@@ -372,7 +375,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->io_accounting = b;
unit_invalidate_cgroup(u, CGROUP_MASK_IO);
- unit_write_drop_in_private(u, mode, name, b ? "IOAccounting=yes" : "IOAccounting=no");
+ unit_write_drop_in_private(u, mode, name, b ? "IOAccounting=yes\n" : "IOAccounting=no\n");
}
return 1;
@@ -392,9 +395,9 @@ int bus_cgroup_set_property(
unit_invalidate_cgroup(u, CGROUP_MASK_IO);
if (weight == CGROUP_WEIGHT_INVALID)
- unit_write_drop_in_private(u, mode, name, "IOWeight=");
+ unit_write_drop_in_private(u, mode, name, "IOWeight=\n");
else
- unit_write_drop_in_private_format(u, mode, name, "IOWeight=%" PRIu64, weight);
+ unit_write_drop_in_private_format(u, mode, name, "IOWeight=%" PRIu64 "\n", weight);
}
return 1;
@@ -414,9 +417,9 @@ int bus_cgroup_set_property(
unit_invalidate_cgroup(u, CGROUP_MASK_IO);
if (weight == CGROUP_WEIGHT_INVALID)
- unit_write_drop_in_private(u, mode, name, "StartupIOWeight=");
+ unit_write_drop_in_private(u, mode, name, "StartupIOWeight=\n");
else
- unit_write_drop_in_private_format(u, mode, name, "StartupIOWeight=%" PRIu64, weight);
+ unit_write_drop_in_private_format(u, mode, name, "StartupIOWeight=%" PRIu64 "\n", weight);
}
return 1;
@@ -589,7 +592,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->blockio_accounting = b;
unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO);
- unit_write_drop_in_private(u, mode, name, b ? "BlockIOAccounting=yes" : "BlockIOAccounting=no");
+ unit_write_drop_in_private(u, mode, name, b ? "BlockIOAccounting=yes\n" : "BlockIOAccounting=no\n");
}
return 1;
@@ -609,9 +612,9 @@ int bus_cgroup_set_property(
unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO);
if (weight == CGROUP_BLKIO_WEIGHT_INVALID)
- unit_write_drop_in_private(u, mode, name, "BlockIOWeight=");
+ unit_write_drop_in_private(u, mode, name, "BlockIOWeight=\n");
else
- unit_write_drop_in_private_format(u, mode, name, "BlockIOWeight=%" PRIu64, weight);
+ unit_write_drop_in_private_format(u, mode, name, "BlockIOWeight=%" PRIu64 "\n", weight);
}
return 1;
@@ -623,7 +626,7 @@ int bus_cgroup_set_property(
if (r < 0)
return r;
- if (CGROUP_BLKIO_WEIGHT_IS_OK(weight))
+ if (!CGROUP_BLKIO_WEIGHT_IS_OK(weight))
return sd_bus_error_set_errnof(error, EINVAL, "StartupBlockIOWeight value out of range");
if (mode != UNIT_CHECK) {
@@ -631,9 +634,9 @@ int bus_cgroup_set_property(
unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO);
if (weight == CGROUP_BLKIO_WEIGHT_INVALID)
- unit_write_drop_in_private(u, mode, name, "StartupBlockIOWeight=");
+ unit_write_drop_in_private(u, mode, name, "StartupBlockIOWeight=\n");
else
- unit_write_drop_in_private_format(u, mode, name, "StartupBlockIOWeight=%" PRIu64, weight);
+ unit_write_drop_in_private_format(u, mode, name, "StartupBlockIOWeight=%" PRIu64 "\n", weight);
}
return 1;
@@ -821,7 +824,32 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->memory_accounting = b;
unit_invalidate_cgroup(u, CGROUP_MASK_MEMORY);
- unit_write_drop_in_private(u, mode, name, b ? "MemoryAccounting=yes" : "MemoryAccounting=no");
+ unit_write_drop_in_private(u, mode, name, b ? "MemoryAccounting=yes\n" : "MemoryAccounting=no\n");
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "MemoryLow", "MemoryHigh", "MemoryMax")) {
+ uint64_t v;
+
+ r = sd_bus_message_read(message, "t", &v);
+ if (r < 0)
+ return r;
+
+ if (mode != UNIT_CHECK) {
+ if (streq(name, "MemoryLow"))
+ c->memory_low = v;
+ else if (streq(name, "MemoryHigh"))
+ c->memory_high = v;
+ else
+ c->memory_max = v;
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_MEMORY);
+
+ if (v == CGROUP_LIMIT_MAX)
+ unit_write_drop_in_private_format(u, mode, name, "%s=max\n", name);
+ else
+ unit_write_drop_in_private_format(u, mode, name, "%s=%" PRIu64 "\n", name, v);
}
return 1;
@@ -838,9 +866,9 @@ int bus_cgroup_set_property(
unit_invalidate_cgroup(u, CGROUP_MASK_MEMORY);
if (limit == (uint64_t) -1)
- unit_write_drop_in_private(u, mode, name, "MemoryLimit=infinity");
+ unit_write_drop_in_private(u, mode, name, "MemoryLimit=infinity\n");
else
- unit_write_drop_in_private_format(u, mode, name, "MemoryLimit=%" PRIu64, limit);
+ unit_write_drop_in_private_format(u, mode, name, "MemoryLimit=%" PRIu64 "\n", limit);
}
return 1;
@@ -858,13 +886,9 @@ int bus_cgroup_set_property(
return -EINVAL;
if (mode != UNIT_CHECK) {
- char *buf;
-
c->device_policy = p;
unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES);
-
- buf = strjoina("DevicePolicy=", policy);
- unit_write_drop_in_private(u, mode, name, buf);
+ unit_write_drop_in_private_format(u, mode, name, "DevicePolicy=%s\n", policy);
}
return 1;
@@ -968,7 +992,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->tasks_accounting = b;
unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
- unit_write_drop_in_private(u, mode, name, b ? "TasksAccounting=yes" : "TasksAccounting=no");
+ unit_write_drop_in_private(u, mode, name, b ? "TasksAccounting=yes\n" : "TasksAccounting=no\n");
}
return 1;
@@ -985,9 +1009,9 @@ int bus_cgroup_set_property(
unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
if (limit == (uint64_t) -1)
- unit_write_drop_in_private(u, mode, name, "TasksMax=infinity");
+ unit_write_drop_in_private(u, mode, name, "TasksMax=infinity\n");
else
- unit_write_drop_in_private_format(u, mode, name, "TasksMax=%" PRIu64, limit);
+ unit_write_drop_in_private_format(u, mode, name, "TasksMax=%" PRIu64 "\n", limit);
}
return 1;
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 06943c6365..e21956def1 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -987,7 +987,7 @@ int bus_exec_context_set_transient_property(
}
c->working_directory_missing_ok = missing_ok;
- unit_write_drop_in_private_format(u, mode, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s);
+ unit_write_drop_in_private_format(u, mode, name, "WorkingDirectory=%s%s\n", missing_ok ? "-" : "", s);
}
return 1;
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index 8193418980..00bdc238ce 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -117,6 +117,9 @@ $1.CPUShares, config_parse_cpu_shares, 0,
$1.StartupCPUShares, config_parse_cpu_shares, 0, offsetof($1, cgroup_context.startup_cpu_shares)
$1.CPUQuota, config_parse_cpu_quota, 0, offsetof($1, cgroup_context)
$1.MemoryAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.memory_accounting)
+$1.MemoryLow, config_parse_memory_limit, 0, offsetof($1, cgroup_context)
+$1.MemoryHigh, config_parse_memory_limit, 0, offsetof($1, cgroup_context)
+$1.MemoryMax, config_parse_memory_limit, 0, offsetof($1, cgroup_context)
$1.MemoryLimit, config_parse_memory_limit, 0, offsetof($1, cgroup_context)
$1.DeviceAllow, config_parse_device_allow, 0, offsetof($1, cgroup_context)
$1.DevicePolicy, config_parse_device_policy, 0, offsetof($1, cgroup_context.device_policy)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 86b4fb071b..09d3f65c77 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -2793,21 +2793,26 @@ int config_parse_memory_limit(
void *userdata) {
CGroupContext *c = data;
- uint64_t bytes;
+ uint64_t bytes = CGROUP_LIMIT_MAX;
int r;
- if (isempty(rvalue) || streq(rvalue, "infinity")) {
- c->memory_limit = (uint64_t) -1;
- return 0;
+ if (!isempty(rvalue) && !streq(rvalue, "infinity") && !streq(rvalue, "max")) {
+ r = parse_size(rvalue, 1024, &bytes);
+ if (r < 0 || bytes < 1) {
+ log_syntax(unit, LOG_ERR, filename, line, r, "Memory limit '%s' invalid. Ignoring.", rvalue);
+ return 0;
+ }
}
- r = parse_size(rvalue, 1024, &bytes);
- if (r < 0 || bytes < 1) {
- log_syntax(unit, LOG_ERR, filename, line, r, "Memory limit '%s' invalid. Ignoring.", rvalue);
- return 0;
- }
+ if (streq(lvalue, "MemoryLow"))
+ c->memory_low = bytes;
+ else if (streq(lvalue, "MemoryHigh"))
+ c->memory_high = bytes;
+ else if (streq(lvalue, "MemoryMax"))
+ c->memory_max = bytes;
+ else
+ c->memory_limit = bytes;
- c->memory_limit = bytes;
return 0;
}
diff --git a/src/login/org.freedesktop.login1.policy.in b/src/login/org.freedesktop.login1.policy.in
index 1fa6441629..66cbce393c 100644
--- a/src/login/org.freedesktop.login1.policy.in
+++ b/src/login/org.freedesktop.login1.policy.in
@@ -116,6 +116,8 @@
<_message>Explicit request is required to run programs as a non-logged-in user.</_message>
<defaults>
<allow_any>yes</allow_any>
+ <allow_inactive>yes</allow_inactive>
+ <allow_active>yes</allow_active>
</defaults>
</action>
diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c
index c7382d412d..cc79597c95 100644
--- a/src/nspawn/nspawn-patch-uid.c
+++ b/src/nspawn/nspawn-patch-uid.c
@@ -280,7 +280,13 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift
return r > 0 || changed;
}
-static int is_procfs_sysfs_or_suchlike(int fd) {
+/*
+ * Check if the filesystem is fully compatible with user namespaces or
+ * UID/GID patching. Some filesystems in this list can be fully mounted inside
+ * user namespaces, however their inodes may relate to host resources or only
+ * valid in the global user namespace, therefore no patching should be applied.
+ */
+static int is_fs_fully_userns_compatible(int fd) {
struct statfs sfs;
assert(fd >= 0);
@@ -300,6 +306,9 @@ static int is_procfs_sysfs_or_suchlike(int fd) {
F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) ||
+ F_TYPE_EQUAL(sfs.f_type, SECURITYFS_MAGIC) ||
+ F_TYPE_EQUAL(sfs.f_type, BPF_FS_MAGIC) ||
+ F_TYPE_EQUAL(sfs.f_type, TRACEFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC);
}
@@ -311,8 +320,8 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift
/* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we
* probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's
- * stop the recursion when we hit a procfs or sysfs file system. */
- r = is_procfs_sysfs_or_suchlike(fd);
+ * stop the recursion when we hit procfs, sysfs or some other special file systems. */
+ r = is_fs_fully_userns_compatible(fd);
if (r < 0)
goto finish;
if (r > 0) {
diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c
new file mode 100644
index 0000000000..2d145b68a7
--- /dev/null
+++ b/src/nspawn/nspawn-seccomp.c
@@ -0,0 +1,143 @@
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <errno.h>
+#include <linux/netlink.h>
+#include <sys/capability.h>
+#include <sys/types.h>
+
+#ifdef HAVE_SECCOMP
+#include <seccomp.h>
+#endif
+
+#include "log.h"
+
+#ifdef HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
+
+#include "nspawn-seccomp.h"
+
+#ifdef HAVE_SECCOMP
+
+static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx,
+ uint64_t cap_list_retain) {
+ unsigned i;
+ int r;
+ static const struct {
+ uint64_t capability;
+ int syscall_num;
+ } blacklist[] = {
+ { CAP_SYS_RAWIO, SCMP_SYS(iopl) },
+ { CAP_SYS_RAWIO, SCMP_SYS(ioperm) },
+ { CAP_SYS_BOOT, SCMP_SYS(kexec_load) },
+ { CAP_SYS_ADMIN, SCMP_SYS(swapon) },
+ { CAP_SYS_ADMIN, SCMP_SYS(swapoff) },
+ { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) },
+ { CAP_SYS_MODULE, SCMP_SYS(init_module) },
+ { CAP_SYS_MODULE, SCMP_SYS(finit_module) },
+ { CAP_SYS_MODULE, SCMP_SYS(delete_module) },
+ { CAP_SYSLOG, SCMP_SYS(syslog) },
+ };
+
+ for (i = 0; i < ELEMENTSOF(blacklist); i++) {
+ if (cap_list_retain & (1ULL << blacklist[i].capability))
+ continue;
+
+ r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
+ if (r == -EFAULT)
+ continue; /* unknown syscall */
+ if (r < 0) {
+ log_error_errno(r, "Failed to block syscall: %m");
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+int setup_seccomp(uint64_t cap_list_retain) {
+ scmp_filter_ctx seccomp;
+ int r;
+
+ seccomp = seccomp_init(SCMP_ACT_ALLOW);
+ if (!seccomp)
+ return log_oom();
+
+ r = seccomp_add_secondary_archs(seccomp);
+ if (r < 0) {
+ log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
+ goto finish;
+ }
+
+ r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain);
+ if (r < 0)
+ goto finish;
+
+ /*
+ Audit is broken in containers, much of the userspace audit
+ hookup will fail if running inside a container. We don't
+ care and just turn off creation of audit sockets.
+
+ This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
+ with EAFNOSUPPORT which audit userspace uses as indication
+ that audit is disabled in the kernel.
+ */
+
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 2,
+ SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
+ SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
+ if (r < 0) {
+ log_error_errno(r, "Failed to add audit seccomp rule: %m");
+ goto finish;
+ }
+
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+ if (r < 0) {
+ log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
+ goto finish;
+ }
+
+ r = seccomp_load(seccomp);
+ if (r == -EINVAL) {
+ log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
+ r = 0;
+ goto finish;
+ }
+ if (r < 0) {
+ log_error_errno(r, "Failed to install seccomp audit filter: %m");
+ goto finish;
+ }
+
+finish:
+ seccomp_release(seccomp);
+ return r;
+}
+
+#else
+
+int setup_seccomp(uint64_t cap_list_retain) {
+ return 0;
+}
+
+#endif
diff --git a/src/nspawn/nspawn-seccomp.h b/src/nspawn/nspawn-seccomp.h
new file mode 100644
index 0000000000..5bde16faf9
--- /dev/null
+++ b/src/nspawn/nspawn-seccomp.h
@@ -0,0 +1,24 @@
+#pragma once
+
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sys/types.h>
+
+int setup_seccomp(uint64_t cap_list_retain);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index ac11bcea5a..b421c182ce 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -26,9 +26,6 @@
#include <linux/loop.h>
#include <pwd.h>
#include <sched.h>
-#ifdef HAVE_SECCOMP
-#include <seccomp.h>
-#endif
#ifdef HAVE_SELINUX
#include <selinux/selinux.h>
#endif
@@ -82,15 +79,13 @@
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
+#include "nspawn-seccomp.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "ptyfwd.h"
#include "random-util.h"
#include "rm-rf.h"
-#ifdef HAVE_SECCOMP
-#include "seccomp-util.h"
-#endif
#include "selinux-util.h"
#include "signal-util.h"
#include "socket-util.h"
@@ -136,7 +131,7 @@ static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
-static uint64_t arg_retain =
+static uint64_t arg_caps_retain =
(1ULL << CAP_CHOWN) |
(1ULL << CAP_DAC_OVERRIDE) |
(1ULL << CAP_DAC_READ_SEARCH) |
@@ -1075,7 +1070,7 @@ static int parse_argv(int argc, char *argv[]) {
if (mask_all_settings)
arg_settings_mask = _SETTINGS_MASK_ALL;
- arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+ arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
r = detect_unified_cgroup_hierarchy();
if (r < 0)
@@ -1632,7 +1627,7 @@ static int setup_journal(const char *directory) {
}
static int drop_capabilities(void) {
- return capability_bounding_set_drop(arg_retain, false);
+ return capability_bounding_set_drop(arg_caps_retain, false);
}
static int reset_audit_loginuid(void) {
@@ -1667,99 +1662,6 @@ static int reset_audit_loginuid(void) {
return 0;
}
-static int setup_seccomp(void) {
-
-#ifdef HAVE_SECCOMP
- static const struct {
- uint64_t capability;
- int syscall_num;
- } blacklist[] = {
- { CAP_SYS_RAWIO, SCMP_SYS(iopl) },
- { CAP_SYS_RAWIO, SCMP_SYS(ioperm) },
- { CAP_SYS_BOOT, SCMP_SYS(kexec_load) },
- { CAP_SYS_ADMIN, SCMP_SYS(swapon) },
- { CAP_SYS_ADMIN, SCMP_SYS(swapoff) },
- { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) },
- { CAP_SYS_MODULE, SCMP_SYS(init_module) },
- { CAP_SYS_MODULE, SCMP_SYS(finit_module) },
- { CAP_SYS_MODULE, SCMP_SYS(delete_module) },
- { CAP_SYSLOG, SCMP_SYS(syslog) },
- };
-
- scmp_filter_ctx seccomp;
- unsigned i;
- int r;
-
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return log_oom();
-
- r = seccomp_add_secondary_archs(seccomp);
- if (r < 0) {
- log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
- goto finish;
- }
-
- for (i = 0; i < ELEMENTSOF(blacklist); i++) {
- if (arg_retain & (1ULL << blacklist[i].capability))
- continue;
-
- r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
- if (r == -EFAULT)
- continue; /* unknown syscall */
- if (r < 0) {
- log_error_errno(r, "Failed to block syscall: %m");
- goto finish;
- }
- }
-
- /*
- Audit is broken in containers, much of the userspace audit
- hookup will fail if running inside a container. We don't
- care and just turn off creation of audit sockets.
-
- This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
- with EAFNOSUPPORT which audit userspace uses as indication
- that audit is disabled in the kernel.
- */
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EAFNOSUPPORT),
- SCMP_SYS(socket),
- 2,
- SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
- SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
- if (r < 0) {
- log_error_errno(r, "Failed to add audit seccomp rule: %m");
- goto finish;
- }
-
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0) {
- log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
- goto finish;
- }
-
- r = seccomp_load(seccomp);
- if (r == -EINVAL) {
- log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
- r = 0;
- goto finish;
- }
- if (r < 0) {
- log_error_errno(r, "Failed to install seccomp audit filter: %m");
- goto finish;
- }
-
-finish:
- seccomp_release(seccomp);
- return r;
-#else
- return 0;
-#endif
-
-}
static int setup_propagate(const char *root) {
const char *p, *q;
@@ -2988,7 +2890,7 @@ static int outer_child(
if (r < 0)
return r;
- r = setup_seccomp();
+ r = setup_seccomp(arg_caps_retain);
if (r < 0)
return r;
@@ -3272,9 +3174,9 @@ static int load_settings(void) {
if (settings->capability != 0)
log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
} else
- arg_retain |= plus;
+ arg_caps_retain |= plus;
- arg_retain &= ~settings->drop_capability;
+ arg_caps_retain &= ~settings->drop_capability;
}
if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index f68c4a41ac..502e98d9dc 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -166,11 +166,11 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
r = sd_bus_message_append(m, "v", "b", r);
- } else if (streq(field, "MemoryLimit")) {
+ } else if (STR_IN_SET(field, "MemoryLow", "MemoryHigh", "MemoryMax", "MemoryLimit")) {
uint64_t bytes;
- if (isempty(eq) || streq(eq, "infinity"))
- bytes = (uint64_t) -1;
+ if (isempty(eq) || streq(eq, "max") || streq(eq, "infinity"))
+ bytes = CGROUP_LIMIT_MAX;
else {
r = parse_size(eq, 1024, &bytes);
if (r < 0) {
diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c
index 86feefcb65..c301c6a64f 100644
--- a/src/systemctl/systemctl.c
+++ b/src/systemctl/systemctl.c
@@ -3499,6 +3499,9 @@ typedef struct UnitStatusInfo {
/* CGroup */
uint64_t memory_current;
+ uint64_t memory_low;
+ uint64_t memory_high;
+ uint64_t memory_max;
uint64_t memory_limit;
uint64_t cpu_usage_nsec;
uint64_t tasks_current;
@@ -3781,10 +3784,30 @@ static void print_status_info(
printf(" Memory: %s", format_bytes(buf, sizeof(buf), i->memory_current));
- if (i->memory_limit != (uint64_t) -1)
- printf(" (limit: %s)\n", format_bytes(buf, sizeof(buf), i->memory_limit));
- else
- printf("\n");
+ if (i->memory_low > 0 || i->memory_high != CGROUP_LIMIT_MAX || i->memory_max != CGROUP_LIMIT_MAX ||
+ i->memory_limit != CGROUP_LIMIT_MAX) {
+ const char *prefix = "";
+
+ printf(" (");
+ if (i->memory_low > 0) {
+ printf("%slow: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_low));
+ prefix = " ";
+ }
+ if (i->memory_high != CGROUP_LIMIT_MAX) {
+ printf("%shigh: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_high));
+ prefix = " ";
+ }
+ if (i->memory_max != CGROUP_LIMIT_MAX) {
+ printf("%smax: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_max));
+ prefix = " ";
+ }
+ if (i->memory_limit != CGROUP_LIMIT_MAX) {
+ printf("%slimit: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_limit));
+ prefix = " ";
+ }
+ printf(")");
+ }
+ printf("\n");
}
if (i->cpu_usage_nsec != (uint64_t) -1) {
@@ -4013,6 +4036,12 @@ static int status_property(const char *name, sd_bus_message *m, UnitStatusInfo *
i->assert_timestamp = (usec_t) u;
else if (streq(name, "MemoryCurrent"))
i->memory_current = u;
+ else if (streq(name, "MemoryLow"))
+ i->memory_low = u;
+ else if (streq(name, "MemoryHigh"))
+ i->memory_high = u;
+ else if (streq(name, "MemoryMax"))
+ i->memory_max = u;
else if (streq(name, "MemoryLimit"))
i->memory_limit = u;
else if (streq(name, "TasksCurrent"))
@@ -4506,6 +4535,8 @@ static int show_one(
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
UnitStatusInfo info = {
.memory_current = (uint64_t) -1,
+ .memory_high = CGROUP_LIMIT_MAX,
+ .memory_max = CGROUP_LIMIT_MAX,
.memory_limit = (uint64_t) -1,
.cpu_usage_nsec = (uint64_t) -1,
.tasks_current = (uint64_t) -1,