summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.am11
-rw-r--r--NEWS99
-rw-r--r--src/core/execute.c176
-rw-r--r--src/core/load-fragment.c54
-rw-r--r--src/nspawn/nspawn-seccomp.c18
-rw-r--r--src/shared/condition.c9
-rw-r--r--src/shared/seccomp-util.c185
-rw-r--r--src/shared/seccomp-util.h37
-rw-r--r--src/test/test-seccomp.c103
-rw-r--r--src/update-done/update-done.c15
10 files changed, 439 insertions, 268 deletions
diff --git a/Makefile.am b/Makefile.am
index bad6a2d18a..a707d4a899 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1558,6 +1558,11 @@ tests += \
test-acl-util
endif
+if HAVE_SECCOMP
+tests += \
+ test-seccomp
+endif
+
EXTRA_DIST += \
test/a.service \
test/basic.target \
@@ -2026,6 +2031,12 @@ test_acl_util_SOURCES = \
test_acl_util_LDADD = \
libsystemd-shared.la
+test_seccomp_SOURCES = \
+ src/test/test-seccomp.c
+
+test_seccomp_LDADD = \
+ libsystemd-shared.la
+
test_namespace_LDADD = \
libcore.la
diff --git a/NEWS b/NEWS
index 87cc4f48c0..2a1edbe766 100644
--- a/NEWS
+++ b/NEWS
@@ -35,14 +35,14 @@ CHANGES WITH 232 in spe
ProtectSystem=strict enabled, so they are not able to make any
permanent modifications to the system.
- The nss-systemd module also always resolves root and nobody, making
+ * The nss-systemd module also always resolves root and nobody, making
it possible to have no /etc/passwd or /etc/group files in minimal
- container systems.
+ container or chroot environments.
* Services may be started with their own user namespace using the new
- PrivateUsers= option. Only root, nobody, and the uid/gid under which
- the service is running are mapped. All other users are mapped to
- nobody.
+ boolean PrivateUsers= option. Only root, nobody, and the uid/gid
+ under which the service is running are mapped. All other users are
+ mapped to nobody.
* Support for the cgroup namespace has been added to systemd-nspawn. If
supported by kernel, the container system started by systemd-nspawn
@@ -57,12 +57,22 @@ CHANGES WITH 232 in spe
options. This controller requires out-of-tree patches for the kernel
and the support is provisional.
- * .automount units may now be transient.
-
- * systemd-mount is a new tool which wraps mount(8) to pull in
- additional dependencies through transient .mount and .automount
- units. For example, this automatically runs fsck on the block device
- before mounting, and allows the automount logic to be used.
+ * Mount and automount units may now be created transiently
+ (i.e. dynamically at runtime via the bus API, instead of requiring
+ unit files in the file system).
+
+ * systemd-mount is a new tool which may mount file systems – much like
+ mount(8), optionally pulling in additional dependencies through
+ transient .mount and .automount units. For example, this tool
+ automatically runs fsck on a backing block device before mounting,
+ and allows the automount logic to be used dynamically from the
+ command line for establishing mount points. This tool is particularly
+ useful when dealing with removable media, as it will ensure fsck is
+ run – if necessary – before the first access and that the file system
+ is quickly unmounted after each access by utilizing the automount
+ logic. This maximizes the chance that the file system on the
+ removable media stays in a clean state, and if it isn't in a clean
+ state is fixed automatically.
* LazyUnmount=yes option for mount units has been added to expose the
umount --lazy option. Similarly, ForceUnmount=yes exposes the --force
@@ -75,6 +85,12 @@ CHANGES WITH 232 in spe
mount the EFI partition on systems where /boot is used for something
else.
+ * When operating on GPT disk images for containers, systemd-nspawn will
+ now mount the ESP to /boot or /efi according to the same rules as PID
+ 1 running on a host. This allows tools like "bootctl" to operate
+ correctly within such containers, in order to make container images
+ bootable on physical systems.
+
* disk/by-id and disk/by-path symlinks are now created for NVMe drives.
* Two new user session targets have been added to support running
@@ -95,7 +111,7 @@ CHANGES WITH 232 in spe
the top of the process hierarchy (which is usually the init process
of the container).
- * systemd-journal-gatewayd learned the --directory option to serve
+ * systemd-journal-gatewayd learned the --directory= option to serve
files from the specified location.
* journalctl --root=… can be used to peruse the journal in the
@@ -112,23 +128,26 @@ CHANGES WITH 232 in spe
a click rate that is different than the one for the vertical wheel.
* systemd-run gained a new --wait option that makes service execution
- synchronous.
+ synchronous. (Specifically, the command will not return until the
+ specified service binary exited.)
- systemctl gained a new --wait option that causes the start command to
+ * systemctl gained a new --wait option that causes the start command to
wait until the units being started have terminated again.
- * A new journal output mode "short-full" has been added which uses
+ * A new journal output mode "short-full" has been added which displays
timestamps with abbreviated English day names and adds a timezone
- suffix. Those timestamps include more information and can be parsed
- by journalctl.
+ suffix. Those timestamps include more information than the default
+ "short" output mode, and can be passed directly to journalctl's
+ --since= and --until= options.
* /etc/resolv.conf will be bind-mounted into containers started by
systemd-nspawn, if possible, so any changes to resolv.conf contents
are automatically propagated to the container.
* The number of instances for socket-activated services originating
- from a single IP can be limited with MaxConnectionsPerSource=,
- extending the existing setting of MaxConnections.
+ from a single IP address can be limited with
+ MaxConnectionsPerSource=, extending the existing setting of
+ MaxConnections=.
* systemd-networkd gained support for vcan ("Virtual CAN") interface
configuration.
@@ -143,21 +162,23 @@ CHANGES WITH 232 in spe
GenericReceiveOffload=, LargeReceiveOffload= options in the
[Link] section of .link files.
- Spanning Tree Protocol enablement, Priority, Aging Time, and the
- Default Port VLAN ID can be configured for bridge devices using the
- new STP=, Priority=, AgeingTimeSec=, and DefaultPVID= settings in the
- [Bridge] section of .netdev files.
+ * The Spanning Tree Protocol, Priority, Aging Time, and the Default
+ Port VLAN ID can be configured for bridge devices using the new STP=,
+ Priority=, AgeingTimeSec=, and DefaultPVID= settings in the [Bridge]
+ section of .netdev files.
- The route table to which routes received over DHCP or RA should be
+ * The route table to which routes received over DHCP or RA should be
added can be configured with the new RouteTable= option in the [DHCP]
and [IPv6AcceptRA] sections of .network files.
- Address Resolution Protocol can be disabled on links managed by
+ * The Address Resolution Protocol can be disabled on links managed by
systemd-networkd using the ARP=no setting in the [Link] section of
.network files.
- * $SERVICE_RESULT, $EXIT_CODE, $EXIT_STATUS are set for ExecStop= and
- ExecStopPost= commands.
+ * New environment variables $SERVICE_RESULT, $EXIT_CODE and
+ $EXIT_STATUS are set for ExecStop= and ExecStopPost= commands, and
+ encode information about the result and exit codes of the current
+ service runtime cycle.
* systemd-sysctl will now configure kernel parameters in the order
they occur in the configuration files. This matches what sysctl
@@ -184,6 +205,30 @@ CHANGES WITH 232 in spe
$SYSTEMD_NSPAWN_SHARE_NS_UTS may be used to control the unsharing of
individual namespaces.
+ * "machinectl list" now shows the IP address of running containers in
+ the output, as well as OS release information.
+
+ * "loginctl list" now shows the TTY of each session in the output.
+
+ * sd-bus gained new API calls sd_bus_track_set_recursive(),
+ sd_bus_track_get_recursive(), sd_bus_track_count_name(),
+ sd_bus_track_count_sender(). They permit usage of sd_bus_track peer
+ tracking objects in a "recursive" mode, where a single client can be
+ counted multiple times, if it takes multiple references.
+
+ * sd-bus gained new API calls sd_bus_set_exit_on_disconnect() and
+ sd_bus_get_exit_on_disconnect(). They may be used to to make a
+ process using sd-bus automatically exit if the bus connection is
+ severed.
+
+ * Bus clients of the service manager may now "pin" loaded units into
+ memory, by taking an explicit reference on them. This is useful to
+ ensure the client can retrieve runtime data about the service even
+ after the service completed execution. Taking such a reference is
+ available only for privileged clients and should be helpful to watch
+ running services in a race-free manner, and in particular collect
+ information about exit statuses and results.
+
CHANGES WITH 231:
* In service units the various ExecXYZ= settings have been extended
diff --git a/src/core/execute.c b/src/core/execute.c
index 53356c3c06..5e7d7c25d7 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1185,18 +1185,19 @@ static void rename_process_from_path(const char *path) {
#ifdef HAVE_SECCOMP
static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
- if (!is_seccomp_available()) {
- log_open();
- log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
- log_close();
- return true;
- }
- return false;
+
+ if (is_seccomp_available())
+ return false;
+
+ log_open();
+ log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
+ log_close();
+ return true;
}
static int apply_seccomp(const Unit* u, const ExecContext *c) {
uint32_t negative_action, action;
- scmp_filter_ctx *seccomp;
+ scmp_filter_ctx seccomp;
Iterator i;
void *id;
int r;
@@ -1247,7 +1248,7 @@ finish:
}
static int apply_address_families(const Unit* u, const ExecContext *c) {
- scmp_filter_ctx *seccomp;
+ scmp_filter_ctx seccomp;
Iterator i;
int r;
@@ -1256,13 +1257,9 @@ static int apply_address_families(const Unit* u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
return 0;
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return -ENOMEM;
-
- r = seccomp_add_secondary_archs(seccomp);
+ r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
if (r < 0)
- goto finish;
+ return r;
if (c->address_families_whitelist) {
int af, first = 0, last = 0;
@@ -1359,10 +1356,6 @@ static int apply_address_families(const Unit* u, const ExecContext *c) {
}
}
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0)
- goto finish;
-
r = seccomp_load(seccomp);
finish:
@@ -1371,7 +1364,7 @@ finish:
}
static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
- scmp_filter_ctx *seccomp;
+ scmp_filter_ctx seccomp;
int r;
assert(c);
@@ -1379,13 +1372,9 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c)
if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
return 0;
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return -ENOMEM;
-
- r = seccomp_add_secondary_archs(seccomp);
+ r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
if (r < 0)
- goto finish;
+ return r;
r = seccomp_rule_add(
seccomp,
@@ -1405,10 +1394,6 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c)
if (r < 0)
goto finish;
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0)
- goto finish;
-
r = seccomp_load(seccomp);
finish:
@@ -1423,7 +1408,7 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
SCHED_IDLE,
};
- scmp_filter_ctx *seccomp;
+ scmp_filter_ctx seccomp;
unsigned i;
int r, p, max_policy = 0;
@@ -1432,13 +1417,9 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "RestrictRealtime="))
return 0;
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return -ENOMEM;
-
- r = seccomp_add_secondary_archs(seccomp);
+ r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
if (r < 0)
- goto finish;
+ return r;
/* Determine the highest policy constant we want to allow */
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
@@ -1482,10 +1463,6 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
if (r < 0)
goto finish;
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0)
- goto finish;
-
r = seccomp_load(seccomp);
finish:
@@ -1494,7 +1471,7 @@ finish:
}
static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
- scmp_filter_ctx *seccomp;
+ scmp_filter_ctx seccomp;
int r;
assert(c);
@@ -1505,13 +1482,9 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
return 0;
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return -ENOMEM;
-
- r = seccomp_add_secondary_archs(seccomp);
+ r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
if (r < 0)
- goto finish;
+ return r;
r = seccomp_rule_add(
seccomp,
@@ -1521,10 +1494,6 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
if (r < 0)
goto finish;
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0)
- goto finish;
-
r = seccomp_load(seccomp);
finish:
@@ -1533,56 +1502,17 @@ finish:
}
static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) {
- static const int module_syscalls[] = {
- SCMP_SYS(delete_module),
- SCMP_SYS(finit_module),
- SCMP_SYS(init_module),
- };
-
- scmp_filter_ctx *seccomp;
- unsigned i;
- int r;
-
assert(c);
- /* Turn of module syscalls on ProtectKernelModules=yes */
+ /* Turn off module syscalls on ProtectKernelModules=yes */
if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
return 0;
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return -ENOMEM;
-
- r = seccomp_add_secondary_archs(seccomp);
- if (r < 0)
- goto finish;
-
- for (i = 0; i < ELEMENTSOF(module_syscalls); i++) {
- r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM),
- module_syscalls[i], 0);
- if (r < 0)
- goto finish;
- }
-
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0)
- goto finish;
-
- r = seccomp_load(seccomp);
-
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
}
static int apply_private_devices(Unit *u, const ExecContext *c) {
- const SystemCallFilterSet *set;
- scmp_filter_ctx *seccomp;
- const char *sys;
- bool syscalls_found = false;
- int r;
-
assert(c);
/* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
@@ -1590,61 +1520,7 @@ static int apply_private_devices(Unit *u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "PrivateDevices="))
return 0;
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return -ENOMEM;
-
- r = seccomp_add_secondary_archs(seccomp);
- if (r < 0)
- goto finish;
-
- for (set = syscall_filter_sets; set->set_name; set++)
- if (streq(set->set_name, "@raw-io")) {
- syscalls_found = true;
- break;
- }
-
- /* We should never fail here */
- if (!syscalls_found) {
- r = -EOPNOTSUPP;
- goto finish;
- }
-
- NULSTR_FOREACH(sys, set->value) {
- int id;
- bool add = true;
-
-#ifndef __NR_s390_pci_mmio_read
- if (streq(sys, "s390_pci_mmio_read"))
- add = false;
-#endif
-#ifndef __NR_s390_pci_mmio_write
- if (streq(sys, "s390_pci_mmio_write"))
- add = false;
-#endif
-
- if (!add)
- continue;
-
- id = seccomp_syscall_resolve_name(sys);
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- id, 0);
- if (r < 0)
- goto finish;
- }
-
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0)
- goto finish;
-
- r = seccomp_load(seccomp);
-
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
}
#endif
@@ -1890,9 +1766,9 @@ static int setup_private_users(uid_t uid, gid_t gid) {
asprintf(&uid_map,
"0 0 1\n" /* Map root → root */
UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
- uid, uid); /* The case where the above is the same */
+ uid, uid);
else
- uid_map = strdup("0 0 1\n");
+ uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
if (!uid_map)
return -ENOMEM;
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 6f68e23340..118b39c1cf 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -2618,6 +2618,7 @@ int config_parse_documentation(const char *unit,
}
#ifdef HAVE_SECCOMP
+
static int syscall_filter_parse_one(
const char *unit,
const char *filename,
@@ -2628,27 +2629,29 @@ static int syscall_filter_parse_one(
bool warn) {
int r;
- if (*t == '@') {
- const SystemCallFilterSet *set;
+ if (t[0] == '@') {
+ const SyscallFilterSet *set;
+ const char *i;
- for (set = syscall_filter_sets; set->set_name; set++)
- if (streq(set->set_name, t)) {
- const char *sys;
+ set = syscall_filter_set_find(t);
+ if (!set) {
+ if (warn)
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Don't know system call group, ignoring: %s", t);
+ return 0;
+ }
- NULSTR_FOREACH(sys, set->value) {
- r = syscall_filter_parse_one(unit, filename, line, c, invert, sys, false);
- if (r < 0)
- return r;
- }
- break;
- }
+ NULSTR_FOREACH(i, set->value) {
+ r = syscall_filter_parse_one(unit, filename, line, c, invert, i, false);
+ if (r < 0)
+ return r;
+ }
} else {
int id;
id = seccomp_syscall_resolve_name(t);
if (id == __NR_SCMP_ERROR) {
if (warn)
- log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse system call, ignoring: %s", t);
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse system call, ignoring: %s", t);
return 0;
}
@@ -2662,8 +2665,9 @@ static int syscall_filter_parse_one(
if (r < 0)
return log_oom();
} else
- set_remove(c->syscall_filter, INT_TO_PTR(id + 1));
+ (void) set_remove(c->syscall_filter, INT_TO_PTR(id + 1));
}
+
return 0;
}
@@ -2682,8 +2686,7 @@ int config_parse_syscall_filter(
ExecContext *c = data;
Unit *u = userdata;
bool invert = false;
- const char *word, *state;
- size_t l;
+ const char *p;
int r;
assert(filename);
@@ -2722,19 +2725,24 @@ int config_parse_syscall_filter(
}
}
- FOREACH_WORD_QUOTED(word, l, rvalue, state) {
- _cleanup_free_ char *t = NULL;
+ p = rvalue;
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
- t = strndup(word, l);
- if (!t)
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ break;
+ }
- r = syscall_filter_parse_one(unit, filename, line, c, invert, t, true);
+ r = syscall_filter_parse_one(unit, filename, line, c, invert, word, true);
if (r < 0)
return r;
}
- if (!isempty(state))
- log_syntax(unit, LOG_ERR, filename, line, 0, "Trailing garbage, ignoring.");
/* Turn on NNP, but only if it wasn't configured explicitly
* before, and only if we are in user mode. */
diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c
index 44a0b397ab..03a397d30c 100644
--- a/src/nspawn/nspawn-seccomp.c
+++ b/src/nspawn/nspawn-seccomp.c
@@ -135,15 +135,9 @@ int setup_seccomp(uint64_t cap_list_retain) {
return 0;
}
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return log_oom();
-
- r = seccomp_add_secondary_archs(seccomp);
- if (r < 0) {
- log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
- goto finish;
- }
+ r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate seccomp object: %m");
r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain);
if (r < 0)
@@ -171,12 +165,6 @@ int setup_seccomp(uint64_t cap_list_retain) {
goto finish;
}
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0) {
- log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
- goto finish;
- }
-
r = seccomp_load(seccomp);
if (r < 0) {
log_error_errno(r, "Failed to install seccomp audit filter: %m");
diff --git a/src/shared/condition.c b/src/shared/condition.c
index f13fa6a9fd..69b4837e1f 100644
--- a/src/shared/condition.c
+++ b/src/shared/condition.c
@@ -329,9 +329,9 @@ static int condition_test_needs_update(Condition *c) {
uint64_t timestamp;
int r;
- r = parse_env_file(p, NULL, "TimestampNSec", &timestamp_str, NULL);
+ r = parse_env_file(p, NULL, "TIMESTAMP_NSEC", &timestamp_str, NULL);
if (r < 0) {
- log_error_errno(-r, "Failed to parse timestamp file '%s', using mtime: %m", p);
+ log_error_errno(r, "Failed to parse timestamp file '%s', using mtime: %m", p);
return true;
} else if (r == 0) {
log_debug("No data in timestamp file '%s', using mtime", p);
@@ -340,12 +340,11 @@ static int condition_test_needs_update(Condition *c) {
r = safe_atou64(timestamp_str, &timestamp);
if (r < 0) {
- log_error_errno(-r, "Failed to parse timestamp value '%s' in file '%s', using mtime: %m",
- timestamp_str, p);
+ log_error_errno(r, "Failed to parse timestamp value '%s' in file '%s', using mtime: %m", timestamp_str, p);
return true;
}
- other.st_mtim.tv_nsec = timestamp % NSEC_PER_SEC;
+ timespec_store(&other.st_mtim, timestamp);
}
return usr.st_mtim.tv_nsec > other.st_mtim.tv_nsec;
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
index 8116c7671f..6252cd16a6 100644
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@@ -26,6 +26,7 @@
#include "macro.h"
#include "seccomp-util.h"
#include "string-util.h"
+#include "util.h"
const char* seccomp_arch_to_string(uint32_t c) {
@@ -73,7 +74,34 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) {
return 0;
}
-int seccomp_add_secondary_archs(scmp_filter_ctx *c) {
+int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) {
+ scmp_filter_ctx seccomp;
+ int r;
+
+ /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are
+ * added by default, and NNP is turned off. */
+
+ seccomp = seccomp_init(default_action);
+ if (!seccomp)
+ return -ENOMEM;
+
+ r = seccomp_add_secondary_archs(seccomp);
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+ if (r < 0)
+ goto finish;
+
+ *ret = seccomp;
+ return 0;
+
+finish:
+ seccomp_release(seccomp);
+ return r;
+}
+
+int seccomp_add_secondary_archs(scmp_filter_ctx c) {
#if defined(__i386__) || defined(__x86_64__)
int r;
@@ -110,7 +138,6 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) {
#endif
return 0;
-
}
static bool is_basic_seccomp_available(void) {
@@ -132,28 +159,30 @@ bool is_seccomp_available(void) {
return cached_enabled;
}
-const SystemCallFilterSet syscall_filter_sets[] = {
- {
+const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
+ [SYSCALL_FILTER_SET_CLOCK] = {
/* Clock */
- .set_name = "@clock",
+ .name = "@clock",
.value =
"adjtimex\0"
"clock_adjtime\0"
"clock_settime\0"
"settimeofday\0"
"stime\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_CPU_EMULATION] = {
/* CPU emulation calls */
- .set_name = "@cpu-emulation",
+ .name = "@cpu-emulation",
.value =
"modify_ldt\0"
"subpage_prot\0"
"switch_endian\0"
"vm86\0"
"vm86old\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_DEBUG] = {
/* Debugging/Performance Monitoring/Tracing */
- .set_name = "@debug",
+ .name = "@debug",
.value =
"lookup_dcookie\0"
"perf_event_open\0"
@@ -161,11 +190,14 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"process_vm_writev\0"
"ptrace\0"
"rtas\0"
+#ifdef __NR_s390_runtime_instr
"s390_runtime_instr\0"
+#endif
"sys_debug_setcontext\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_DEFAULT] = {
/* Default list */
- .set_name = "@default",
+ .name = "@default",
.value =
"execve\0"
"exit\0"
@@ -173,9 +205,10 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"getrlimit\0" /* make sure processes can query stack size and such */
"rt_sigreturn\0"
"sigreturn\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_IO_EVENT] = {
/* Event loop use */
- .set_name = "@io-event",
+ .name = "@io-event",
.value =
"_newselect\0"
"epoll_create1\0"
@@ -191,9 +224,10 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"ppoll\0"
"pselect6\0"
"select\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_IPC] = {
/* Message queues, SYSV IPC or other IPC: unusual */
- .set_name = "@ipc",
+ .name = "@ipc",
.value = "ipc\0"
"mq_getsetattr\0"
"mq_notify\0"
@@ -215,33 +249,36 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"shmctl\0"
"shmdt\0"
"shmget\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_KEYRING] = {
/* Keyring */
- .set_name = "@keyring",
+ .name = "@keyring",
.value =
"add_key\0"
"keyctl\0"
"request_key\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_MODULE] = {
/* Kernel module control */
- .set_name = "@module",
+ .name = "@module",
.value =
"delete_module\0"
"finit_module\0"
"init_module\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_MOUNT] = {
/* Mounting */
- .set_name = "@mount",
+ .name = "@mount",
.value =
"chroot\0"
"mount\0"
- "oldumount\0"
"pivot_root\0"
"umount2\0"
"umount\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_NETWORK_IO] = {
/* Network or Unix socket IO, should not be needed if not network facing */
- .set_name = "@network-io",
+ .name = "@network-io",
.value =
"accept4\0"
"accept\0"
@@ -264,9 +301,10 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"socket\0"
"socketcall\0"
"socketpair\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_OBSOLETE] = {
/* Unusual, obsolete or unimplemented, some unknown even to libseccomp */
- .set_name = "@obsolete",
+ .name = "@obsolete",
.value =
"_sysctl\0"
"afs_syscall\0"
@@ -292,9 +330,10 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"uselib\0"
"ustat\0"
"vserver\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_PRIVILEGED] = {
/* Nice grab-bag of all system calls which need superuser capabilities */
- .set_name = "@privileged",
+ .name = "@privileged",
.value =
"@clock\0"
"@module\0"
@@ -331,11 +370,12 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"setuid\0"
"swapoff\0"
"swapon\0"
- "sysctl\0"
+ "_sysctl\0"
"vhangup\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_PROCESS] = {
/* Process control, execution, namespaces */
- .set_name = "@process",
+ .name = "@process",
.value =
"arch_prctl\0"
"clone\0"
@@ -349,19 +389,90 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"tkill\0"
"unshare\0"
"vfork\0"
- }, {
+ },
+ [SYSCALL_FILTER_SET_RAW_IO] = {
/* Raw I/O ports */
- .set_name = "@raw-io",
+ .name = "@raw-io",
.value =
"ioperm\0"
"iopl\0"
"pciconfig_iobase\0"
"pciconfig_read\0"
"pciconfig_write\0"
+#ifdef __NR_s390_pci_mmio_read
"s390_pci_mmio_read\0"
+#endif
+#ifdef __NR_s390_pci_mmio_write
"s390_pci_mmio_write\0"
- }, {
- .set_name = NULL,
- .value = NULL
- }
+#endif
+ },
};
+
+const SyscallFilterSet *syscall_filter_set_find(const char *name) {
+ unsigned i;
+
+ if (isempty(name) || name[0] != '@')
+ return NULL;
+
+ for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
+ if (streq(syscall_filter_sets[i].name, name))
+ return syscall_filter_sets + i;
+
+ return NULL;
+}
+
+int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) {
+ const char *sys;
+ int r;
+
+ assert(seccomp);
+ assert(set);
+
+ NULSTR_FOREACH(sys, set->value) {
+ int id;
+
+ if (sys[0] == '@') {
+ const SyscallFilterSet *other;
+
+ other = syscall_filter_set_find(sys);
+ if (!other)
+ return -EINVAL;
+
+ r = seccomp_add_syscall_filter_set(seccomp, other, action);
+ } else {
+ id = seccomp_syscall_resolve_name(sys);
+ if (id == __NR_SCMP_ERROR)
+ return -EINVAL;
+
+ r = seccomp_rule_add(seccomp, action, id, 0);
+ }
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
+ scmp_filter_ctx seccomp;
+ int r;
+
+ assert(set);
+
+ /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */
+
+ r = seccomp_init_conservative(&seccomp, default_action);
+ if (r < 0)
+ return r;
+
+ r = seccomp_add_syscall_filter_set(seccomp, set, action);
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_load(seccomp);
+
+finish:
+ seccomp_release(seccomp);
+ return r;
+
+}
diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h
index cca7c17912..8050fc6fbf 100644
--- a/src/shared/seccomp-util.h
+++ b/src/shared/seccomp-util.h
@@ -20,18 +20,45 @@
***/
#include <seccomp.h>
+#include <stdbool.h>
#include <stdint.h>
const char* seccomp_arch_to_string(uint32_t c);
int seccomp_arch_from_string(const char *n, uint32_t *ret);
-int seccomp_add_secondary_archs(scmp_filter_ctx *c);
+int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action);
+
+int seccomp_add_secondary_archs(scmp_filter_ctx c);
bool is_seccomp_available(void);
-typedef struct SystemCallFilterSet {
- const char *set_name;
+typedef struct SyscallFilterSet {
+ const char *name;
const char *value;
-} SystemCallFilterSet;
+} SyscallFilterSet;
+
+enum {
+ SYSCALL_FILTER_SET_CLOCK,
+ SYSCALL_FILTER_SET_CPU_EMULATION,
+ SYSCALL_FILTER_SET_DEBUG,
+ SYSCALL_FILTER_SET_DEFAULT,
+ SYSCALL_FILTER_SET_IO_EVENT,
+ SYSCALL_FILTER_SET_IPC,
+ SYSCALL_FILTER_SET_KEYRING,
+ SYSCALL_FILTER_SET_MODULE,
+ SYSCALL_FILTER_SET_MOUNT,
+ SYSCALL_FILTER_SET_NETWORK_IO,
+ SYSCALL_FILTER_SET_OBSOLETE,
+ SYSCALL_FILTER_SET_PRIVILEGED,
+ SYSCALL_FILTER_SET_PROCESS,
+ SYSCALL_FILTER_SET_RAW_IO,
+ _SYSCALL_FILTER_SET_MAX
+};
+
+extern const SyscallFilterSet syscall_filter_sets[];
+
+const SyscallFilterSet *syscall_filter_set_find(const char *name);
+
+int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
-extern const SystemCallFilterSet syscall_filter_sets[];
+int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c
new file mode 100644
index 0000000000..0060ecdf02
--- /dev/null
+++ b/src/test/test-seccomp.c
@@ -0,0 +1,103 @@
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <stdlib.h>
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+#include "fd-util.h"
+#include "macro.h"
+#include "process-util.h"
+#include "seccomp-util.h"
+
+static void test_seccomp_arch_to_string(void) {
+ uint32_t a, b;
+ const char *name;
+
+ a = seccomp_arch_native();
+ assert_se(a > 0);
+ name = seccomp_arch_to_string(a);
+ assert_se(name);
+ assert_se(seccomp_arch_from_string(name, &b) >= 0);
+ assert_se(a == b);
+}
+
+static void test_syscall_filter_set_find(void) {
+ assert_se(!syscall_filter_set_find(NULL));
+ assert_se(!syscall_filter_set_find(""));
+ assert_se(!syscall_filter_set_find("quux"));
+ assert_se(!syscall_filter_set_find("@quux"));
+
+ assert_se(syscall_filter_set_find("@clock") == syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK);
+ assert_se(syscall_filter_set_find("@default") == syscall_filter_sets + SYSCALL_FILTER_SET_DEFAULT);
+ assert_se(syscall_filter_set_find("@raw-io") == syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO);
+}
+
+static void test_filter_sets(void) {
+ unsigned i;
+ int r;
+
+ if (!is_seccomp_available())
+ return;
+
+ if (geteuid() != 0)
+ return;
+
+ for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) {
+ pid_t pid;
+
+ log_info("Testing %s", syscall_filter_sets[i].name);
+
+ pid = fork();
+ assert_se(pid >= 0);
+
+ if (pid == 0) { /* Child? */
+ int fd;
+
+ if (i == SYSCALL_FILTER_SET_DEFAULT) /* if we look at the default set, whitelist instead of blacklist */
+ r = seccomp_load_filter_set(SCMP_ACT_ERRNO(EPERM), syscall_filter_sets + i, SCMP_ACT_ALLOW);
+ else
+ r = seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + i, SCMP_ACT_ERRNO(EPERM));
+ if (r < 0)
+ _exit(EXIT_FAILURE);
+
+ /* Test the sycall filter with one random system call */
+ fd = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC);
+ if (IN_SET(i, SYSCALL_FILTER_SET_IO_EVENT, SYSCALL_FILTER_SET_DEFAULT))
+ assert_se(fd < 0 && errno == EPERM);
+ else {
+ assert_se(fd >= 0);
+ safe_close(fd);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ assert_se(wait_for_terminate_and_warn(syscall_filter_sets[i].name, pid, true) == EXIT_SUCCESS);
+ }
+}
+
+int main(int argc, char *argv[]) {
+
+ test_seccomp_arch_to_string();
+ test_syscall_filter_set_find();
+ test_filter_sets();
+
+ return 0;
+}
diff --git a/src/update-done/update-done.c b/src/update-done/update-done.c
index 5cc5abfddf..48c2a3fff4 100644
--- a/src/update-done/update-done.c
+++ b/src/update-done/update-done.c
@@ -18,6 +18,7 @@
***/
#include "fd-util.h"
+#include "fileio.h"
#include "io-util.h"
#include "selinux-util.h"
#include "util.h"
@@ -32,8 +33,8 @@ static int apply_timestamp(const char *path, struct timespec *ts) {
*ts,
*ts
};
- int fd = -1;
_cleanup_fclose_ FILE *f = NULL;
+ int fd = -1;
int r;
assert(path);
@@ -59,18 +60,20 @@ static int apply_timestamp(const char *path, struct timespec *ts) {
return log_error_errno(errno, "Failed to create/open timestamp file %s: %m", path);
}
- f = fdopen(fd, "w");
+ f = fdopen(fd, "we");
if (!f) {
safe_close(fd);
return log_error_errno(errno, "Failed to fdopen() timestamp file %s: %m", path);
}
(void) fprintf(f,
- "%s"
- "TimestampNSec=" NSEC_FMT "\n",
- MESSAGE, timespec_load_nsec(ts));
+ MESSAGE
+ "TIMESTAMP_NSEC=" NSEC_FMT "\n",
+ timespec_load_nsec(ts));
- fflush(f);
+ r = fflush_and_check(f);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write timestamp file: %m");
if (futimens(fd, twice) < 0)
return log_error_errno(errno, "Failed to update timestamp on %s: %m", path);