/***
  This file is part of systemd.
  Copyright 2014 Lennart Poettering
  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.
  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.
  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include 
#include 
#include 
#include 
#include 
#include "alloc-util.h"
#include "macro.h"
#include "nsflags.h"
#include "seccomp-util.h"
#include "string-util.h"
#include "util.h"
const char* seccomp_arch_to_string(uint32_t c) {
        /* Translates a libseccomp architecture token into its textual name. Returns NULL if the token is not
         * one of the architectures listed below.
         *
         * Keep the entries in the order used in <seccomp.h>.
         *
         * The names should be the same as those used for ConditionArchitecture=, except for
         * "subarchitectures" like x32. */
        static const struct {
                uint32_t arch;
                const char *name;
        } table[] = {
                { SCMP_ARCH_NATIVE,      "native"        },
                { SCMP_ARCH_X86,         "x86"           },
                { SCMP_ARCH_X86_64,      "x86-64"        },
                { SCMP_ARCH_X32,         "x32"           },
                { SCMP_ARCH_ARM,         "arm"           },
                { SCMP_ARCH_AARCH64,     "arm64"         },
                { SCMP_ARCH_MIPS,        "mips"          },
                { SCMP_ARCH_MIPS64,      "mips64"        },
                { SCMP_ARCH_MIPS64N32,   "mips64-n32"    },
                { SCMP_ARCH_MIPSEL,      "mips-le"       },
                { SCMP_ARCH_MIPSEL64,    "mips64-le"     },
                { SCMP_ARCH_MIPSEL64N32, "mips64-le-n32" },
                { SCMP_ARCH_PPC,         "ppc"           },
                { SCMP_ARCH_PPC64,       "ppc64"         },
                { SCMP_ARCH_PPC64LE,     "ppc64-le"      },
                { SCMP_ARCH_S390,        "s390"          },
                { SCMP_ARCH_S390X,       "s390x"         },
        };
        size_t k;

        for (k = 0; k < sizeof(table) / sizeof(table[0]); k++)
                if (table[k].arch == c)
                        return table[k].name;

        /* Not an architecture we know a name for */
        return NULL;
}
int seccomp_arch_from_string(const char *n, uint32_t *ret) {
        /* Parses an architecture name into the matching libseccomp architecture token, stored in *ret.
         * Returns 0 on success and -EINVAL if the name is NULL or not recognized; *ret is left untouched
         * on failure. */
        static const struct {
                const char *name;
                uint32_t arch;
        } table[] = {
                { "native",        SCMP_ARCH_NATIVE      },
                { "x86",           SCMP_ARCH_X86         },
                { "x86-64",        SCMP_ARCH_X86_64      },
                { "x32",           SCMP_ARCH_X32         },
                { "arm",           SCMP_ARCH_ARM         },
                { "arm64",         SCMP_ARCH_AARCH64     },
                { "mips",          SCMP_ARCH_MIPS        },
                { "mips64",        SCMP_ARCH_MIPS64      },
                { "mips64-n32",    SCMP_ARCH_MIPS64N32   },
                { "mips-le",       SCMP_ARCH_MIPSEL      },
                { "mips64-le",     SCMP_ARCH_MIPSEL64    },
                { "mips64-le-n32", SCMP_ARCH_MIPSEL64N32 },
                { "ppc",           SCMP_ARCH_PPC         },
                { "ppc64",         SCMP_ARCH_PPC64       },
                { "ppc64-le",      SCMP_ARCH_PPC64LE     },
                { "s390",          SCMP_ARCH_S390        },
                { "s390x",         SCMP_ARCH_S390X       },
        };
        size_t k;

        /* A missing name is reported as an error before the output pointer is checked */
        if (!n)
                return -EINVAL;

        assert(ret);

        for (k = 0; k < sizeof(table) / sizeof(table[0]); k++)
                if (streq(n, table[k].name)) {
                        *ret = table[k].arch;
                        return 0;
                }

        return -EINVAL;
}
int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) {
        scmp_filter_ctx ctx;
        int r;

        /* A variant of seccomp_init() with more conservative defaults: all secondary archs are added to the
         * new filter context, and the NNP attribute is turned off.
         *
         * On success the new context is stored in *ret and 0 is returned. On failure the context is released
         * again and a negative error is returned (-ENOMEM if seccomp_init() itself failed). */

        ctx = seccomp_init(default_action);
        if (!ctx)
                return -ENOMEM;

        r = seccomp_add_secondary_archs(ctx);
        if (r >= 0)
                r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);

        if (r < 0) {
                /* Either step failed: don't leak the half-configured context */
                seccomp_release(ctx);
                return r;
        }

        *ret = ctx;
        return 0;
}
int seccomp_add_secondary_archs(scmp_filter_ctx ctx) {
        /* Add in all possible secondary archs we are aware of that
         * this kernel might support.
         *
         * The list is selected at compile time from the architecture family we are built for. Returns 0 on
         * success, or the first negative error reported by seccomp_arch_add(). -EEXIST is not treated as an
         * error, so architectures that are already part of the filter are silently skipped. */

        /* The architecture tokens are uint32_t values, matching the type used by
         * seccomp_arch_to_string()/seccomp_arch_from_string(); several of them do not fit into a signed
         * int, hence don't store them as such. */
        static const uint32_t seccomp_arches[] = {
#if defined(__i386__) || defined(__x86_64__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X86_64,
                SCMP_ARCH_X32,
#elif defined(__arm__) || defined(__aarch64__)
                SCMP_ARCH_ARM,
                SCMP_ARCH_AARCH64,
#elif defined(__mips__) || defined(__mips64__)
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPSEL64N32,
#elif defined(__powerpc__) || defined(__powerpc64__)
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64,
                SCMP_ARCH_PPC64LE,
#elif defined(__s390__) || defined(__s390x__)
                SCMP_ARCH_S390,
                SCMP_ARCH_S390X,
#endif
        };
        unsigned i;
        int r;

        for (i = 0; i < ELEMENTSOF(seccomp_arches); i++) {
                r = seccomp_arch_add(ctx, seccomp_arches[i]);
                if (r < 0 && r != -EEXIST)
                        return r;
        }

        return 0;
}
static bool is_basic_seccomp_available(void) {
        /* Basic seccomp support is considered present if querying the current seccomp mode via prctl()
         * does not fail. */
        return prctl(PR_GET_SECCOMP, 0, 0, 0, 0) >= 0;
}
static bool is_seccomp_filter_available(void) {
        /* Probe for filter mode by asking for it with a NULL filter program. Filter mode is considered
         * available only if this call fails with EFAULT; success or any other error counts as
         * "not available". */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
bool is_seccomp_available(void) {
        /* Tri-state cache: negative means "not probed yet", otherwise the boolean probe result */
        static int cached = -1;

        if (cached >= 0)
                return cached;

        /* First call: seccomp is usable only if both the basic and the filter interface are there */
        cached = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached;
}
/* Table of the predefined system call groups, indexed by the SYSCALL_FILTER_SET_xyz constants.
 *
 * For each group:
 *   .name  is the group name, always prefixed with "@",
 *   .help  is a short human-readable description of the group,
 *   .value is the list of members, each one terminated by a NUL byte (i.e. a list suitable for
 *          NULSTR_FOREACH()). A member starting with "@" refers to another group of this table by name,
 *          everything else is a system call name. */
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
        [SYSCALL_FILTER_SET_DEFAULT] = {
                .name = "@default",
                .help = "System calls that are always permitted",
                .value =
                "clock_getres\0"
                "clock_gettime\0"
                "clock_nanosleep\0"
                "execve\0"
                "exit\0"
                "exit_group\0"
                "getrlimit\0"      /* make sure processes can query stack size and such */
                "gettimeofday\0"
                "nanosleep\0"
                "pause\0"
                "rt_sigreturn\0"
                "sigreturn\0"
                "time\0"
        },
        [SYSCALL_FILTER_SET_BASIC_IO] = {
                .name = "@basic-io",
                .help = "Basic IO",
                .value =
                "close\0"
                "dup2\0"
                "dup3\0"
                "dup\0"
                "lseek\0"
                "pread64\0"
                "preadv\0"
                "pwrite64\0"
                "pwritev\0"
                "read\0"
                "readv\0"
                "write\0"
                "writev\0"
        },
        [SYSCALL_FILTER_SET_CLOCK] = {
                .name = "@clock",
                .help = "Change the system time",
                .value =
                "adjtimex\0"
                "clock_adjtime\0"
                "clock_settime\0"
                "settimeofday\0"
                "stime\0"
        },
        [SYSCALL_FILTER_SET_CPU_EMULATION] = {
                .name = "@cpu-emulation",
                .help = "System calls for CPU emulation functionality",
                .value =
                "modify_ldt\0"
                "subpage_prot\0"
                "switch_endian\0"
                "vm86\0"
                "vm86old\0"
        },
        [SYSCALL_FILTER_SET_DEBUG] = {
                .name = "@debug",
                .help = "Debugging, performance monitoring and tracing functionality",
                .value =
                "lookup_dcookie\0"
                "perf_event_open\0"
                "process_vm_readv\0"
                "process_vm_writev\0"
                "ptrace\0"
                "rtas\0"
#ifdef __NR_s390_runtime_instr
                "s390_runtime_instr\0"
#endif
                "sys_debug_setcontext\0"
        },
        [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
                .name = "@file-system",
                .help = "File system operations",
                .value =
                "access\0"
                "chdir\0"
                "chmod\0"
                "close\0"
                "creat\0"
                "faccessat\0"
                "fallocate\0"
                "fchdir\0"
                "fchmod\0"
                "fchmodat\0"
                "fcntl64\0"
                "fcntl\0"
                "fgetxattr\0"
                "flistxattr\0"
                "fsetxattr\0"
                "fstat64\0"
                "fstat\0"
                "fstatat64\0"
                "fstatfs64\0"
                "fstatfs\0"
                "ftruncate64\0"
                "ftruncate\0"
                "futimesat\0"
                "getcwd\0"
                "getdents64\0"
                "getdents\0"
                "getxattr\0"
                "inotify_add_watch\0"
                "inotify_init1\0"
                "inotify_rm_watch\0"
                "lgetxattr\0"
                "link\0"
                "linkat\0"
                "listxattr\0"
                "llistxattr\0"
                "lremovexattr\0"
                "lsetxattr\0"
                "lstat64\0"
                "lstat\0"
                "mkdir\0"
                "mkdirat\0"
                "mknod\0"
                "mknodat\0"
                "mmap2\0"
                "mmap\0"
                "newfstatat\0"
                "open\0"
                "openat\0"
                "readlink\0"
                "readlinkat\0"
                "removexattr\0"
                "rename\0"
                "renameat2\0"
                "renameat\0"
                "rmdir\0"
                "setxattr\0"
                "stat64\0"
                "stat\0"
                "statfs\0"
                "symlink\0"
                "symlinkat\0"
                "truncate64\0"
                "truncate\0"
                "unlink\0"
                "unlinkat\0"
                "utimensat\0"
                "utimes\0"
        },
        [SYSCALL_FILTER_SET_IO_EVENT] = {
                .name = "@io-event",
                .help = "Event loop system calls",
                .value =
                "_newselect\0"
                "epoll_create1\0"
                "epoll_create\0"
                "epoll_ctl\0"
                "epoll_ctl_old\0"
                "epoll_pwait\0"
                "epoll_wait\0"
                "epoll_wait_old\0"
                "eventfd2\0"
                "eventfd\0"
                "poll\0"
                "ppoll\0"
                "pselect6\0"
                "select\0"
        },
        [SYSCALL_FILTER_SET_IPC] = {
                .name = "@ipc",
                .help = "SysV IPC, POSIX Message Queues or other IPC",
                .value =
                "ipc\0"
                "memfd_create\0"
                "mq_getsetattr\0"
                "mq_notify\0"
                "mq_open\0"
                "mq_timedreceive\0"
                "mq_timedsend\0"
                "mq_unlink\0"
                "msgctl\0"
                "msgget\0"
                "msgrcv\0"
                "msgsnd\0"
                "pipe2\0"
                "pipe\0"
                "process_vm_readv\0"
                "process_vm_writev\0"
                "semctl\0"
                "semget\0"
                "semop\0"
                "semtimedop\0"
                "shmat\0"
                "shmctl\0"
                "shmdt\0"
                "shmget\0"
        },
        [SYSCALL_FILTER_SET_KEYRING] = {
                .name = "@keyring",
                .help = "Kernel keyring access",
                .value =
                "add_key\0"
                "keyctl\0"
                "request_key\0"
        },
        [SYSCALL_FILTER_SET_MODULE] = {
                .name = "@module",
                .help = "Loading and unloading of kernel modules",
                .value =
                "delete_module\0"
                "finit_module\0"
                "init_module\0"
        },
        [SYSCALL_FILTER_SET_MOUNT] = {
                .name = "@mount",
                .help = "Mounting and unmounting of file systems",
                .value =
                "chroot\0"
                "mount\0"
                "pivot_root\0"
                "umount2\0"
                "umount\0"
        },
        [SYSCALL_FILTER_SET_NETWORK_IO] = {
                .name = "@network-io",
                .help = "Network or Unix socket IO, should not be needed if not network facing",
                .value =
                "accept4\0"
                "accept\0"
                "bind\0"
                "connect\0"
                "getpeername\0"
                "getsockname\0"
                "getsockopt\0"
                "listen\0"
                "recv\0"
                "recvfrom\0"
                "recvmmsg\0"
                "recvmsg\0"
                "send\0"
                "sendmmsg\0"
                "sendmsg\0"
                "sendto\0"
                "setsockopt\0"
                "shutdown\0"
                "socket\0"
                "socketcall\0"
                "socketpair\0"
        },
        [SYSCALL_FILTER_SET_OBSOLETE] = {
                /* some unknown even to libseccomp */
                .name = "@obsolete",
                .help = "Unusual, obsolete or unimplemented system calls",
                .value =
                "_sysctl\0"
                "afs_syscall\0"
                "break\0"
                "create_module\0"
                "ftime\0"
                "get_kernel_syms\0"
                "getpmsg\0"
                "gtty\0"
                "lock\0"
                "mpx\0"
                "prof\0"
                "profil\0"
                "putpmsg\0"
                "query_module\0"
                "security\0"
                "sgetmask\0"
                "ssetmask\0"
                "stty\0"
                "sysfs\0"
                "tuxcall\0"
                "ulimit\0"
                "uselib\0"
                "ustat\0"
                "vserver\0"
        },
        [SYSCALL_FILTER_SET_PRIVILEGED] = {
                .name = "@privileged",
                .help = "All system calls which need super-user capabilities",
                .value =
                /* The "@" entries pull in the members of the named groups of this table */
                "@clock\0"
                "@module\0"
                "@raw-io\0"
                "acct\0"
                "bdflush\0"
                "bpf\0"
                "capset\0"
                "chown32\0"
                "chown\0"
                "chroot\0"
                "fchown32\0"
                "fchown\0"
                "fchownat\0"
                "kexec_file_load\0"
                "kexec_load\0"
                "lchown32\0"
                "lchown\0"
                "nfsservctl\0"
                "pivot_root\0"
                "quotactl\0"
                "reboot\0"
                "setdomainname\0"
                "setfsuid32\0"
                "setfsuid\0"
                "setgroups32\0"
                "setgroups\0"
                "sethostname\0"
                "setresuid32\0"
                "setresuid\0"
                "setreuid32\0"
                "setreuid\0"
                "setuid32\0"
                "setuid\0"
                "swapoff\0"
                "swapon\0"
                "_sysctl\0"
                "vhangup\0"
        },
        [SYSCALL_FILTER_SET_PROCESS] = {
                .name = "@process",
                .help = "Process control, execution, namespacing operations",
                .value =
                "arch_prctl\0"
                "clone\0"
                "execveat\0"
                "fork\0"
                "kill\0"
                "prctl\0"
                "setns\0"
                "tgkill\0"
                "tkill\0"
                "unshare\0"
                "vfork\0"
        },
        [SYSCALL_FILTER_SET_RAW_IO] = {
                .name = "@raw-io",
                .help = "Raw I/O port access",
                .value =
                "ioperm\0"
                "iopl\0"
                "pciconfig_iobase\0"
                "pciconfig_read\0"
                "pciconfig_write\0"
#ifdef __NR_s390_pci_mmio_read
                "s390_pci_mmio_read\0"
#endif
#ifdef __NR_s390_pci_mmio_write
                "s390_pci_mmio_write\0"
#endif
        },
        [SYSCALL_FILTER_SET_RESOURCES] = {
                .name = "@resources",
                .help = "Alter resource settings",
                .value =
                "sched_setparam\0"
                "sched_setscheduler\0"
                "sched_setaffinity\0"
                "setpriority\0"
                "setrlimit\0"
                "set_mempolicy\0"
                "migrate_pages\0"
                "move_pages\0"
                "mbind\0"
                "sched_setattr\0"
                "prlimit64\0"
        },
};
const SyscallFilterSet *syscall_filter_set_find(const char *name) {
        unsigned i;
        if (isempty(name) || name[0] != '@')
                return NULL;
        for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
                if (streq(syscall_filter_sets[i].name, name))
                        return syscall_filter_sets + i;
        return NULL;
}
int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) {
        const char *entry;

        /* Adds a rule with the specified action for every member of the specified system call group to the
         * filter context. Members prefixed with "@" name other groups, which are added recursively.
         *
         * Returns 0 on success, -EINVAL if a referenced group or a system call name cannot be resolved, or
         * the negative error returned when adding a rule fails. Processing stops at the first error. */

        assert(seccomp);
        assert(set);

        NULSTR_FOREACH(entry, set->value) {
                int r;

                if (entry[0] != '@') {
                        /* A plain system call name: resolve it to its number and add a rule for it */
                        int nr;

                        nr = seccomp_syscall_resolve_name(entry);
                        if (nr == __NR_SCMP_ERROR)
                                return -EINVAL;

                        r = seccomp_rule_add(seccomp, action, nr, 0);
                } else {
                        /* A reference to another group: look it up and descend into it */
                        const SyscallFilterSet *nested;

                        nested = syscall_filter_set_find(entry);
                        if (!nested)
                                return -EINVAL;

                        r = seccomp_add_syscall_filter_set(seccomp, nested, action);
                }

                if (r < 0)
                        return r;
        }

        return 0;
}
int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
        scmp_filter_ctx ctx;
        int r;

        assert(set);

        /* The one-stop solution: allocate a seccomp object, add the rules for the specified system call
         * group to it, and apply it. The seccomp object is released again before returning, regardless of
         * whether adding the rules and loading succeeded. Returns the result of the first step that failed,
         * or otherwise the result of seccomp_load(). */

        r = seccomp_init_conservative(&ctx, default_action);
        if (r < 0)
                return r;

        r = seccomp_add_syscall_filter_set(ctx, set, action);
        if (r >= 0)
                r = seccomp_load(ctx);

        seccomp_release(ctx);
        return r;
}
int seccomp_restrict_namespaces(unsigned long retain) {
        scmp_filter_ctx seccomp;
        unsigned i;
        int r;

        /* Installs a seccomp filter that makes unshare(), clone() and setns() fail with EPERM for every
         * namespace type listed in namespace_flag_map[] whose flag is not set in 'retain'. Everything else
         * remains allowed (the default action is SCMP_ACT_ALLOW).
         *
         * Returns 0 if nothing needs to be restricted, otherwise the result of seccomp_load() or the
         * negative error of the first step that failed. */

        if (log_get_max_level() >= LOG_DEBUG) {
                _cleanup_free_ char *s = NULL;

                (void) namespace_flag_to_string_many(retain, &s);
                log_debug("Restricting namespace to: %s.", strna(s));
        }

        /* NOOP? */
        if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
                return 0;

        r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
        if (r < 0)
                return r;

        if ((retain & NAMESPACE_FLAGS_ALL) == 0)
                /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
                 * altogether. */
                r = seccomp_rule_add(
                                seccomp,
                                SCMP_ACT_ERRNO(EPERM),
                                SCMP_SYS(setns),
                                0);
        else
                /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
                 * special invocation with a zero flags argument, right here. */
                r = seccomp_rule_add(
                                seccomp,
                                SCMP_ACT_ERRNO(EPERM),
                                SCMP_SYS(setns),
                                1,
                                SCMP_A1(SCMP_CMP_EQ, 0));
        if (r < 0)
                goto finish;

        for (i = 0; namespace_flag_map[i].name; i++) {
                unsigned long f;

                f = namespace_flag_map[i].flag;
                if ((retain & f) == f) {
                        log_debug("Permitting %s.", namespace_flag_map[i].name);
                        continue;
                }

                log_debug("Blocking %s.", namespace_flag_map[i].name);

                /* unshare() takes the namespace flags as its first argument */
                r = seccomp_rule_add(
                                seccomp,
                                SCMP_ACT_ERRNO(EPERM),
                                SCMP_SYS(unshare),
                                1,
                                SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
                if (r < 0)
                        goto finish;

#if defined(__s390__) || defined(__s390x__)
                /* On s390/s390x the first two parameters of the raw clone() system call are swapped, i.e.
                 * the flags are passed as second argument there. */
                r = seccomp_rule_add(
                                seccomp,
                                SCMP_ACT_ERRNO(EPERM),
                                SCMP_SYS(clone),
                                1,
                                SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
#else
                r = seccomp_rule_add(
                                seccomp,
                                SCMP_ACT_ERRNO(EPERM),
                                SCMP_SYS(clone),
                                1,
                                SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
#endif
                if (r < 0)
                        goto finish;

                /* If no namespace type is retained at all, setns() has been blocked as a whole above already,
                 * hence only add the per-flag rule otherwise. */
                if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
                        r = seccomp_rule_add(
                                        seccomp,
                                        SCMP_ACT_ERRNO(EPERM),
                                        SCMP_SYS(setns),
                                        1,
                                        SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
                        if (r < 0)
                                goto finish;
                }
        }

        r = seccomp_load(seccomp);

finish:
        seccomp_release(seccomp);
        return r;
}