Initial import

author: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-08-05 17:04:01 -0300
committer: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-08-05 17:04:01 -0300
commit: 57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree: 5e910f0e82173f4ef4f51111366a3f1299037a7b /Documentation/prctl
7 files changed, 581 insertions, 0 deletions
diff --git a/Documentation/prctl/.gitignore b/Documentation/prctl/.gitignore
new file mode 100644
index 000000000..0b5c27447
--- /dev/null
+++ b/Documentation/prctl/.gitignore
@@ -0,0 +1,3 @@
+disable-tsc-ctxt-sw-stress-test
+disable-tsc-on-off-stress-test
+disable-tsc-test
diff --git a/Documentation/prctl/Makefile b/Documentation/prctl/Makefile
new file mode 100644
index 000000000..2948b7b12
--- /dev/null
+++ b/Documentation/prctl/Makefile
@@ -0,0 +1,8 @@
+# List of programs to build
+hostprogs-$(CONFIG_X86) := disable-tsc-ctxt-sw-stress-test disable-tsc-on-off-stress-test disable-tsc-test
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_disable-tsc-ctxt-sw-stress-test.o += -I$(objtree)/usr/include
+HOSTCFLAGS_disable-tsc-on-off-stress-test.o += -I$(objtree)/usr/include
+HOSTCFLAGS_disable-tsc-test.o += -I$(objtree)/usr/include
diff --git a/Documentation/prctl/disable-tsc-ctxt-sw-stress-test.c b/Documentation/prctl/disable-tsc-ctxt-sw-stress-test.c
new file mode 100644
index 000000000..81fdd425a
--- /dev/null
+++ b/Documentation/prctl/disable-tsc-ctxt-sw-stress-test.c
@@ -0,0 +1,97 @@
+/*
+ * Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
+ *
+ * Tests if the control register is updated correctly
+ * at context switches
+ *
+ * Warning: this test will cause a very high load for a few seconds
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <wait.h>
+
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#ifndef PR_GET_TSC
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE		1   /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV		2   /* throw a SIGSEGV instead of reading the TSC */
+#endif
+
+static uint64_t rdtsc(void)
+{
+uint32_t lo, hi;
+/* We cannot use "=A", since this would use %rax on x86_64 */
+__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+return (uint64_t)hi << 32 | lo;
+}
+
+static void sigsegv_expect(int sig)
+{
+	/* */
+}
+
+static void segvtask(void)
+{
+	if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) < 0)
+	{
+		perror("prctl");
+		exit(0);
+	}
+	signal(SIGSEGV, sigsegv_expect);
+	alarm(10);
+	rdtsc();
+	fprintf(stderr, "FATAL ERROR, rdtsc() succeeded while disabled\n");
+	exit(0);
+}
+
+
+static void sigsegv_fail(int sig)
+{
+	fprintf(stderr, "FATAL ERROR, rdtsc() failed while enabled\n");
+	exit(0);
+}
+
+static void rdtsctask(void)
+{
+	if (prctl(PR_SET_TSC, PR_TSC_ENABLE) < 0)
+	{
+		perror("prctl");
+		exit(0);
+	}
+	signal(SIGSEGV, sigsegv_fail);
+	alarm(10);
+	for(;;) rdtsc();
+}
+
+
+int main(int argc, char **argv)
+{
+	int n_tasks = 100, i;
+
+	fprintf(stderr, "[No further output means we're allright]\n");
+
+	for (i=0; i<n_tasks; i++)
+		if (fork() == 0)
+		{
+			if (i & 1)
+				segvtask();
+			else
+				rdtsctask();
+		}
+
+	for (i=0; i<n_tasks; i++)
+		wait(NULL);
+
+	exit(0);
+}
+
diff --git a/Documentation/prctl/disable-tsc-on-off-stress-test.c b/Documentation/prctl/disable-tsc-on-off-stress-test.c
new file mode 100644
index 000000000..4d83a2762
--- /dev/null
+++ b/Documentation/prctl/disable-tsc-on-off-stress-test.c
@@ -0,0 +1,96 @@
+/*
+ * Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
+ *
+ * Tests if the control register is updated correctly
+ * when set with prctl()
+ *
+ * Warning: this test will cause a very high load for a few seconds
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <wait.h>
+
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#ifndef PR_GET_TSC
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE		1   /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV		2   /* throw a SIGSEGV instead of reading the TSC */
+#endif
+
+/* snippet from wikipedia :-) */
+
+static uint64_t rdtsc(void)
+{
+uint32_t lo, hi;
+/* We cannot use "=A", since this would use %rax on x86_64 */
+__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+return (uint64_t)hi << 32 | lo;
+}
+
+int should_segv = 0;
+
+static void sigsegv_cb(int sig)
+{
+	if (!should_segv)
+	{
+		fprintf(stderr, "FATAL ERROR, rdtsc() failed while enabled\n");
+		exit(0);
+	}
+	if (prctl(PR_SET_TSC, PR_TSC_ENABLE) < 0)
+	{
+		perror("prctl");
+		exit(0);
+	}
+	should_segv = 0;
+
+	rdtsc();
+}
+
+static void task(void)
+{
+	signal(SIGSEGV, sigsegv_cb);
+	alarm(10);
+	for(;;)
+	{
+		rdtsc();
+		if (should_segv)
+		{
+			fprintf(stderr, "FATAL ERROR, rdtsc() succeeded while disabled\n");
+			exit(0);
+		}
+		if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) < 0)
+		{
+			perror("prctl");
+			exit(0);
+		}
+		should_segv = 1;
+	}
+}
+
+
+int main(int argc, char **argv)
+{
+	int n_tasks = 100, i;
+
+	fprintf(stderr, "[No further output means we're allright]\n");
+
+	for (i=0; i<n_tasks; i++)
+		if (fork() == 0)
+			task();
+
+	for (i=0; i<n_tasks; i++)
+		wait(NULL);
+
+	exit(0);
+}
+
diff --git a/Documentation/prctl/disable-tsc-test.c b/Documentation/prctl/disable-tsc-test.c
new file mode 100644
index 000000000..2541e65cb
--- /dev/null
+++ b/Documentation/prctl/disable-tsc-test.c
@@ -0,0 +1,95 @@
+/*
+ * Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
+ *
+ * Basic test to test behaviour of PR_GET_TSC and PR_SET_TSC
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#ifndef PR_GET_TSC
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE		1   /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV		2   /* throw a SIGSEGV instead of reading the TSC */
+#endif
+
+const char *tsc_names[] =
+{
+	[0] = "[not set]",
+	[PR_TSC_ENABLE] = "PR_TSC_ENABLE",
+	[PR_TSC_SIGSEGV] = "PR_TSC_SIGSEGV",
+};
+
+static uint64_t rdtsc(void)
+{
+uint32_t lo, hi;
+/* We cannot use "=A", since this would use %rax on x86_64 */
+__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+return (uint64_t)hi << 32 | lo;
+}
+
+static void sigsegv_cb(int sig)
+{
+	int tsc_val = 0;
+
+	printf("[ SIG_SEGV ]\n");
+	printf("prctl(PR_GET_TSC, &tsc_val); ");
+	fflush(stdout);
+
+	if ( prctl(PR_GET_TSC, &tsc_val) == -1)
+		perror("prctl");
+
+	printf("tsc_val == %s\n", tsc_names[tsc_val]);
+	printf("prctl(PR_SET_TSC, PR_TSC_ENABLE)\n");
+	fflush(stdout);
+	if ( prctl(PR_SET_TSC, PR_TSC_ENABLE) == -1)
+		perror("prctl");
+
+	printf("rdtsc() == ");
+}
+
+int main(int argc, char **argv)
+{
+	int tsc_val = 0;
+
+	signal(SIGSEGV, sigsegv_cb);
+
+	printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
+	printf("prctl(PR_GET_TSC, &tsc_val); ");
+	fflush(stdout);
+
+	if ( prctl(PR_GET_TSC, &tsc_val) == -1)
+		perror("prctl");
+
+	printf("tsc_val == %s\n", tsc_names[tsc_val]);
+	printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
+	printf("prctl(PR_SET_TSC, PR_TSC_ENABLE)\n");
+	fflush(stdout);
+
+	if ( prctl(PR_SET_TSC, PR_TSC_ENABLE) == -1)
+		perror("prctl");
+
+	printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
+	printf("prctl(PR_SET_TSC, PR_TSC_SIGSEGV)\n");
+	fflush(stdout);
+
+	if ( prctl(PR_SET_TSC, PR_TSC_SIGSEGV) == -1)
+		perror("prctl");
+
+	printf("rdtsc() == ");
+	fflush(stdout);
+	printf("%llu\n", (unsigned long long)rdtsc());
+	fflush(stdout);
+
+	exit(EXIT_SUCCESS);
+}
+
diff --git a/Documentation/prctl/no_new_privs.txt b/Documentation/prctl/no_new_privs.txt
new file mode 100644
index 000000000..f7be84fba
--- /dev/null
+++ b/Documentation/prctl/no_new_privs.txt
@@ -0,0 +1,57 @@
+The execve system call can grant a newly-started program privileges that
+its parent did not have.  The most obvious examples are setuid/setgid
+programs and file capabilities.  To prevent the parent program from
+gaining these privileges as well, the kernel and user code must be
+careful to prevent the parent from doing anything that could subvert the
+child.  For example:
+
+ - The dynamic loader handles LD_* environment variables differently if
+   a program is setuid.
+
+ - chroot is disallowed to unprivileged processes, since it would allow
+   /etc/passwd to be replaced from the point of view of a process that
+   inherited chroot.
+
+ - The exec code has special handling for ptrace.
+
+These are all ad-hoc fixes.  The no_new_privs bit (since Linux 3.5) is a
+new, generic mechanism to make it safe for a process to modify its
+execution environment in a manner that persists across execve.  Any task
+can set no_new_privs.  Once the bit is set, it is inherited across fork,
+clone, and execve and cannot be unset.  With no_new_privs set, execve
+promises not to grant the privilege to do anything that could not have
+been done without the execve call.  For example, the setuid and setgid
+bits will no longer change the uid or gid; file capabilities will not
+add to the permitted set, and LSMs will not relax constraints after
+execve.
+
+To set no_new_privs, use prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0).
+
+Be careful, though: LSMs might also not tighten constraints on exec
+in no_new_privs mode.  (This means that setting up a general-purpose
+service launcher to set no_new_privs before execing daemons may
+interfere with LSM-based sandboxing.)
+
+Note that no_new_privs does not prevent privilege changes that do not
+involve execve.  An appropriately privileged task can still call
+setuid(2) and receive SCM_RIGHTS datagrams.
+
+There are two main use cases for no_new_privs so far:
+
+ - Filters installed for the seccomp mode 2 sandbox persist across
+   execve and can change the behavior of newly-executed programs.
+   Unprivileged users are therefore only allowed to install such filters
+   if no_new_privs is set.
+
+ - By itself, no_new_privs can be used to reduce the attack surface
+   available to an unprivileged user.  If everything running with a
+   given uid has no_new_privs set, then that uid will be unable to
+   escalate its privileges by directly attacking setuid, setgid, and
+   fcap-using binaries; it will need to compromise something without the
+   no_new_privs bit set first.
+
+In the future, other potentially dangerous kernel features could become
+available to unprivileged tasks if no_new_privs is set.  In principle,
+several options to unshare(2) and clone(2) would be safe when
+no_new_privs is set, and no_new_privs + chroot is considerable less
+dangerous than chroot by itself.
diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
new file mode 100644
index 000000000..1e469ef75
--- /dev/null
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -0,0 +1,225 @@
+		SECure COMPuting with filters
+		=============================
+
+Introduction
+------------
+
+A large number of system calls are exposed to every userland process
+with many of them going unused for the entire lifetime of the process.
+As system calls change and mature, bugs are found and eradicated.  A
+certain subset of userland applications benefit by having a reduced set
+of available system calls.  The resulting set reduces the total kernel
+surface exposed to the application.  System call filtering is meant for
+use with those applications.
+
+Seccomp filtering provides a means for a process to specify a filter for
+incoming system calls.  The filter is expressed as a Berkeley Packet
+Filter (BPF) program, as with socket filters, except that the data
+operated on is related to the system call being made: system call
+number and the system call arguments.  This allows for expressive
+filtering of system calls using a filter program language with a long
+history of being exposed to userland and a straightforward data set.
+
+Additionally, BPF makes it impossible for users of seccomp to fall prey
+to time-of-check-time-of-use (TOCTOU) attacks that are common in system
+call interposition frameworks.  BPF programs may not dereference
+pointers which constrains all filters to solely evaluating the system
+call arguments directly.
+
+What it isn't
+-------------
+
+System call filtering isn't a sandbox.  It provides a clearly defined
+mechanism for minimizing the exposed kernel surface.  It is meant to be
+a tool for sandbox developers to use.  Beyond that, policy for logical
+behavior and information flow should be managed with a combination of
+other system hardening techniques and, potentially, an LSM of your
+choosing.  Expressive, dynamic filters provide further options down this
+path (avoiding pathological sizes or selecting which of the multiplexed
+system calls in socketcall() is allowed, for instance) which could be
+construed, incorrectly, as a more complete sandboxing solution.
+
+Usage
+-----
+
+An additional seccomp mode is added and is enabled using the same
+prctl(2) call as the strict seccomp.  If the architecture has
+CONFIG_HAVE_ARCH_SECCOMP_FILTER, then filters may be added as below:
+
+PR_SET_SECCOMP:
+	Now takes an additional argument which specifies a new filter
+	using a BPF program.
+	The BPF program will be executed over struct seccomp_data
+	reflecting the system call number, arguments, and other
+	metadata.  The BPF program must then return one of the
+	acceptable values to inform the kernel which action should be
+	taken.
+
+	Usage:
+		prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prog);
+
+	The 'prog' argument is a pointer to a struct sock_fprog which
+	will contain the filter program.  If the program is invalid, the
+	call will return -1 and set errno to EINVAL.
+
+	If fork/clone and execve are allowed by @prog, any child
+	processes will be constrained to the same filters and system
+	call ABI as the parent.
+
+	Prior to use, the task must call prctl(PR_SET_NO_NEW_PRIVS, 1) or
+	run with CAP_SYS_ADMIN privileges in its namespace.  If these are not
+	true, -EACCES will be returned.  This requirement ensures that filter
+	programs cannot be applied to child processes with greater privileges
+	than the task that installed them.
+
+	Additionally, if prctl(2) is allowed by the attached filter,
+	additional filters may be layered on which will increase evaluation
+	time, but allow for further decreasing the attack surface during
+	execution of a process.
+
+The above call returns 0 on success and non-zero on error.
+
+Return values
+-------------
+A seccomp filter may return any of the following values. If multiple
+filters exist, the return value for the evaluation of a given system
+call will always use the highest precedent value. (For example,
+SECCOMP_RET_KILL will always take precedence.)
+
+In precedence order, they are:
+
+SECCOMP_RET_KILL:
+	Results in the task exiting immediately without executing the
+	system call.  The exit status of the task (status & 0x7f) will
+	be SIGSYS, not SIGKILL.
+
+SECCOMP_RET_TRAP:
+	Results in the kernel sending a SIGSYS signal to the triggering
+	task without executing the system call.  siginfo->si_call_addr
+	will show the address of the system call instruction, and
+	siginfo->si_syscall and siginfo->si_arch will indicate which
+	syscall was attempted.  The program counter will be as though
+	the syscall happened (i.e. it will not point to the syscall
+	instruction).  The return value register will contain an arch-
+	dependent value -- if resuming execution, set it to something
+	sensible.  (The architecture dependency is because replacing
+	it with -ENOSYS could overwrite some useful information.)
+
+	The SECCOMP_RET_DATA portion of the return value will be passed
+	as si_errno.
+
+	SIGSYS triggered by seccomp will have a si_code of SYS_SECCOMP.
+
+SECCOMP_RET_ERRNO:
+	Results in the lower 16-bits of the return value being passed
+	to userland as the errno without executing the system call.
+
+SECCOMP_RET_TRACE:
+	When returned, this value will cause the kernel to attempt to
+	notify a ptrace()-based tracer prior to executing the system
+	call.  If there is no tracer present, -ENOSYS is returned to
+	userland and the system call is not executed.
+
+	A tracer will be notified if it requests PTRACE_O_TRACESECCOMP
+	using ptrace(PTRACE_SETOPTIONS).  The tracer will be notified
+	of a PTRACE_EVENT_SECCOMP and the SECCOMP_RET_DATA portion of
+	the BPF program return value will be available to the tracer
+	via PTRACE_GETEVENTMSG.
+
+	The tracer can skip the system call by changing the syscall number
+	to -1.  Alternatively, the tracer can change the system call
+	requested by changing the system call to a valid syscall number.  If
+	the tracer asks to skip the system call, then the system call will
+	appear to return the value that the tracer puts in the return value
+	register.
+
+	The seccomp check will not be run again after the tracer is
+	notified.  (This means that seccomp-based sandboxes MUST NOT
+	allow use of ptrace, even of other sandboxed processes, without
+	extreme care; ptracers can use this mechanism to escape.)
+
+SECCOMP_RET_ALLOW:
+	Results in the system call being executed.
+
+If multiple filters exist, the return value for the evaluation of a
+given system call will always use the highest precedent value.
+
+Precedence is only determined using the SECCOMP_RET_ACTION mask.  When
+multiple filters return values of the same precedence, only the
+SECCOMP_RET_DATA from the most recently installed filter will be
+returned.
+
+Pitfalls
+--------
+
+The biggest pitfall to avoid during use is filtering on system call
+number without checking the architecture value.  Why?  On any
+architecture that supports multiple system call invocation conventions,
+the system call numbers may vary based on the specific invocation.  If
+the numbers in the different calling conventions overlap, then checks in
+the filters may be abused.  Always check the arch value!
+
+Example
+-------
+
+The samples/seccomp/ directory contains both an x86-specific example
+and a more generic example of a higher level macro interface for BPF
+program generation.
+
+
+
+Adding architecture support
+-----------------------
+
+See arch/Kconfig for the authoritative requirements.  In general, if an
+architecture supports both ptrace_event and seccomp, it will be able to
+support seccomp filter with minor fixup: SIGSYS support and seccomp return
+value checking.  Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
+to its arch-specific Kconfig.
+
+
+
+Caveats
+-------
+
+The vDSO can cause some system calls to run entirely in userspace,
+leading to surprises when you run programs on different machines that
+fall back to real syscalls.  To minimize these surprises on x86, make
+sure you test with
+/sys/devices/system/clocksource/clocksource0/current_clocksource set to
+something like acpi_pm.
+
+On x86-64, vsyscall emulation is enabled by default.  (vsyscalls are
+legacy variants on vDSO calls.)  Currently, emulated vsyscalls will honor seccomp, with a few oddities:
+
+- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to
+  the vsyscall entry for the given call and not the address after the
+  'syscall' instruction.  Any code which wants to restart the call
+  should be aware that (a) a ret instruction has been emulated and (b)
+  trying to resume the syscall will again trigger the standard vsyscall
+  emulation security checks, making resuming the syscall mostly
+  pointless.
+
+- A return value of SECCOMP_RET_TRACE will signal the tracer as usual,
+  but the syscall may not be changed to another system call using the
+  orig_rax register. It may only be changed to -1 order to skip the
+  currently emulated call. Any other change MAY terminate the process.
+  The rip value seen by the tracer will be the syscall entry address;
+  this is different from normal behavior.  The tracer MUST NOT modify
+  rip or rsp.  (Do not rely on other changes terminating the process.
+  They might work.  For example, on some kernels, choosing a syscall
+  that only exists in future kernels will be correctly emulated (by
+  returning -ENOSYS).
+
+To detect this quirky behavior, check for addr & ~0x0C00 ==
+0xFFFFFFFFFF600000.  (For SECCOMP_RET_TRACE, use rip.  For
+SECCOMP_RET_TRAP, use siginfo->si_call_addr.)  Do not check any other
+condition: future kernels may improve vsyscall emulation and current
+kernels in vsyscall=native mode will behave differently, but the
+instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these
+cases.
+
+Note that modern systems are unlikely to use vsyscalls at all -- they
+are a legacy feature and they are considerably slower than standard
+syscalls.  New code will use the vDSO, and vDSO-issued system calls
+are indistinguishable from normal system calls.
author	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-08-05 17:04:01 -0300
committer	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-08-05 17:04:01 -0300
commit	57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree	5e910f0e82173f4ef4f51111366a3f1299037a7b /Documentation/prctl